In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow  as tf

import mlflow
from mlflow.models import infer_signature

import xgboost as xgb
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from dotenv import load_dotenv

In [None]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Args:
        actual: observed target values.
        pred: predicted values, aligned element-wise with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of scikit-learn's
        ``mean_squared_error``, then ``mean_absolute_error`` and ``r2_score``.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

# Loading Environment

In [None]:
# Load variables via python-dotenv, then read the two dataset locations
# from the process environment.
load_dotenv()

train_data_path = os.getenv('TRAIN_DATA_PATH')
test_data_path = os.getenv('TEST_DATA_PATH')

# Fail fast: os.getenv returns None for an unset variable, which would
# otherwise only surface later as an unrelated error inside pd.read_excel.
_missing_env = [
    name
    for name, value in (
        ('TRAIN_DATA_PATH', train_data_path),
        ('TEST_DATA_PATH', test_data_path),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Data Load

In [None]:
# Read only the three feature columns and the target from the training workbook,
# then split them into the feature frame X and the target series y.
df = pd.read_excel(
    train_data_path,
    usecols=['Xs', 'Ys', 'Wafer_Size', 'No_of_Chips'],
)
X = df.loc[:, ['Xs', 'Ys', 'Wafer_Size']]
y = df.loc[:, 'No_of_Chips']
print(df.columns)

In [None]:
# Robust-scale the full training feature frame; X_sc must stay features-only
# because the "Scaled Distributions" plot reads its feature columns.
rb_scaler = preprocessing.RobustScaler()
col_names = ['Xs', 'Ys', 'Wafer_Size']
X_sc = pd.DataFrame(rb_scaler.fit_transform(X), columns=col_names, index=X.index)

# Build the features-plus-target frame as a NEW object. The previous
# `df_sc = X_sc` was only an alias, so adding the target column silently
# injected 'No_of_Chips' into X_sc as well.
df_sc = X_sc.assign(No_of_Chips=y)

# Call head() -- the bare attribute only displayed the bound method.
df_sc.head()

In [None]:
# Load the evaluation workbook and display the columns it provides.
test_list = pd.read_excel(io=test_data_path)
test_list.columns

In [None]:
# Select the SAME feature columns, in the SAME order, for train and test.
# Dropping only the target from the test sheet would carry along any extra
# columns (or a different column order) present in that file, while the models
# are fitted on exactly these three features.
feature_cols = ['Xs', 'Ys', 'Wafer_Size']

X_train = X[feature_cols]
y_train = y
X_test = test_list[feature_cols]
y_test = test_list['No_of_Chips']

# Scaling

In [None]:
rb_scaler = preprocessing.RobustScaler()
col_names = ['Xs', 'Ys', 'Wafer_Size']

## Scaling training features
# The scaler's centre/scale statistics are learned from the training set only.
X_train_sc = pd.DataFrame(rb_scaler.fit_transform(X_train), columns=col_names)

## Scaling testing features
# Use transform(), NOT fit_transform(): re-fitting on the test set would scale
# it with different statistics than the data the models were trained on.
X_test_sc = pd.DataFrame(rb_scaler.transform(X_test), columns=col_names)

In [None]:
# plot original distribution plot
fig, ax1 = plt.subplots(ncols=1, figsize=(10, 8))
ax1.set_title('Original Distributions')

# One labelled density curve per feature so the overlaid curves can be told apart.
for col in ['Xs', 'Ys', 'Wafer_Size']:
    sns.kdeplot(X[col], ax=ax1, label=col)

# A neutral x-label: the axis is shared by all three features.
ax1.set_xlabel('Feature value')
ax1.legend();

In [None]:
# plot scaled distribution plot
fig, ax1 = plt.subplots(ncols=1, figsize=(10, 8))
ax1.set_title('Scaled Distributions')

# One labelled density curve per scaled feature so the overlaid curves can be
# told apart. The columns are listed explicitly so only features are plotted.
for col in ['Xs', 'Ys', 'Wafer_Size']:
    sns.kdeplot(X_sc[col], ax=ax1, label=col)

# A neutral x-label: the axis is shared by all three features.
ax1.set_xlabel('Scaled feature value')
ax1.legend();

# Linear Regression

In [None]:
def train_lr(alpha, l1_ratio, exp_name):
    """Fit an ElasticNet on the scaled training data and record it as one MLflow run.

    Reads the notebook-level ``X_train_sc``, ``y_train``, ``X_test_sc`` and
    ``y_test``. Logs the hyper-parameters, the train metrics (``*_tr``), the
    test metrics and the fitted model (artifact ``lr-model``) to MLflow.

    Args:
        alpha: regularisation strength passed to ``ElasticNet``.
        l1_ratio: L1/L2 mixing ratio passed to ``ElasticNet``.
        exp_name: name of the MLflow experiment the run is stored under.

    Returns:
        ``(predictions_train, predictions_test)``.
    """
    mlflow.sklearn.autolog(disable=True)
    mlflow.set_experiment(exp_name)

    with mlflow.start_run(run_name='LR-basic'):
        hyperparams = {'alpha': alpha, 'l1_ratio': l1_ratio}
        mlflow.set_tag('model_name', 'LR')
        mlflow.log_params(hyperparams)

        model = ElasticNet(random_state=42, **hyperparams)
        model.fit(X_train_sc, y_train)

        # ---- training set ----
        predictions_train = model.predict(X_train_sc)
        # The signature (input schema -> output schema) is stored with the model below.
        signature = infer_signature(X_train_sc, predictions_train)
        train_rmse, train_mae, train_r2 = eval_metrics(y_train, predictions_train)

        print('train>')
        print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio))
        print("  RMSE: %s" % train_rmse)
        print("  MAE: %s" % train_mae)
        print("  R2: %s" % train_r2)

        for metric_name, metric_value in (
            ("rmse_tr", train_rmse),
            ("r2_tr", train_r2),
            ("mae_tr", train_mae),
        ):
            mlflow.log_metric(metric_name, metric_value)

        # ---- test set ----
        predictions_test = model.predict(X_test_sc)
        mlflow.sklearn.log_model(model, "lr-model", signature=signature)
        test_rmse, test_mae, test_r2 = eval_metrics(y_test, predictions_test)

        print('test>')
        print("  RMSE: %s" % test_rmse)
        print("  MAE: %s" % test_mae)
        print("  R2: %s" % test_r2)

        for metric_name, metric_value in (
            ("rmse", test_rmse),
            ("r2", test_r2),
            ("mae", test_mae),
        ):
            mlflow.log_metric(metric_name, metric_value)

    return predictions_train, predictions_test

        

In [None]:
alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
l1s = [0.1, 0.3, 0.5, 0.7, 0.9]

# Every (alpha, l1_ratio) pair, alpha varying slowest; one MLflow run per pair.
lr_grid = [(a, r) for a in alphas for r in l1s]
for alpha, l1 in lr_grid:
    train_pred, test_pred = train_lr(alpha, l1, exp_name='LR-corr')

# Random Forests

In [None]:

# Candidate hyper-parameters for the random forest; the same dict is reused by
# the MLflow sweep over train_rf.
param_grid = {
    'n_estimators': [25, 50, 100, 150, 300],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9, 12, 15, 18],
    'max_leaf_nodes': [3, 6, 9, 12, 15, 18],
}

# Seed the estimator so the search (and the reported best estimator) is
# reproducible across kernel restarts; 42 matches the seed used for the
# other models in this notebook.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=param_grid,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

In [None]:
def train_rf(n_estimators, max_features, max_depth, max_leaf_nodes, exp_name):
    """Fit a RandomForestRegressor and record it as one MLflow run.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Logs the hyper-parameters, the train metrics (``*_tr``), the
    test metrics and the fitted model (artifact ``rf-model``) to MLflow.

    Args:
        n_estimators: number of trees.
        max_features: features considered per split (``'sqrt'``, ``'log2'``,
            ``None``, an int or a float, as accepted by scikit-learn).
        max_depth: maximum tree depth (``None`` for unlimited).
        max_leaf_nodes: maximum number of leaves per tree.
        exp_name: name of the MLflow experiment the run is stored under.

    Returns:
        ``(predictions_train, predictions_test)``.
    """
    mlflow.sklearn.autolog(disable=True)
    mlflow.set_experiment(exp_name)
    with mlflow.start_run(run_name='RF-basic'):
        params = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'max_leaf_nodes': max_leaf_nodes,
        }
        mlflow.set_tag('model_name', 'RF')
        mlflow.log_params(params)

        # Train with exactly the hyper-parameters that were logged. Previously
        # max_features was hard-coded to 5 and max_leaf_nodes was never passed,
        # so the logged parameters did not describe the fitted model.
        # random_state is fixed so runs are reproducible.
        rf = RandomForestRegressor(random_state=42, **params)
        rf.fit(X_train, y_train)

        # ---- training set ----
        predictions_train = rf.predict(X_train)
        signature = infer_signature(X_train, predictions_train)
        (rmse, mae, r2) = eval_metrics(y_train, predictions_train)

        # '{}' rather than '{:d}' for max_depth so that max_depth=None is printable.
        print("RandomForestsRegressor (n_estimators={:d}, max_depth={}):".format(params["n_estimators"], params["max_depth"]))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        mlflow.log_metric("rmse_tr", rmse)
        mlflow.log_metric("r2_tr", r2)
        mlflow.log_metric("mae_tr", mae)

        # Store the fitted model with its signature, as train_lr does; the
        # signature was previously computed but never used.
        mlflow.sklearn.log_model(rf, "rf-model", signature=signature)

        print('----------------------------------------------------------------------------------------------------------------------')
        # ---- test set ----
        predictions_test = rf.predict(X_test)
        (rmse, mae, r2) = eval_metrics(y_test, predictions_test)

        print("RandomForestsRegressor (n_estimators={:d}, max_depth={}):".format(params["n_estimators"], params["max_depth"]))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        return predictions_train, predictions_test

In [None]:
# Unpack the candidate values from the random-forest grid.
N_estimators, Max_features, Max_depth, Max_leaf_nodes = (
    param_grid[key]
    for key in ('n_estimators', 'max_features', 'max_depth', 'max_leaf_nodes')
)

# Full Cartesian product, n_estimators varying slowest; one MLflow run per combination.
rf_grid = [
    (n_est, feats, depth, leaves)
    for n_est in N_estimators
    for feats in Max_features
    for depth in Max_depth
    for leaves in Max_leaf_nodes
]
for n_estimators, max_features, max_depth, max_leaf_nodes in rf_grid:
    train_pred, test_pred = train_rf(n_estimators, max_features, max_depth, max_leaf_nodes, 'RF-corr')

# XGBoost

In [None]:
from sklearn.model_selection import StratifiedKFold
# A parameter grid for XGBoost (also read by the MLflow sweep over train_xgb)
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5, 8, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0,],
        'max_depth': [3, 4, 5, 6, 7, 8]
}

# random_state is the seed parameter documented for the scikit-learn wrapper;
# `seed` is the legacy native-API spelling.
xgb_model = xgb.XGBRegressor(
    random_state=42
)

# Exhaustive 10-fold search scored by (negated) mean absolute error.
# error_score='raise' makes a failing fit abort the search instead of being
# recorded as NaN.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring = 'neg_mean_absolute_error',
    n_jobs = 10,
    cv = 10,
    verbose=True,
    error_score='raise'
)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
def train_xgb(min_child_weight=1, gamma=8, subsample=0.8, colsample_bytree=1.0, max_depth=5, exp_name='XGB'):
    """Fit an XGBRegressor and record it as one MLflow run.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Predictions are rounded to the nearest integer before scoring
    (the target column is ``No_of_Chips``). Logs the hyper-parameters, the
    train metrics (``*_tr``) and the test metrics to MLflow.

    Args:
        min_child_weight: minimum sum of instance weight needed in a child.
        gamma: minimum loss reduction required to make a split.
        subsample: row subsampling ratio per tree.
        colsample_bytree: column subsampling ratio per tree.
        max_depth: maximum tree depth.
        exp_name: name of the MLflow experiment the run is stored under.

    Returns:
        ``(predictions_train, predictions_test)`` -- the rounded predictions.
    """
    mlflow.sklearn.autolog(disable=True)
    mlflow.set_experiment(exp_name)
    with mlflow.start_run(run_name='XGB-basic'):
        params = {
                'min_child_weight': min_child_weight,
                'gamma': gamma,
                'subsample': subsample,
                'colsample_bytree': colsample_bytree,
                'max_depth': max_depth
            }
        mlflow.set_tag('model_name', 'XGB')
        mlflow.log_params(params)

        # Train with the hyper-parameters that were just logged. Previously the
        # model was always built with gamma=0 / max_depth=2 and ignored every
        # argument, so each run of a sweep fitted the identical model.
        # n_estimators and reg_lambda stay at their fixed values; random_state
        # makes the subsampled runs reproducible.
        xgb_model = xgb.XGBRegressor(
            n_estimators=50,
            reg_lambda=1,
            random_state=42,
            **params
        )
        xgb_model.fit(X_train, y_train)

        # ---- training set ----
        # np.rint rounds half to even, matching the built-in round() used before.
        predictions_train = np.rint(xgb_model.predict(X_train))

        (rmse, mae, r2) = eval_metrics(y_train, predictions_train)
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        mlflow.log_metric("rmse_tr", rmse)
        mlflow.log_metric("r2_tr", r2)
        mlflow.log_metric("mae_tr", mae)

        print('----------------------------------------------------------------------------------------------------------------------')
        # ---- test set ----
        predictions_test = np.rint(xgb_model.predict(X_test))
        (rmse, mae, r2) = eval_metrics(y_test, predictions_test)

        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # Returned for parity with train_lr / train_rf; existing callers that
        # ignore the result are unaffected.
        return predictions_train, predictions_test

In [None]:
# Unpack the candidate values from the XGBoost grid.
Min_child_weight, Gamma, Subsample, Colsample_bytree, Max_depth = (
    params[key]
    for key in ('min_child_weight', 'gamma', 'subsample', 'colsample_bytree', 'max_depth')
)

# Full Cartesian product, min_child_weight varying slowest; one MLflow run per combination.
xgb_grid = [
    (mcw, gam, sub, col_frac, depth)
    for mcw in Min_child_weight
    for gam in Gamma
    for sub in Subsample
    for col_frac in Colsample_bytree
    for depth in Max_depth
]
for min_child_weight, gamma, subsample, colsample_bytree, max_depth in xgb_grid:
    train_xgb(min_child_weight, gamma, subsample, colsample_bytree, max_depth, 'XGB-corr')

# DNN

In [None]:
def build_and_compile_model(norm, L1, L2, LR):
    """Assemble and compile the dense regression network.

    Layer stack: ``norm`` -> Dense(128, relu, L1/L2 kernel regulariser) ->
    Dense(64, relu) -> Dense(32, relu) -> Dense(1). The model is compiled
    with a mean-absolute-error loss and an Adam optimizer built from ``LR``.

    Args:
        norm: layer placed first in the stack (the caller passes a
            ``Normalization`` layer).
        L1: L1 factor of the first Dense layer's kernel regulariser.
        L2: L2 factor of the first Dense layer's kernel regulariser.
        LR: value passed to ``tf.keras.optimizers.Adam``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    keras = tf.keras
    penalty = keras.regularizers.l1_l2(l1=L1, l2=L2)

    model = keras.Sequential()
    model.add(norm)
    # Only the first hidden layer carries the regulariser.
    model.add(keras.layers.Dense(128, activation='relu', kernel_regularizer=penalty))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(1))

    optimizer = keras.optimizers.Adam(LR)
    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model

In [None]:
def train_DNN(epochs=5000, l1=0, l2=0, lr=0.001, exp_name='DNN'):
    """Train the dense network and record it as one MLflow run.

    Reads the notebook-level ``X``, ``y``, ``X_test`` and ``y_test``. The
    network is fitted on ``X``/``y`` with the last 20% held out by Keras as
    validation data. Logs the hyper-parameters, the train metrics (``*_tr``),
    the test metrics, the best training/validation loss with the epoch index
    at which each occurred, and the model itself (registered as
    ``DNN-128-R``).

    Args:
        epochs: number of training epochs.
        l1: L1 regularisation factor of the first Dense layer.
        l2: L2 regularisation factor of the first Dense layer.
        lr: learning rate for the Adam optimizer.
        exp_name: name of the MLflow experiment the run is stored under.

    Returns:
        The trained Keras model.
    """
    mlflow.sklearn.autolog(disable=True)
    mlflow.set_experiment(exp_name)
    with mlflow.start_run(run_name='DNN'):
        params = {
            'epochs': epochs,
            'l1': l1,
            'l2': l2,
            'lr': lr
        }

        mlflow.set_tag('model_name', 'DNN')
        mlflow.log_params(params)

        # NOTE(review): axis=None makes the layer normalise every input element
        # with one scalar mean/variance shared by all three features. If
        # per-feature normalisation was intended, axis=-1 is needed -- confirm.
        X_normalizer = tf.keras.layers.Normalization(input_shape=[3,], axis=None, )
        X_normalizer.adapt(X)

        dnn_chips_model = build_and_compile_model(X_normalizer, params['l1'], params['l2'], params['lr'])

        history = dnn_chips_model.fit(
            X,
            y,
            validation_split=0.2,
            verbose=0, epochs=params['epochs'],)

        # A Keras model must be logged with the TensorFlow flavor; the
        # scikit-learn flavor is meant for scikit-learn estimators.
        mlflow.tensorflow.log_model(dnn_chips_model, "model", registered_model_name="DNN-128-R")

        # ---- training set ----
        # predict() returns one column per output unit; take the single output.
        predictions = dnn_chips_model.predict(X)[:, 0]
        (rmse, mae, r2) = eval_metrics(y, predictions)
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # ---- test list ----
        predictions = dnn_chips_model.predict(X_test)[:, 0]
        (rmseS, maeS, r2S) = eval_metrics(y_test, predictions)
        print("  RMSE: %s" % rmseS)
        print("  MAE: %s" % maeS)
        print("  R2: %s" % r2S)

        # MLflow metric logs
        mlflow.log_metric("rmse_tr", rmse)
        mlflow.log_metric("r2_tr", r2)
        mlflow.log_metric("mae_tr", mae)

        mlflow.log_metric("rmse", rmseS)
        mlflow.log_metric("r2", r2S)
        mlflow.log_metric("mae", maeS)

        # Best loss and the (0-based) epoch index of its first occurrence,
        # computed once per curve.
        loss_curve = history.history['loss']
        val_loss_curve = history.history['val_loss']
        best_loss = min(loss_curve)
        best_val_loss = min(val_loss_curve)

        mlflow.log_metric("loss", best_loss)
        mlflow.log_metric("loss_epoch", loss_curve.index(best_loss))
        mlflow.log_metric("val_loss", best_val_loss)
        mlflow.log_metric("val_loss_epoch", val_loss_curve.index(best_val_loss))

        return dnn_chips_model
    

In [None]:
# Single reference run: 10000 epochs with L1 = L2 = 0.1 at the default learning rate.
train_DNN(
    epochs=10000,
    l1=0.1,
    l2=0.1,
    lr=0.001,
    exp_name='DNN-corr-10000',
)

In [None]:

# Show the feature columns of both frames before training.
print(X_test.columns)
print(X_train.columns)

# Baseline run with train_DNN's default arguments.
model = train_DNN()

# Sweep the L1/L2 regularisation factors at a fixed learning rate of 0.001.
epochs = 10000
l1s = [0, 0.001, 0.01, 0.1, 1]
l2s = [0, 0.001, 0.01, 0.1, 1]
lrs = [0.001]

dnn_grid = [(a, b, rate) for a in l1s for b in l2s for rate in lrs]
for l1, l2, lr in dnn_grid:
    train_DNN(epochs, l1, l2, lr, 'DNN-corr-10000')

In [None]:
# Same L1/L2 sweep as for lr=0.001, repeated at a learning rate of 0.01 and
# stored under its own experiment.
epochs = 10000
l1s = [0, 0.001, 0.01, 0.1, 1]
l2s = [0, 0.001, 0.01, 0.1, 1]
lrs = [0.01]

dnn_grid = [(a, b, rate) for a in l1s for b in l2s for rate in lrs]
for l1, l2, lr in dnn_grid:
    train_DNN(epochs, l1, l2, lr, '143-DNN-10000-lr-0.01')