In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
import sys

In [None]:
# Show the path of the Python interpreter this kernel is running on.
print(sys.executable)

In [None]:
# Report the version of the interpreter running this kernel. A shell call to
# `python` is resolved through PATH and may point at a different installation
# than the one shown by sys.executable.
print(sys.version)

### Create functions for each step of the complete model training lifecycle

In [None]:
def load_data(path):
    """Read a CSV file into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts (local path, URL or
            file-like object).

    Returns:
        pandas.DataFrame: The parsed table, read with pandas' default options.
    """
    frame = pd.read_csv(path)
    return frame

In [None]:
# Fetch banking.csv straight from GitHub (needs network access) and preview
# the first rows to check that it parsed as expected.
data = load_data("https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/banking.csv")
data.head()

In [None]:
def data_cleaning(data):
    """Drop every row that contains a missing value, reporting counts before and after.

    Args:
        data: Input DataFrame. It is not modified; ``dropna`` returns a new frame.

    Returns:
        pandas.DataFrame: ``data`` without the rows that had any NA value.
    """
    def _report_missing(header, frame):
        # Print a heading followed by the per-column NA counts of `frame`.
        print(header)
        print(frame.isna().sum())

    _report_missing('na values available in data\n', data)
    cleaned = data.dropna()
    _report_missing('After dropping na values \n', cleaned)
    return cleaned

In [None]:
def preprocessing(data):
    """One-hot encode the categorical columns and normalise the column names.

    Steps:
      1. Collapse the ``basic.4y`` / ``basic.6y`` / ``basic.9y`` education
         levels into a single ``'Basic'`` level.
      2. Replace each categorical column with its dummy (indicator) columns,
         named ``<column>_<level>``.
      3. Replace ``.`` and spaces in the resulting column names with ``_``.

    Args:
        data: DataFrame containing the categorical columns listed in
            ``cat_vars`` below plus any other columns (which are passed
            through unchanged). The input frame is not modified.

    Returns:
        pandas.DataFrame: The non-categorical columns in their original
        order, followed by the dummy columns for each categorical variable
        in ``cat_vars`` order.

    Raises:
        KeyError: If one of the expected categorical columns is missing.
    """
    cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']

    # `assign` builds a new frame, so the caller's data (typically the result
    # of dropna()) is neither mutated nor subject to SettingWithCopyWarning.
    merged = data.assign(
        education=np.where(data['education'].isin(basic_levels), 'Basic', data['education'])
    )

    # Encodes exactly the listed columns, drops the originals and appends the
    # dummies in `cat_vars` order.
    final_data = pd.get_dummies(merged, columns=cat_vars)

    # regex=False: "." must be treated literally on every pandas version
    # (as a regular expression it would match any character).
    final_data.columns = (
        final_data.columns
        .str.replace(".", "_", regex=False)
        .str.replace(" ", "_", regex=False)
    )
    return final_data

In [None]:
def df_train_test_split(final_data):
    """Split the prepared data into stratified train and test sets.

    Args:
        final_data: DataFrame whose ``'y'`` column is the target; every
            other column is used as a feature.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``. 30% of the rows are
        held out for testing, stratified on the target, with a fixed seed
        (47). The target parts are single-column DataFrames.
    """
    target_mask = final_data.columns == 'y'
    features = final_data.loc[:, ~target_mask]
    target = final_data.loc[:, target_mask]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.3,
        stratify=target,
        random_state=47,
    )
    return X_train, X_test, y_train, y_test

In [None]:
def over_sampling_target_class(X_train, y_train):
    """Oversample the training data with SMOTE and print the resulting class mix.

    Args:
        X_train: Training features as a DataFrame; its column labels are
            reused for the resampled features.
        y_train: Training target handed to ``SMOTE.fit_resample``. The
            resampled target is re-wrapped as a DataFrame with a ``'y'``
            column.

    Returns:
        tuple: ``(X_train, y_train)`` - the resampled feature DataFrame and
        the resampled ``'y'`` column as a Series.
    """
    from imblearn.over_sampling import SMOTE

    sampler = SMOTE(random_state=0)
    feature_names = X_train.columns
    resampled_X, resampled_y = sampler.fit_resample(X_train, y_train)

    features = pd.DataFrame(data=resampled_X, columns=feature_names)
    target = pd.DataFrame(data=resampled_y, columns=['y'])

    total = len(features)
    print("length of over-sampled data is:", total)
    negatives = len(target[target['y'] == 0])
    print("length of no subscription in oversampled data", negatives)
    positives = len(target[target["y"] == 1])
    print("Number of subscription", positives)
    print("Proportion of no subscription data in oversampled data is", negatives / total)
    print("Proportion of subscription data in oversampled data is", positives / total)

    return features, target['y']

In [None]:
def training_basic_classifier(X_train, y_train, n_estimators=100, random_state=0):
    """Fit a baseline random-forest classifier.

    Args:
        X_train: Training features.
        y_train: Training target.
        n_estimators: Number of trees in the forest (default 100, as before).
        random_state: Seed for the forest. It is fixed by default so that the
            baseline metrics are reproducible across kernel restarts, in line
            with the seeded split and SMOTE steps. Pass ``None`` for a
            non-deterministic forest.

    Returns:
        RandomForestClassifier: The fitted model.
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    return model.fit(X_train, y_train)
    

In [None]:
def predict_on_test_data(model, X_test):
    """Return the predictions of ``model`` for ``X_test``.

    Args:
        model: Any object exposing a ``predict`` method.
        X_test: Input passed unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    predictions = model.predict(X_test)
    return predictions

In [None]:
def predict_prob_on_test_data(model, X_test):
    """Return the class probabilities predicted by ``model`` for ``X_test``.

    Args:
        model: Any object exposing a ``predict_proba`` method.
        X_test: Input passed unchanged to ``model.predict_proba``.

    Returns:
        Whatever ``model.predict_proba(X_test)`` returns.
    """
    probabilities = model.predict_proba(X_test)
    return probabilities

In [None]:
def get_metrics(y_true, y_pred, y_pred_proba):
    """Compute accuracy, precision, recall and log-loss, rounded to two decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for accuracy, precision and recall.
        y_pred_proba: Predicted probabilities, used for the log-loss.

    Returns:
        dict: Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'entropy'`` (the log-loss), each rounded to 2 decimal places.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

    scores = (
        ('accuracy', accuracy_score(y_true, y_pred)),
        ('precision', precision_score(y_true, y_pred)),
        ('recall', recall_score(y_true, y_pred)),
        ('entropy', log_loss(y_true, y_pred_proba)),
    )
    return {label: round(value, 2) for label, value in scores}

In [None]:
def create_roc_auc_plot(clf, X_data, y_data):
    """Plot the ROC curve of a fitted classifier and save it as ``roc_auc_curve.png``.

    Args:
        clf: Fitted classifier.
        X_data: Features to score.
        y_data: True labels for ``X_data``.

    Side effects:
        Creates a matplotlib figure and writes ``roc_auc_curve.png`` to the
        current working directory, overwriting any existing file.
    """
    import matplotlib.pyplot as plt
    from sklearn import metrics

    # `plot_roc_curve` was removed in scikit-learn 1.2; its replacement,
    # `RocCurveDisplay.from_estimator`, exists from 1.0 on. Prefer the new
    # API and fall back only on installations that predate it.
    roc_display = getattr(metrics, 'RocCurveDisplay', None)
    if roc_display is not None and hasattr(roc_display, 'from_estimator'):
        roc_display.from_estimator(clf, X_data, y_data)
    else:
        metrics.plot_roc_curve(clf, X_data, y_data)
    plt.savefig('roc_auc_curve.png')

In [None]:
def create_confusion_metrics_plot(clf, X_test, y_test):
    """Plot the confusion matrix of a fitted classifier and save it as ``confusion_matrix.png``.

    Args:
        clf: Fitted classifier.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Side effects:
        Creates a matplotlib figure and writes ``confusion_matrix.png`` to the
        current working directory, overwriting any existing file.
    """
    import matplotlib.pyplot as plt
    from sklearn import metrics

    # `plot_confusion_matrix` was removed in scikit-learn 1.2 (importing it
    # raises ImportError there); `ConfusionMatrixDisplay.from_estimator` is
    # its replacement from 1.0 on. Fall back only on older installations.
    cm_display = getattr(metrics, 'ConfusionMatrixDisplay', None)
    if cm_display is not None and hasattr(cm_display, 'from_estimator'):
        cm_display.from_estimator(clf, X_test, y_test)
    else:
        metrics.plot_confusion_matrix(clf, X_test, y_test)
    plt.savefig('confusion_matrix.png')

In [None]:
def hyper_parameter_tuning(X_train, y_train, n_iter=100, cv=5, random_state=35):
    """Tune a random forest with a randomised, cross-validated parameter search.

    Args:
        X_train: Training features.
        y_train: Training target.
        n_iter: Number of parameter combinations sampled from the grid.
        cv: Number of cross-validation folds per combination.
        random_state: Seed used both for sampling the combinations and for
            the forests themselves, so a rerun gives the same tuned model.

    Returns:
        tuple: ``(model_tuned, best_params)`` - the forest refitted on the
        full training data with the best parameters found, and those
        parameters as a dict.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    random_grid = {
        'n_estimators': [5, 21, 51, 101],
        # 'auto' was only an alias of 'sqrt' for classifiers and is rejected
        # by scikit-learn >= 1.3, so every candidate using it failed to fit.
        # 'log2' keeps this a genuine two-way choice.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [int(x) for x in np.linspace(10, 120, num=12)],
        'min_samples_split': [2, 6, 10],
        'min_samples_leaf': [1, 3, 4],
        'bootstrap': [True, False],
    }

    model_tuning = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_distributions=random_grid,
        n_iter=n_iter,
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
    )
    model_tuning.fit(X_train, y_train)
    print("Random Grid:", random_grid, '\n')

    best_params = model_tuning.best_params_
    print('Best Parameters:', best_params, '\n')

    # With the default refit=True the search has already retrained the best
    # configuration on all of X_train, so fitting a second forest by hand
    # would only repeat that work.
    return model_tuning.best_estimator_, best_params

In [None]:
# Remove rows with missing values; the result is a new frame, `data` keeps all rows.
cleaned_data = data_cleaning(data=data)

In [None]:
# One-hot encode the categorical columns and normalise the column names.
final_data = preprocessing(cleaned_data)

In [None]:
# Stratified 70/30 split; the test part is held out for all later evaluation.
X_train, X_test, y_train, y_test = df_train_test_split(final_data)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
# NOTE: this rebinds X_train / y_train, so re-running the cell feeds the
# already-oversampled data back into SMOTE. Re-run the split cell first.
X_train, y_train = over_sampling_target_class(X_train, y_train)

In [None]:
# Baseline random forest trained on the oversampled training data.
model = training_basic_classifier(X_train, y_train)

In [None]:
# Class predictions of the baseline model on the held-out test split.
y_pred = predict_on_test_data(model=model, X_test=X_test)

In [None]:
# Display the predicted labels as a quick sanity check.
y_pred

In [None]:
# Class probabilities of the baseline model on the test split (needed for the log-loss).
y_pred_prob = predict_prob_on_test_data(model, X_test)

In [None]:
# Display the predicted probabilities as a quick sanity check.
y_pred_prob

In [None]:
# Accuracy, precision, recall and log-loss of the baseline model on the test split.
run_metrics = get_metrics(y_test, y_pred, y_pred_prob)

In [None]:
# Show the baseline metrics dict.
print(run_metrics)

In [None]:
# Draw the baseline model's ROC curve on the test split and save it as roc_auc_curve.png.
create_roc_auc_plot(model, X_test, y_test)

In [None]:
# Draw the baseline model's confusion matrix on the test split and save it as confusion_matrix.png.
create_confusion_metrics_plot(model, X_test, y_test)

## MLflow work starts here

### Function to create an experiment in MLflow and log parameters, metrics, and artifact files such as images

In [None]:
def create_experiment(experiment_name, run_name, run_metrics, model, confusion_matrix_path=None, 
                     roc_auc_plot_path=None, run_params=None, tracking_uri="http://localhost:8000"):
    """Log one model run (parameters, metrics, model and plots) to MLflow.

    Args:
        experiment_name: MLflow experiment to log under; passed to
            ``mlflow.set_experiment``.
        run_name: Name given to the MLflow run.
        run_metrics: Mapping of metric name to numeric value.
        model: Fitted scikit-learn model, stored under the artifact path
            ``"model"``.
        confusion_matrix_path: Optional path of a saved confusion-matrix
            image, logged under ``confusion_matrix/``.
        roc_auc_plot_path: Optional path of a saved ROC-curve image, logged
            under ``roc_auc_plot/``.
        run_params: Optional mapping of parameter name to value.
        tracking_uri: URI of the MLflow tracking backend. The default is an
            HTTP URI, so a tracking server is expected to be listening
            there; pass another URI to log elsewhere.
    """
    import mlflow
    import mlflow.sklearn  # make the sklearn flavour explicit rather than relying on lazy loading

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    # Pass run_name through so the run is stored under the name that the
    # confirmation message below reports; previously it was only printed.
    with mlflow.start_run(run_name=run_name):

        if run_params:
            mlflow.log_params(run_params)

        if run_metrics:
            mlflow.log_metrics(run_metrics)

        mlflow.sklearn.log_model(model, "model")

        if confusion_matrix_path is not None:
            mlflow.log_artifact(confusion_matrix_path, 'confusion_matrix')

        if roc_auc_plot_path is not None:
            mlflow.log_artifact(roc_auc_plot_path, "roc_auc_plot")

        mlflow.set_tag("tag1", "RandomForest")
        mlflow.set_tags({"tag2": "Randomized Search CV", "tag3": "Production"})

    print("Run - %s is logged to Experiment - %s" %(run_name, experiment_name))

In [None]:
# Experiment name and run label handed to create_experiment for the baseline model.
experiment_name = "Experiment_1"
run_name = "term_deposit"
# Recomputed from the same y_test / y_pred / y_pred_prob as the earlier metrics cell.
run_metrics = get_metrics(y_test, y_pred, y_pred_prob)
print(run_metrics)

In [None]:
# Log the baseline model with its metrics and the two plot files saved above;
# no run_params are passed for this run.
create_experiment(experiment_name, run_name, run_metrics, model, 'confusion_matrix.png', 'roc_auc_curve.png')

## Create another experiment after tuning hyperparameters and log the best set of parameters for which the model gives the optimal performance


In [None]:
import mlflow
# Experiment name and run label handed to create_experiment for the tuned model.
experiment_name = "optimized model"
run_name = "Random_Search_CV_Tuned_Model"

# Randomised hyper-parameter search on the oversampled training data; the
# best parameter set doubles as the parameters logged for this run.
model_tuned, best_params = hyper_parameter_tuning(X_train, y_train)
run_params = best_params

# Score the tuned model on the test split. This rebinds y_pred, y_pred_prob
# and run_metrics, which held the baseline model's values until now.
y_pred = predict_on_test_data(model_tuned, X_test)
y_pred_prob = predict_prob_on_test_data(model_tuned, X_test)
run_metrics = get_metrics(y_test, y_pred, y_pred_prob)

In [None]:
# Display the tuned model's metrics.
run_metrics

In [None]:
# List the best hyper-parameters found by the search, one per line.
for param in run_params:
    print(param, run_params[param])

In [None]:
# The two PNG files on disk were drawn from the baseline `model`. Redraw them
# for the tuned model first, otherwise this run would be logged with the
# baseline's ROC curve and confusion matrix.
create_roc_auc_plot(model_tuned, X_test, y_test)
create_confusion_metrics_plot(model_tuned, X_test, y_test)
create_experiment(experiment_name, run_name, run_metrics, model_tuned, 'confusion_matrix.png', 'roc_auc_curve.png', run_params)