# Import Required Libraries

In [None]:
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,roc_curve
import joblib
import os
import warnings
warnings.filterwarnings("ignore")

Create a directory structure to store our assets, i.e. the fitted models and transformers

In [None]:
# Create the asset folders used to persist fitted scalers and models.
# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the folders are already there.
os.makedirs('assets/scaler', exist_ok=True)
os.makedirs('assets/models', exist_ok=True)

## Data Preparation

In [None]:
## Do customized operations according to project requirement
def pre_process(file,training):
    """Turn the raw input into the frame consumed by the rest of the pipeline.

    This is a project-specific hook and must be implemented before the
    notebook is run.

    Parameters
    ----------
    file : the object returned by ``pd.read_csv`` for the raw input file.
    training : bool flag passed as ``True`` by the training cells and
        ``False`` by the prediction cells.

    Returns
    -------
    The processed frame. The training cells read and drop a ``'y'`` column
    from it; the prediction cells read and drop a ``'filename'`` column.

    Raises
    ------
    NotImplementedError
        Always, until the project-specific logic is filled in.
    """
    # An empty function body is a syntax error; fail with an explicit,
    # actionable message instead.
    raise NotImplementedError(
        "pre_process() is a template hook: implement the project-specific "
        "pre-processing before running the pipeline."
    )

In [None]:
## Data Preparation Steps
# Read the raw CSV, then run the project-specific pre-processing in training mode.
raw_csv_path = 'file_name.csv'
file = pd.read_csv(raw_csv_path)
train_final_df = pre_process(file, training=True)

In [None]:
# Keep the target aside and leave only the feature columns in the frame.
target_column = 'y'
train_Y = train_final_df[target_column]
train_final_df.drop(columns=[target_column], inplace=True)

## Scaling

Created a dictionary to store information about the scalers.
<p> 1. 'scaler' - The scaler object that is to be fitted.
<p> 2. 'fitted_scaler' - The location where the fitted scaler is stored (None until the scaler has been fitted).

In [None]:
# Registry of scalers: the unfitted object plus the path of its fitted dump
# (None until scaling(..., training=True, ...) has been run for that entry).
scalers = {
    'MinMax': dict(scaler=MinMaxScaler(), fitted_scaler=None),
}

Function to fit or transform a DataFrame using a Scaler.
<p> If training==True then the 'fit' operation would be carried out and after fitting the scaler would be stored in the assets directory.
<p> If training==False then the 'transform' operation would be carried out. The fitted scaler would be loaded from the location and transformation would take place.
<p>Note: The details of the Scaler should be available in the dictionary - 'scalers'  

In [None]:
def scaling(data,training,scaler_type):
    """Fit a registered scaler on ``data`` or transform ``data`` with it.

    Parameters
    ----------
    data : the frame/array passed to the scaler's ``fit`` or ``transform``.
    training : bool. When truthy the scaler is fitted, dumped to
        ``assets/scaler/<scaler_type>.gz`` and that path is recorded in the
        ``scalers`` registry. When falsy the previously dumped scaler is
        loaded and used to transform ``data``.
    scaler_type : key of the scaler in the module-level ``scalers`` dict.

    Returns
    -------
    ``None`` when fitting; the output of ``scaler.transform(data)`` otherwise.

    Raises
    ------
    KeyError
        If ``scaler_type`` is not registered in ``scalers``.
    ValueError
        If a transform is requested before the scaler has been fitted.
    """
    entry = scalers.get(scaler_type)
    if entry is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'get'".
        raise KeyError(
            "Scaler '{}' is not registered in the 'scalers' dictionary".format(scaler_type)
        )

    if training:
        scaler = entry['scaler']
        scaler.fit(data)
        to_save = 'assets/scaler/{}.gz'.format(scaler_type)
        joblib.dump(scaler, to_save)
        # Remember where the fitted scaler lives so the transform path can find it.
        entry['fitted_scaler'] = to_save
        return None

    path = entry.get('fitted_scaler')
    if path is None:
        raise ValueError(
            "Scaler '{}' has not been fitted yet; call scaling(..., True, ...) first".format(scaler_type)
        )
    scaler = joblib.load(path)
    return scaler.transform(data)

Fit a MinMax scaler on our entire training data

In [None]:
# Fit (and persist) the MinMax scaler on the training features.
scaling(train_final_df, training=True, scaler_type='MinMax')

In [None]:
# Transform the training features with the fitted scaler and restore the column names.
train_scaled_values = scaling(train_final_df, training=False, scaler_type='MinMax')
train_scaled_df = pd.DataFrame(train_scaled_values, columns=train_final_df.columns)

## Feature Importances

In [None]:
# Work on an independent copy so the scaled training frame is left untouched.
features_train = train_scaled_df.copy(deep=True)

<h3> Extra Trees Classifier

In [None]:
# Fit an extra-trees ensemble only to read its impurity-based feature importances.
best_features = ExtraTreesClassifier(n_estimators=100, random_state=1)
fit = best_features.fit(features_train, train_Y)

# Tabular view: one row per feature with its importance score.
columns = pd.DataFrame(features_train.columns)
scores = pd.DataFrame(fit.feature_importances_)
feature_importances = pd.concat([columns, scores], axis=1)
feature_importances.columns = ['Features', 'Score']

# Plot view: horizontal bars, sorted so the largest score ends up at the top.
importance = pd.Series(fit.feature_importances_, index=features_train.columns)
sorted_importance = importance.sort_values()
sorted_importance.plot(kind='barh', figsize=(10, 5))
plt.title('Feature Importances in Descending order')
plt.xlabel('Feature Importance Score')
plt.show()

##  Train Test Split

In [None]:
# Stratified 70/30 hold-out split of the scaled features and the target.
X_train, X_test, y_train, y_test = train_test_split(
    train_scaled_df,
    train_Y,
    test_size=0.30,
    stratify=train_Y,
    random_state=1,
)

In [None]:
# Class counts of the training part of the split.
y_train.value_counts()

In [None]:
# Class counts of the held-out test part of the split.
y_test.value_counts()

## Predictive Modeling

Below are the explanations of the functions used:
<p>1.model_train_K_fold - Train a model using Stratified K-Fold cross validation. Stores the trained model in the assets directory. You also need to pass the models and parameters dictionary object
<p>2.model_predict - Use a trained model to predict new data. If probability==True, return the predicted probability of the second class (column 1 of predict_proba) instead of the predicted label

In [None]:
def model_train_K_fold(features,target,algo,models,parameters,K=8):
    """Grid-search ``algo`` with stratified K-fold CV and persist the result.

    Parameters
    ----------
    features, target : training data passed to ``GridSearchCV.fit``.
    algo : key of the estimator in ``models`` and of its grid in ``parameters``.
    models : dict mapping a name to ``{'model': estimator, 'fitted_model': path}``.
        The ``'fitted_model'`` entry of ``algo`` is updated in place with the
        path of the dumped search object.
    parameters : dict mapping a name to the parameter grid to search.
    K : number of stratified folds (default 8).

    Returns
    -------
    ``True`` once the fitted search has been dumped to
    ``assets/models/<algo>.gz``.
    """
    kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=1)
    clf = GridSearchCV(
        estimator=models[algo]['model'],
        param_grid=parameters[algo],
        cv=kfold,
        verbose=0,
        scoring='accuracy',
        refit=True,
    )
    # Fit on the data handed to the function, not on the notebook globals
    # X_train / y_train, so the function works for any caller-supplied data.
    best_model = clf.fit(features, target)
    to_save = 'assets/models/{}.gz'.format(algo)
    joblib.dump(best_model, to_save)
    models[algo]['fitted_model'] = to_save
    return True

In [None]:
def model_predict(features,algo,probability=False):
    """Predict with the model dumped at ``assets/models/<algo>.gz``.

    Parameters
    ----------
    features : data passed to the loaded model.
    algo : name used to build the path of the dumped model.
    probability : when truthy, return column 1 of ``predict_proba`` instead
        of the output of ``predict``.

    Returns
    -------
    The array returned by ``predict``, or column 1 of ``predict_proba``.
    """
    model = joblib.load('assets/models/{}.gz'.format(algo))
    # A plain truthiness test covers every flag value; the previous pair of
    # `== False` / `== True` checks left `prediction` unbound for anything else.
    if probability:
        return model.predict_proba(features)[:, 1]
    return model.predict(features)

<p> 1. models - A dictionary containing different models and the location where the fitted model is stored
<p> 2. parameters - A dictionary containing the parameters for every model which the grid_search would search on

In [None]:
# Estimators to tune, in registration order. Each registry entry also carries
# the path of its fitted dump, which stays None until the model is trained.
_base_estimators = [
    ('random_forest', RandomForestClassifier(random_state=1)),
    ('logistic_regression', LogisticRegression(random_state=1)),
    ('gradient_boosting_classifier', GradientBoostingClassifier(random_state=1)),
    ('svm', SVC(random_state=1)),
]
models = {
    name: {'model': estimator, 'fitted_model': None}
    for name, estimator in _base_estimators
}

In [None]:
# Hyper-parameter grids searched by GridSearchCV, keyed like `models`.
parameters = {
    'random_forest': dict(
        n_estimators=[100, 500],
        max_depth=[4, 8],
    ),
    'logistic_regression': dict(
        C=[0.01, 1, 10],
        penalty=['l2'],
        max_iter=[1000],
    ),
    'gradient_boosting_classifier': dict(
        learning_rate=[0.1, 0.2],
        min_samples_split=[2, 4],
        n_estimators=[100, 200],
    ),
    'svm': dict(
        kernel=['poly', 'rbf'],
        C=[0.5, 1, 10],
        probability=[True],
    ),
}

In [None]:
# Display the model registry; when cells are run in order, no model has been
# trained yet, so every 'fitted_model' entry is still None.
models

In [None]:
# Tune and persist the random forest.
model_train_K_fold(
    features=X_train,
    target=y_train,
    algo='random_forest',
    models=models,
    parameters=parameters,
)

In [None]:
# Tune and persist the logistic regression.
model_train_K_fold(
    features=X_train,
    target=y_train,
    algo='logistic_regression',
    models=models,
    parameters=parameters,
)

In [None]:
# Tune and persist the gradient boosting classifier.
model_train_K_fold(
    features=X_train,
    target=y_train,
    algo='gradient_boosting_classifier',
    models=models,
    parameters=parameters,
)

In [None]:
# Tune and persist the support vector machine.
model_train_K_fold(
    features=X_train,
    target=y_train,
    algo='svm',
    models=models,
    parameters=parameters,
)

In [None]:
# Display the registry again: model_train_K_fold stores the path of each
# dumped search object under 'fitted_model'.
models

## Interpreting Logistic Regression

In [None]:
# Load the fitted grid search for logistic regression and show the
# coefficients of its best estimator, one column per feature.
lr_dump_path = models.get('logistic_regression').get('fitted_model')
lr = joblib.load(lr_dump_path)
pd.DataFrame(lr.best_estimator_.coef_, columns=X_train.columns)

Interpretations 
<p> 1. For an increase in number_of_stops the log-odds for the trip to be classified as 'interesting' increases
<p> 2. For an increase in feature9 the log-odds for the trip to be classified as 'interesting' decreases

## Best Hyper-Parameters

Function to retrieve the best hyper-parameters

In [None]:
def get_best_hyper_parameters(algo,models):
    """Return ``best_params_`` of the fitted search dumped for ``algo``.

    The dump location is read from ``models[algo]['fitted_model']``.
    """
    dump_path = models.get(algo).get('fitted_model')
    fitted_search = joblib.load(dump_path)
    return fitted_search.best_params_

In [None]:
# One line per registered model: "<name>: <best parameter dict>".
for algo_name in models:
    best_params = get_best_hyper_parameters(algo_name, models)
    print(algo_name, best_params, sep=': ')

## Reporting Score 

Function to report:
<p> 1.K-fold Validation Score
<p> 2.Area Under ROC Curve for Test Data
<p> 3.Confusion Matrix of the Test Data 
<p> 4.Classification Report of the Test Data

In [None]:
def report_scores(algo,X_test,y_test,models):
    """Print validation and test-set metrics for the fitted model of ``algo``.

    Prints the model name, the best cross-validation score found by the grid
    search, the test-set area under the ROC curve, the confusion matrix and
    the classification report.

    Parameters
    ----------
    algo : key of the model in ``models``.
    X_test, y_test : held-out features and labels.
    models : registry whose ``'fitted_model'`` entry holds the dump path.
    """
    print(algo)
    model = joblib.load(models.get(algo).get('fitted_model'))
    prediction = model.predict(X_test)

    # ROC AUC must be computed from a continuous score. Feeding it hard class
    # labels collapses the curve to a single operating point.
    # Binary classification is assumed (column 1 = positive class), as in
    # model_predict.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
    else:
        # No score available: fall back to the labels (previous behaviour).
        scores = prediction

    print('K-fold Validation Score: {}'.format(model.best_score_))
    print('\nTest Set Results: \n')
    print('Area Under ROC curve test set  {}'.format(roc_auc_score(y_test, scores)))
    print(confusion_matrix(y_test, prediction))
    print(classification_report(y_test, prediction))
    print('\n')

In [None]:
# Print the score report of every registered model on the held-out split.
for algo_name in models:
    report_scores(algo=algo_name, X_test=X_test, y_test=y_test, models=models)

## Creating Ensemble

In [None]:
# Empty frame that collects one probability column per model.
test_ensemble = pd.DataFrame(data=None)

In [None]:
# Add one '<model>_prediction' column holding that model's probabilities on the test split.
for algo_name in models:
    column_name = '{}_prediction'.format(algo_name)
    test_ensemble[column_name] = model_predict(X_test, algo_name, probability=True)

In [None]:
# Average the per-model probabilities.
# The previous divisor, len(columns) - 1, was evaluated before the 'average'
# column existed, so the sum of N model columns was divided by N - 1.
# Selecting the model columns by name also keeps the cell correct when it is
# re-run after 'average' / 'ensemble_prediction' have been added.
model_columns = ['{}_prediction'.format(name) for name in models]
test_ensemble['average'] = test_ensemble[model_columns].mean(axis=1)

In [None]:
# Display the per-model probability columns together with the 'average' column.
test_ensemble

## Finding Optimum Threshold - Optional

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, test_ensemble['average'])
# get the best threshold: the one maximising J = TPR - FPR
J = tpr - fpr
# `argmax` was never imported on its own; use the NumPy function explicitly.
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

In [None]:
# Label a row 1 when its averaged probability reaches the chosen threshold, else 0.
reaches_threshold = test_ensemble['average'] >= best_thresh
test_ensemble['ensemble_prediction'] = np.where(reaches_threshold, 1, 0)

In [None]:
# Test-set confusion matrix and classification report of the thresholded ensemble.
ensemble_labels = test_ensemble['ensemble_prediction']
print(confusion_matrix(y_test, ensemble_labels))
print(classification_report(y_test, ensemble_labels))

In [None]:
# Same two reports as the cell above (confusion matrix, then classification report).
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, test_ensemble['ensemble_prediction']))

## Storing objects

Storing the models dictionary, scaler dictionary and best_threshold 

In [None]:
# Persist the two registries and the decision threshold for the prediction run.
joblib.dump(models,'assets/models/models_dict.gz')
joblib.dump(scalers,'assets/scaler/scaler_dict.gz')
# The threshold computed above is named `best_thresh`; `best_threshold` is
# only defined later, when the prediction run loads this file back.
joblib.dump(best_thresh,'assets/best_threshold.gz')

## Prediction run

<p>The test run is designed in such a way that it can run individually. There are no dependencies with the code above. Only the required functions need to be imported.
<p> You only need to enter the '.csv' file name and the '.zip' folder name.

<p> Loading Objects

In [None]:
# Locations of the artefacts written by the training run.
models_dict_path = 'assets/models/models_dict.gz'
scaler_dict_path = 'assets/scaler/scaler_dict.gz'
threshold_path = 'assets/best_threshold.gz'

models = joblib.load(models_dict_path)
scalers = joblib.load(scaler_dict_path)
best_threshold = joblib.load(threshold_path)

<p> Pre-Processing 

In [None]:
#Data Preprocessing
# Read the file to score and run the pre-processing in prediction mode.
prediction_csv_path = 'file_name.csv'
file = pd.read_csv(prediction_csv_path)
prediction_final_df = pre_process(file, training=False)

In [None]:
# Keep the identifier column aside; the remaining columns are the features.
# .copy() gives an independent frame: new columns are assigned to it below,
# which on a selection of another frame triggers pandas' SettingWithCopy warning.
prediction_file_name = prediction_final_df[['filename']].copy()
prediction_final_df.drop(columns=['filename'], inplace=True)

<P>Scaling Data Frame

In [None]:
# Transform the prediction features with the stored scaler and restore the column names.
prediction_scaled_values = scaling(prediction_final_df, training=False, scaler_type='MinMax')
prediction_scaled_df = pd.DataFrame(prediction_scaled_values, columns=prediction_final_df.columns)

<p> Getting prediction for each model

In [None]:
# Add one '<model>_prediction' column holding that model's probabilities for the new data.
for algo_name in models:
    column_name = '{}_prediction'.format(algo_name)
    prediction_file_name[column_name] = model_predict(prediction_scaled_df, algo_name, probability=True)

In [None]:
# Display the 'filename' column together with the per-model probability columns.
prediction_file_name

<p> Averaging out predictions and predicting according to the best threshold

In [None]:
# Average only the per-model probability columns. Selecting them by name keeps
# the result correct when this cell is re-run after 'average' and
# 'ensemble_prediction' have been added to the frame.
model_columns = ['{}_prediction'.format(name) for name in models]
prediction_file_name['average'] = prediction_file_name[model_columns].mean(axis=1)
# Use `best_threshold`, the value loaded at the start of the prediction run;
# `best_thresh` only exists when the training cells ran in the same session.
prediction_file_name['ensemble_prediction'] = np.where(prediction_file_name['average'] >= best_threshold, 1, 0)

In [None]:
# Count how many rows were assigned to each ensemble label.
prediction_file_name['ensemble_prediction'].value_counts()

In [None]:
# Write the identifier and the final ensemble label to disk, without the index.
export_columns = ['filename', 'ensemble_prediction']
predictions_to_export = prediction_file_name[export_columns]
predictions_to_export.to_csv('predictions.csv', index=False)

## Customization Options

<p>Function to add a model to the model dictionary.
<p>Function to add parameters for the model
<p>Note - Remember to train the new model using model_train_K_fold()

In [None]:
def add_models(model_name,model_object,models):
    """Register ``model_object`` under ``model_name`` in ``models``.

    The entry is created (or overwritten) with no fitted-model path yet.
    Returns ``True``.
    """
    new_entry = dict(model=model_object, fitted_model=None)
    models.update({model_name: new_entry})
    return True
    
def add_parameters(model_name,model_parameters,parameters):
    """Store the grid ``model_parameters`` under ``model_name`` in ``parameters``.

    Returns ``True``.
    """
    parameters.update({model_name: model_parameters})
    return True

Function to add a different scaler

In [None]:
def add_scaler(scaler_name,scaler_object,scalers):
    """Register ``scaler_object`` under ``scaler_name`` in ``scalers``.

    The entry is created (or overwritten) with no fitted-scaler path yet.
    Returns ``True``.
    """
    # Must be an assignment: the previous `scalers[name]: {...}` was parsed as
    # a bare annotation and never stored anything in the dictionary.
    scalers[scaler_name] = {'scaler': scaler_object, 'fitted_scaler': None}
    return True

## Report

<p>The pipeline is made modular.
<p>A new model can be added easily.
<p>New parameters can be added and tested easily.
<p>To predict on a new file all we need to do is just change the location of where the files are present.
<p> The two dictionary objects are important components of the pipeline and are easy to understand.

<p> <b> Improvement Suggestions </b>
<p>1.Create a data directory 
<p>2.Current design choice can accommodate only 1 MinMax scaler. Improve that
<p>3.Search a larger grid search space   
<p>4.Nested K-Fold Cross validation can be carried out to report bias free scores
<p>5.Re-train all the models on all the training data 