In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, r2_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV,StratifiedKFold
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

# Both CSV files sit in the same project folder, so the folder is spelled out once.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# unchanged on a machine that has this exact directory.
DATA_DIR = r'C:\Users\Krish\Desktop\OneDrive - The University of Texas at Austin\UT Austin\Data Science Programming\Project'

# X is used as the feature table and y as the labels by the modelling cells below.
X = pd.read_csv(DATA_DIR + r'\input.csv')
y = pd.read_csv(DATA_DIR + r'\output.csv')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Log Regression functions

def run_LogisticReg(train_X, test_X, train_y, test_y, plot_graph, classification_thres, print_report ):
    """Fit a class-weight-balanced logistic regression and score the test split.

    Parameters
    ----------
    train_X, train_y
        Features and labels the model is fitted on.
    test_X, test_y
        Held-out features and labels. ``test_y`` is only used for the
        optional classification report.
    plot_graph : str
        ``'Yes'`` draws a horizontal bar chart of the fitted coefficients;
        any other value skips the chart.
    classification_thres : float
        A test row is predicted positive when its positive-class probability
        is strictly greater than this value.
    print_report : str
        ``'Yes'`` prints ``classification_report`` for the thresholded
        predictions; any other value skips it.

    Returns
    -------
    yhat
        Full ``predict_proba`` output for ``test_X``.
    yhat_positive
        Column 1 of ``yhat`` (probability of the second class).
    y_pred
        Boolean array, ``yhat_positive > classification_thres``.
    feature_importance : pandas.DataFrame
        Columns ``feature`` and ``importance`` (the coefficients of the first
        row of ``model.coef_``), sorted by importance ascending.
    """
    model = LogisticRegression(class_weight='balanced',max_iter=5000, solver='lbfgs')
    model.fit(train_X, train_y)

    # Score the test set once and derive everything else from that result.
    yhat = model.predict_proba(test_X)
    yhat_positive = yhat[:, 1]
    y_pred = yhat_positive > classification_thres

    if print_report =='Yes':
        print(classification_report(test_y, y_pred))

    # Plain arrays (e.g. the output of a scaler) carry no column labels, so
    # fall back to positional names instead of failing on ``.columns``.
    feature_names = getattr(train_X, "columns", None)
    if feature_names is None:
        feature_names = ["x{}".format(pos) for pos in range(model.coef_.shape[1])]

    feature_importance = pd.DataFrame(feature_names, columns = ["feature"])
    feature_importance["importance"] = model.coef_[0]
    feature_importance = feature_importance.sort_values(by = ["importance"], ascending=True)

    if plot_graph == 'Yes':
        # The size must be given when the figure is created; changing
        # rcParams afterwards would only affect later, unrelated figures.
        feature_importance.plot.barh(x='feature', y='importance', figsize=(10, 10))
        plt.show()

    return yhat, yhat_positive, y_pred, feature_importance


def perform_cross_validation(X, y, n_splits1, classification_thres=0.45):
    """Run stratified k-fold cross-validation of the logistic regression model.

    For every fold the model is fitted through ``run_LogisticReg``, the
    fold's precision / recall / accuracy / ROC-AUC are recorded, and the
    fold's ROC curve is plotted with a sample of thresholds marked on it.

    NOTE: a later cell defines another function with this same name (a
    random-forest variant); a call resolves to whichever definition was
    executed most recently.

    Parameters
    ----------
    X, y
        pandas objects holding features and labels; rows are selected by
        position.
    n_splits1 : int
        Number of folds passed to ``StratifiedKFold``.
    classification_thres : float, default 0.45
        Probability cut-off forwarded to ``run_LogisticReg``.

    Returns
    -------
    precision_list, recall_list, accuracy_list, roc_auc_list : list
        One entry per fold, in fold order.
    """
    kf = StratifiedKFold(n_splits=n_splits1,shuffle=True,random_state=42)
    recall_list = []
    precision_list = []
    accuracy_list = []
    roc_auc_list = []

    for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):

        print('{} of KFold {}'.format(fold_no, kf.n_splits))

        # kf.split yields row positions, not index labels, so the rows must be
        # taken with .iloc; .loc is only equivalent on a default RangeIndex.
        xtr, xvl = X.iloc[train_index], X.iloc[test_index]
        ytr, yvl = y.iloc[train_index], y.iloc[test_index]

        yhat, yhat_positive, y_pred, feature_importance = run_LogisticReg(
            train_X=xtr, test_X=xvl, train_y=ytr, test_y=yvl,
            plot_graph='No', classification_thres=classification_thres, print_report='Yes')

        precision_list.append(precision_score(y_pred=y_pred, y_true=yvl))
        recall_list.append(recall_score(y_pred=y_pred, y_true=yvl))
        accuracy_list.append(accuracy_score(y_pred=y_pred, y_true=yvl))
        roc_auc_list.append(roc_auc_score(yvl, yhat_positive))

        fpr, tpr, thresholds = metrics.roc_curve(yvl,  yhat_positive)

        plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
        plt.plot(fpr, tpr, label='Logistic')

        # Mark up to 20 evenly spaced points of the curve with their threshold.
        # The last valid position is len - 1; np.unique drops the repeats that
        # appear when the curve has fewer than 20 points.
        idx_list = np.unique(np.linspace(0, len(thresholds) - 1, num=20).astype(int))

        for ix in idx_list:
            plt.scatter(fpr[ix], tpr[ix], marker='o', color='red', label=thresholds[ix])
            plt.text(fpr[ix], tpr[ix], str(round(thresholds[ix],2)))

        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

    return precision_list, recall_list, accuracy_list, roc_auc_list

In [None]:
# running logistic regression considering all the features with l2 regularization

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

# The scaler learns its mean / std from the training rows only; the test rows
# are transformed with those same statistics. Re-fitting on the test split
# would scale the two sets differently and use test data during preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = np.array(y_train)   # labels are not scaled, only converted to an array
X_test_scaled = scaler.transform(X_test)
y_test_scaled = np.array(y_test)

# run_LogisticReg reads the feature names from ``train_X.columns``, so the scaled
# arrays are wrapped back into DataFrames with the original labels.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Fit on the standardized features (previously they were computed but never used),
# which also puts the coefficients reported as "importance" on a common scale.
yhat, yhat_positive, y_pred, feature_importance = run_LogisticReg(X_train_scaled_df, X_test_scaled_df, y_train, y_test, plot_graph='No',classification_thres=0.45, print_report='Yes' )

In [None]:
# 5-fold cross validation over the full data set; each returned list holds one
# score per fold.
(
    precision_list1,
    recall_list1,
    accuracy_list1,
    roc_auc_list1,
) = perform_cross_validation(X=X, y=y, n_splits1=5)

In [None]:
def run_RandomForest(X_train, X_test, Y_train, Y_test, classification_thres, random_state=None):
    """Fit a class-weight-balanced random forest and score the test split.

    Prints the classification report for the thresholded predictions and the
    ROC-AUC of the positive-class probabilities.

    Parameters
    ----------
    X_train, Y_train
        Features and labels the forest is fitted on.
    X_test, Y_test
        Held-out features and labels used for the printed metrics.
    classification_thres : float
        A test row is predicted positive when its positive-class probability
        is strictly greater than this value.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``; pass an int to make the
        fitted forest reproducible. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    yhat
        Full ``predict_proba`` output for ``X_test``.
    yhat_positive
        Column 1 of ``yhat`` (probability of the second class).
    y_pred
        Boolean array, ``yhat_positive > classification_thres``.
    """
    #Random Forest model
    rf = RandomForestClassifier(n_estimators= 19, max_depth= 12, class_weight='balanced',
                                random_state=random_state)
    rf.fit(X_train, Y_train)

    # Score the test set once; the hard predictions are derived from these
    # probabilities with the caller's threshold (it used to be ignored in
    # favour of a hard-coded 0.45).
    yhat = rf.predict_proba(X_test)
    yhat_positive = yhat[:, 1]
    y_pred = yhat_positive > classification_thres

    print(classification_report(Y_test, y_pred))

    #Calculate ROC Score
    roc_score = roc_auc_score(Y_test, yhat_positive)
    print("ROC_AUC: "+ str(roc_score))
    print('\n')

    return yhat, yhat_positive, y_pred

#Cross Validation function

def perform_cross_validation(X, y, n_splits1, classification_thres=0.45):
    """Run stratified k-fold cross-validation of the random forest model.

    For every fold the model is fitted through ``run_RandomForest``, the
    fold's precision / recall / accuracy / ROC-AUC are recorded, and the
    fold's ROC curve is plotted with a sample of thresholds marked on it.

    NOTE: this definition replaces the earlier function of the same name
    (the logistic-regression variant) once this cell has been executed.

    Parameters
    ----------
    X, y
        pandas objects holding features and labels; rows are selected by
        position.
    n_splits1 : int
        Number of folds passed to ``StratifiedKFold``.
    classification_thres : float, default 0.45
        Probability cut-off forwarded to ``run_RandomForest``.

    Returns
    -------
    precision_list, recall_list, accuracy_list, roc_auc_list : list
        One entry per fold, in fold order.
    """
    kf = StratifiedKFold(n_splits=n_splits1,shuffle=True,random_state=42)
    recall_list = []
    precision_list = []
    accuracy_list = []
    roc_auc_list = []

    for fold_no, (train_index, test_index) in enumerate(kf.split(X, y), start=1):

        print('{} of KFold {}'.format(fold_no, kf.n_splits))

        # kf.split yields row positions, not index labels, so the rows must be
        # taken with .iloc; .loc is only equivalent on a default RangeIndex.
        xtr, xvl = X.iloc[train_index], X.iloc[test_index]
        ytr, yvl = y.iloc[train_index], y.iloc[test_index]

        yhat, yhat_positive, y_pred = run_RandomForest(
            X_train=xtr, Y_train=ytr, X_test=xvl, Y_test=yvl,
            classification_thres=classification_thres)

        precision_list.append(precision_score(y_pred=y_pred, y_true=yvl))
        recall_list.append(recall_score(y_pred=y_pred, y_true=yvl))
        accuracy_list.append(accuracy_score(y_pred=y_pred, y_true=yvl))
        roc_auc_list.append(roc_auc_score(yvl, yhat_positive))

        fpr, tpr, thresholds = metrics.roc_curve(yvl,  yhat_positive)

        plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
        # The curve belongs to the random forest, so label it accordingly.
        plt.plot(fpr, tpr, label='Random Forest')

        # Mark up to 20 evenly spaced points of the curve with their threshold.
        # The last valid position is len - 1; np.unique drops the repeats that
        # appear when the curve has fewer than 20 points.
        idx_list = np.unique(np.linspace(0, len(thresholds) - 1, num=20).astype(int))

        for ix in idx_list:
            plt.scatter(fpr[ix], tpr[ix], marker='o', color='red', label=thresholds[ix])
            plt.text(fpr[ix], tpr[ix], str(round(thresholds[ix],2)))

        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

    return precision_list, recall_list, accuracy_list, roc_auc_list

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=42,
)

# Fit the random forest on the training rows and print its test-set metrics.
yhat, yhat_positive, y_pred = run_RandomForest(
    X_train=X_train,
    X_test=X_test,
    Y_train=Y_train,
    Y_test=Y_test,
    classification_thres=0.45,
)

In [None]:
#To find the optimal tree depth
# Collect one record per depth and build the frame once at the end:
# DataFrame.append no longer exists in pandas 2.x and re-copied the whole
# frame on every iteration.
depth_records = []

for i in range(1,50):
    # A fixed seed keeps the forests comparable, so the curve reflects the
    # depth rather than run-to-run randomness.
    rf = RandomForestClassifier(n_estimators= 10, max_depth= i, class_weight='balanced', random_state=42)
    rf.fit(X_train, Y_train)
    y_pred = rf.predict(X_test)
    rs = recall_score(Y_test, y_pred)
    depth_records.append({'Ival':i,'Rs':rs})

df = pd.DataFrame(depth_records, columns=['Ival','Rs'])
print(df)
df.plot.line(x='Ival',y='Rs',style='.-')

In [None]:
#To find the optimal number of estimators
# Collect one record per forest size and build the frame once at the end:
# DataFrame.append no longer exists in pandas 2.x and re-copied the whole
# frame on every iteration.
estimator_records = []

for j in range(1,100):
    # A fixed seed keeps the forests comparable, so the curve reflects the
    # number of trees rather than run-to-run randomness.
    rf = RandomForestClassifier(n_estimators= j, max_depth= 12, class_weight='balanced', random_state=42)
    rf.fit(X_train, Y_train)
    y_pred = rf.predict(X_test)
    rs = recall_score(Y_test, y_pred)
    estimator_records.append({'n_estimators':j,'Rs':rs})

df = pd.DataFrame(estimator_records, columns=['n_estimators','Rs'])
print(df)
df.plot.line(x='n_estimators',y='Rs',style='.-')