## Binary Classification 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings("ignore")
from pandas_profiling import ProfileReport
import datetime as DT
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import statistics
try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; alias a wall-clock timer so existing call sites keep working
    from time import perf_counter as clock
from time import perf_counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


### Data Reading and some basic info about the dataframe

In [None]:
# Load the training table from the CSV file that sits next to the notebook.
data_read = pd.read_csv(
    "training_set.csv"
)

In [None]:
# Preview the raw rows. display() is required here: a notebook only renders the
# LAST expression of a cell, so a bare `data_read.head()` on the first line shows nothing.
display(data_read.head())

# Drop the unnecessary index column at position 0.
# NOTE: the drop is positional, so re-running this cell on its own removes one more
# column each time; re-run the CSV-reading cell first.
data_read = data_read.drop(data_read.columns[0],axis='columns')

# Author's observation: features X1 to X55 contain float values while X56 and X57 hold
# int values; the remaining column is the output feature / dependent variable
# (it is selected as 'Y' when the data is split later on).
data_read.info()

# Author's observation: some max values act as outliers in the X55, X56 and X57 columns.
data_read.describe()

### Exploratory Data Analysis

In [None]:
# Build the profiling report for the whole frame; the bare name below renders it in the cell output.
profile = ProfileReport(
    data_read,
    title="Pandas Profiling Report",
)
profile
# Findings from the report:
# - There are no missing values in the dataset.
# - Most columns consist of roughly 80-85% zeroes on average.
# - X34 is highly correlated with X32, so one of the two should be dropped.
# - 2376 rows belong to class '0' and 1534 rows belong to class '1'.

In [None]:
#We have already seen the distribution of the data in profiling we will continue with the correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Plot the correlation matrix of the usable numeric columns of ``df``.

    Non-numeric columns, columns containing NaN and columns with a single
    unique value are removed before the correlation is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated against each other.
    graphWidth : number
        Width and height of the (square) figure, passed to ``figsize``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If fewer than two columns
        survive the filtering, a message is printed and nothing is plotted.
    """
    # Restrict to numeric columns up front, matching plotScatterMatrix; corr() cannot use the others.
    df = df.select_dtypes(include=[np.number])
    # axis must be passed by keyword: the positional form is rejected by pandas 2.x.
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        # Interpolate the real column count (the original printed the placeholder text literally).
        print('No correlation plots shown: The number of non-NaN or constant columns (%d) is less than 2' % df.shape[1])
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title('Correlation Matrix', fontsize=15)
    plt.show()
    
## Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for up to ten numeric columns of ``df``.

    Columns containing NaN and columns with a single unique value are removed
    first. Each panel above the diagonal is annotated with the correlation
    coefficient of its pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; only numeric columns are used.
    plotSize : number
        Width and height of the (square) figure.
    textSize : number
        Font size of the correlation annotations.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    # (axis passed by keyword: the positional form is rejected by pandas 2.x)
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly instead of reaching it through matplotlib's private `plt.np` attribute.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
# Author's observation: features 32 and 34 are tightly correlated, while
# features 28 to 38 show a slightly weaker correlation.
plotCorrelationMatrix(df=data_read, graphWidth=15)
plotScatterMatrix(df=data_read, plotSize=20, textSize=15)

### Feature preprocessing and selection


In [None]:
# Only one pair of features is strongly correlated, so one of the two (X32) is removed.
# errors='ignore' keeps the cell re-runnable once X32 has already been dropped.
data_read = data_read.drop(columns='X32', errors='ignore')

# Getting the shape of the dataframe
print("shape of the data:", data_read.shape)

# No encoding is needed: all the columns are numerical.

# Features are every column except the target 'Y'. Selecting by name avoids a
# hard-coded column count that silently goes wrong if the frame's layout changes.
X = data_read.drop(columns='Y')
y = data_read['Y']

# Splitting the data 4:1 (80:20); the fixed random_state keeps the split reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.20, random_state=42)


### The LDA experiment in this section is for analysis only; it does not need to be run.

In [None]:
# Optional experiment: project the features onto a single LDA component to inspect
# the variance in the dataset.
# Leave this False for a normal top-to-bottom run so the models below keep training
# on the original features; set it to True only to train them on the 1-D LDA projection.
USE_LDA_FEATURES = False

LDA_transform = LDA(n_components=1)
# NOTE(review): the projection is fitted on all of X, i.e. including the rows that
# later land in the evaluation split, so scores on LDA features are optimistic.
LDA_transform.fit(X, y)
features_new = LDA_transform.transform(X)

# Print the number of features
print('Original feature #:', X.shape[1])
print('Reduced feature #:', features_new.shape[1])

# Print the ratio of explained variance
print(LDA_transform.explained_variance_ratio_)

if USE_LDA_FEATURES:
    # Replaces the train/eval features used by every model cell below.
    X_train, X_eval, y_train, y_eval = train_test_split(features_new, y, test_size=0.20, random_state=42)

# Author's observations after training Logistic Regression, Decision Tree and
# Random Forest on the new feature set:
# - Logistic Regression performed slightly better.
# - The tree classifiers did poorly.

In [None]:
# Try an automatic feature-selection method that removes features based on their variance.
# VarianceThreshold is a simple baseline approach to feature selection: it removes all
# features whose variance doesn't meet some threshold. By default, it removes all
# zero-variance features.
sel_variance_threshold = VarianceThreshold()

# Only the resulting shape matters here, so the transformed array is not kept around.
print(sel_variance_threshold.fit_transform(X_train).shape)

# The shape of X_train and of the variance-reduced train set is the same, so no feature
# is removed. Thus X_train is used as is.

In [None]:

# Creating the classifier instances shared by the training cells below.
clf_NB = GaussianNB()
# A fixed seed makes the fitted tree (and its feature ranking) repeatable between runs,
# in line with the seeded train/eval split.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_LR = LogisticRegression()


### Training the models


In [None]:
#Default Printing statements
def printing_statements_upper(tablename,model_name):
    """Print the header that introduces the score table of one model.

    tablename  -- label printed after "Table".
    model_name -- model name shown in the header and in the framed title row.
    """
    banner = "============"
    rule = "____________________________________________________________________"
    header_lines = [
        banner,
        "Table %s" % tablename,
        "\nScores for the %s" % model_name,
        banner,
        rule,
        "\t\t%s\t" % model_name,
        rule,
    ]
    # One print call; joining with newlines yields the same output as printing line by line.
    print("\n".join(header_lines))

def printing_statements_lower(accuracy,model_name,tim,F1):
    """Print the accuracy, training time and F1 score of one model.

    accuracy   -- accuracy value to report.
    model_name -- model name used in the training-time caption.
    tim        -- training time in seconds.
    F1         -- F-measure / F1 score to report.
    """
    separator = "============"
    report_lines = (
        "Accuracy is : %s" % accuracy,
        "\nThe training time for %s in seconds " % model_name,
        separator,
        "Model training time is : %s" % tim,
        "\nThe F-measure/F1 score is ",
        separator,
        "F1 score is : %s " % F1,
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
#Logistic Regression
def model_logistic():
    """Fit the shared logistic-regression classifier and print its evaluation scores.

    Trains ``clf_LR`` on ``X_train``/``y_train``, predicts on ``X_eval`` and
    prints accuracy, training time, F1 score and ROC AUC. Returns None.
    """
    printing_statements_upper('3','Logistic Regression')
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time = perf_counter()
    clf_LR.fit(X_train, y_train)
    t_LR = round(perf_counter() - start_time,3)
    pred = clf_LR.predict(X_eval)
    # ROC characteristics, computed from the predicted probability of class 1
    fpr, tpr, _ = roc_curve(y_eval, clf_LR.predict_proba(X_eval)[:, 1])
    roc_auc_lr = auc(fpr, tpr)
    acc_LR = round(accuracy_score(y_eval, pred),4)
    fm_LR = f1_score(y_eval, pred)
    printing_statements_lower(acc_LR,'Logistic Regression',t_LR,fm_LR)
    # Report the AUC as well; it was previously computed and then discarded.
    print("ROC AUC is : %s" % round(roc_auc_lr,4))

model_logistic()

In [None]:
#Naive Bayes Classifier
def model_training_naive():
    """Fit the shared Gaussian Naive Bayes classifier and print its evaluation scores.

    Trains ``clf_NB`` on ``X_train``/``y_train``, predicts on ``X_eval`` and
    prints accuracy, training time, F1 score and ROC AUC. Returns None.
    """
    printing_statements_upper('1','Naive Bayes')
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time_naive = perf_counter()
    clf_NB.fit(X_train, y_train)
    t_NB = round(perf_counter() - start_time_naive,3)
    pred_naive = clf_NB.predict(X_eval)
    # ROC characteristics must come from the Naive Bayes model itself
    # (the logistic-regression model was scored here before by mistake).
    fpr, tpr, _ = roc_curve(y_eval, clf_NB.predict_proba(X_eval)[:, 1])
    roc_auc_nb = auc(fpr, tpr)
    acc_NB = round(accuracy_score(y_eval, pred_naive),4)
    fm_NB = f1_score(y_eval, pred_naive)
    printing_statements_lower(acc_NB,'Naive Bayes',t_NB,fm_NB)
    # Report the AUC as well; it was previously computed and then discarded.
    print("ROC AUC is : %s" % round(roc_auc_nb,4))


model_training_naive()

In [None]:
#Decision Tree
def model_training_dt():
    """Fit the shared decision-tree classifier and print its evaluation scores.

    Trains ``clf_DT`` on ``X_train``/``y_train``, predicts on ``X_eval`` and
    prints accuracy, training time, F1 score and ROC AUC. Returns None.
    """
    printing_statements_upper('2','Decision Tree')
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time = perf_counter()
    clf_DT.fit(X_train, y_train)
    t_DT = round(perf_counter() - start_time,3)
    pred = clf_DT.predict(X_eval)
    # ROC characteristics, computed from the predicted probability of class 1
    fpr, tpr, _ = roc_curve(y_eval, clf_DT.predict_proba(X_eval)[:, 1])
    roc_auc_dt = auc(fpr, tpr)
    acc_DT = round(accuracy_score(y_eval, pred),4)
    fm_DT = f1_score(y_eval, pred)
    printing_statements_lower(acc_DT,'Decision Tree',t_DT,fm_DT)
    # Report the AUC as well; it was previously computed and then discarded.
    print("ROC AUC is : %s" % round(roc_auc_dt,4))

model_training_dt()


importances = clf_DT.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
# Column names are shown next to the positional index: after the earlier column drops
# a position no longer equals the number in the feature's name.
feature_names = list(X.columns)
names_match = len(feature_names) == len(importances)
# Iterate over the importances themselves so the loop can never run past their length,
# even if the tree was fitted on a different number of features than X holds.
for rank, idx in enumerate(indices, start=1):
    label = feature_names[idx] if names_match else "?"
    print("%d. feature %d [%s] (%f)" % (rank, idx, label, importances[idx]))

# Author's observation: X51 and X6 have the highest feature importance in the prediction

In [None]:
#Random Forest
def random_for():
    """Fit a random forest, print its evaluation scores and return the fitted model.

    Trains a 73-tree ``RandomForestClassifier`` on ``X_train``/``y_train`` and
    prints accuracy, training time, ROC AUC, the confusion matrix and the
    classification report for ``X_eval``/``y_eval``.

    Returns
    -------
    RandomForestClassifier
        The fitted model, so it can be reused (e.g. saved) outside this function.
    """
    printing_statements_upper('4','Random Forest')
    # Tried various combinations for the estimators; the seed makes the forest repeatable.
    rf = RandomForestClassifier(n_estimators=73, random_state=42)
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time_rdm = perf_counter()
    rf.fit(X_train, y_train)
    t_RF = round(perf_counter() - start_time_rdm,3)
    y_pred = rf.predict(X_eval)
    # ROC characteristics, computed from the predicted probability of class 1
    fpr, tpr, _ = roc_curve(y_eval, rf.predict_proba(X_eval)[:, 1])
    roc_auc_rf = auc(fpr, tpr)
    rf_acc = rf.score(X_eval, y_eval)
    print("The accuracy is : ",rf_acc)
    print("Training Time is : ",t_RF)
    print("ROC AUC is : ",round(roc_auc_rf,4))
    # Print the confusion matrix; inside a function a bare expression displays nothing.
    print(confusion_matrix(y_eval, y_pred))
    print(classification_report(y_eval,y_pred))
    return rf

# Keep the fitted forest at notebook scope: the model-saving cell pickles `rf`.
rf = random_for()

In [None]:
def show_auc(y_true, y_score,model_name):
    """Plot the ROC curve of one model and show it.

    y_true     -- true labels.
    y_score    -- scores for the positive class, passed to ``roc_curve``.
    model_name -- model name used in the plot title.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    plt.figure()
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Dashed diagonal as a visual reference line
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('Receiver operating characteristic for %s' % model_name)
    plt.legend(loc="lower right")
    plt.show()

# ROC curves for the different models, drawn in the order listed.
#show_auc(y_eval,rf.predict_proba(X_eval)[:, 1],'Random Forest')
for fitted_model, model_label in (
    (clf_LR, 'Logistic Regression'),
    (clf_DT, 'Decision Tree'),
    (clf_NB, 'Naive Bayes'),
):
    # Column 1 of predict_proba is the predicted probability of class 1
    show_auc(y_eval, fitted_model.predict_proba(X_eval)[:,1], model_label)

#Hence I will go with the Random Forest for testing it with hold out set we have reserved.
#So will just save the model file of Random Forest for testing it.

In [None]:
# Saving the model: a one-time activity.
filename = 'Best_classifier.pckl'
# `rf` must be the fitted random forest, available at notebook scope before this cell runs.
# The with-block closes the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# To reload later (only unpickle files you trust: pickle can execute arbitrary code on load):
#loaded_model = pickle.load(open(filename, 'rb'))



### In terms of training speed, Naive Bayes outperforms all the other models, but the accuracy, F1 and ROC characteristics are better for Random Forest and Decision Tree.
### So I will choose the Random Forest algorithm for the test data set.

In [None]:
# Check whether scaling helps with this data, by scaling X_train and X_eval and
# retraining the average-performing models to compare accuracy and F1 score.
# Author's observation: the scores do not change when the dataset is scaled before training.
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-feature range from the training split only, then apply it to both splits.
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX_eval = scaler.transform(X_eval)

In [None]:
def model_training_naive():
    """Fit the shared Gaussian Naive Bayes classifier on the min-max scaled features.

    Trains ``clf_NB`` on ``rescaledX_train``/``y_train``, predicts on
    ``rescaledX_eval`` and prints accuracy, training time, F1 score and ROC AUC.
    This definition replaces the earlier function of the same name, which used
    the unscaled features. Returns None.
    """
    printing_statements_upper('1','Naive Bayes')
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time_naive = perf_counter()
    clf_NB.fit(rescaledX_train, y_train)
    t_NB = round(perf_counter() - start_time_naive,3)
    pred_naive = clf_NB.predict(rescaledX_eval)
    # ROC characteristics must come from the Naive Bayes model itself; the
    # logistic-regression model was scored here before by mistake, and on scaled
    # inputs it was not fitted on.
    fpr, tpr, _ = roc_curve(y_eval, clf_NB.predict_proba(rescaledX_eval)[:, 1])
    roc_auc_nb = auc(fpr, tpr)
    acc_NB = round(accuracy_score(y_eval, pred_naive),4)
    fm_NB = f1_score(y_eval, pred_naive)
    printing_statements_lower(acc_NB,'Naive Bayes',t_NB,fm_NB)
    # Report the AUC as well; it was previously computed and then discarded.
    print("ROC AUC is : %s" % round(roc_auc_nb,4))

model_training_naive()

In [None]:
def model_training_dt():
    """Fit the shared decision-tree classifier on the min-max scaled features.

    Trains ``clf_DT`` on ``rescaledX_train``/``y_train``, predicts on
    ``rescaledX_eval`` and prints accuracy, training time, F1 score and ROC AUC.
    This definition replaces the earlier function of the same name, which used
    the unscaled features. Returns None.
    """
    printing_statements_upper('2','Decision Tree')
    # perf_counter replaces time.clock, which was removed in Python 3.8.
    start_time = perf_counter()
    clf_DT.fit(rescaledX_train, y_train)
    t_DT = round(perf_counter() - start_time,3)
    pred = clf_DT.predict(rescaledX_eval)
    # ROC characteristics, computed from the predicted probability of class 1
    fpr, tpr, _ = roc_curve(y_eval, clf_DT.predict_proba(rescaledX_eval)[:, 1])
    roc_auc_dt = auc(fpr, tpr)
    acc_DT = round(accuracy_score(y_eval, pred),4)
    fm_DT = f1_score(y_eval, pred)
    printing_statements_lower(acc_DT,'Decision Tree',t_DT,fm_DT)
    # Report the AUC as well; it was previously computed and then discarded.
    print("ROC AUC is : %s" % round(roc_auc_dt,4))

model_training_dt()


importances = clf_DT.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
# Column names are shown next to the positional index: after the earlier column drops
# a position no longer equals the number in the feature's name.
feature_names = list(X.columns)
names_match = len(feature_names) == len(importances)
# Iterate over the importances themselves so the loop can never run past their length,
# even if the tree was fitted on a different number of features than X holds.
for rank, idx in enumerate(indices, start=1):
    label = feature_names[idx] if names_match else "?"
    print("%d. feature %d [%s] (%f)" % (rank, idx, label, importances[idx]))

# Author's observation: X51 and X6 have the highest feature importance in the prediction