# **Binary Classification on Widebot Data**

# *Download Needed Dependencies:*

In [153]:
!pip install category_encoders



# Get Access to Google drive through Colab:

In [154]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries:

In [184]:
import csv
from numpy import genfromtxt
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import copy
import seaborn as sns
import sklearn as sk
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold,cross_val_score
from sklearn import preprocessing,tree,svm
from sklearn.metrics import confusion_matrix, classification_report 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss,NeighbourhoodCleaningRule,CondensedNearestNeighbour,OneSidedSelection,EditedNearestNeighbours,TomekLinks
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import warnings
import random
from sklearn.preprocessing import normalize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers  import Dense,Dropout
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
%matplotlib inline
warnings.filterwarnings('ignore')

# Data Analysis :

After reviewing data, It's noticed that:

- The data contains numerical and categorical features. the unique values:
    - feature 1 -> a, b
    - feature 2, 3, 8, 14, 15, 17 -> floats
    - feature 4 -> u, y, l
    - feature 5 -> g, p, gg
    - feature 6 -> c, k, ff, i, j, q, W, d, m, cc, aa, r, x, e
    - feature 7 -> v, ff, o, h, j, bb, n, z, dd
    - feature 9, 10, 18 -> f, t
    - feature 11, 19 -> 0, 1
    - feature 12 -> f, t
    - feature 13 -> g, s, p
    - classLabel -> yes, no
- The data contains missing values in both the numerical and categorical.


## Blocks of the Algorithm :

- Calculate correlation between data to determine the linearly mapped features as it will make the data modeling biased.

- Handling Missing Data for categorical and numerical features.

- Transform the categorical features into numerical values.

- Use Machine Learning to model the data.

## 1 - Correlation Calculation :



In [156]:
def get_replace_map (csv_reader,cat_csv_reader):
    replace_map={}
    for name in cat_csv_reader.columns:
        labels = cat_csv_reader[name].astype('category').cat.categories.tolist()
        replace_map[name] = {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}
    return replace_map
def replace_cat_with_num(csv_reader,replace_map):
    csv_reader.replace(replace_map, inplace=True)
    return csv_reader

In [157]:
def correlation_enhancer(train_df,test_df,train_copy):
    correlation_cof = train_copy.corr(method ='pearson')
    vis = {}
    cnt=0
    print("pairs of highly correlated features :")
    for i in train_df.columns:
        for j in train_df.columns:
            if abs(correlation_cof[i][j])>=0.8 and i!=j:
                if vis.get(i)==None and vis.get(j)==None and i!='variable18' and j!='variable18' :
                    train_df.drop(i, axis=1,inplace=True)
                    test_df.drop(i, axis=1,inplace=True)
                    vis[i]=1
                    print(i,"  ",j, "  ",correlation_cof[i][j])
                    cnt+=1
    print("number of features to be droped : ",cnt)
    return train_df,test_df

In [158]:
train_df = pd.read_csv('drive/My Drive/Data/training.csv', sep=';',decimal=',')
test_df = pd.read_csv('drive/My Drive/Data/validation.csv', sep=';',decimal=',')
train_copy = train_df.dropna()
cat_train = train_df.select_dtypes(include=['object']).copy()
rp = get_replace_map (train_copy,cat_train)
train_copy = replace_cat_with_num(train_copy,rp)
train_df,test_df = correlation_enhancer(train_df,test_df,train_copy)

pairs of highly correlated features :
variable4    variable5    0.8319892083783643
variable14    variable17    1.00000000000001
variable19    classLabel    1.0
number of features to be droped :  3


Analysis :

## 2 - Handline Missing Data

In [160]:
def get_frequent_object(csv_reader,cat_csv_reader):
    frequent_object={}
    for name in cat_csv_reader.columns:
        frequent_object[name] = csv_reader[name].value_counts().index[0]
    return frequent_object
def modify_nan_dtype_object(csv_reader,cat_csv_reader,frequent_object):
    for name in cat_csv_reader.columns:
        for indx,row in csv_reader.iterrows():
            if pd.isna(csv_reader[name][indx]):
                csv_reader[name][indx] = frequent_object[name]
    return csv_reader
def modift_nan_flaot_int(csv_reader,avg_csv_reader):
    for name in csv_reader.columns:
        if csv_reader[name].dtype=='object':
            continue
        for indx,row in csv_reader.iterrows():
            if pd.isna(csv_reader[name][indx]):
                csv_reader[name][indx] = avg_csv_reader[name]
    return csv_reader

## 3 - Transform Categorical Features into numerical values :

In [161]:
def get_replace_map_binay_encoder (csv_reader_train,csv_reader_test,cat_csv_reader):
    for name in cat_csv_reader.columns:
        if name != 'classLabel':
            encoder = ce.BinaryEncoder(cols=[name])
            csv_reader_train = encoder.fit_transform(csv_reader_train)
            csv_reader_test = encoder.transform(csv_reader_test)
    labels = csv_reader_train['classLabel'].astype('category').cat.categories.tolist()
    replace_map = {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}
    csv_reader_train.replace(replace_map, inplace=True)
    csv_reader_test.replace(replace_map, inplace=True)
    return csv_reader_train,csv_reader_test

## 4 - Process train and test Data :

In [162]:
def get_train_data(csv_reader):
    csv_reader=csv_reader.drop('variable18', axis=1)
    #sns.catplot(x="variable3", y="classLabel", data=csv_reader);
    # handling missing data for categorical Variable
    cat_csv_reader = csv_reader.select_dtypes(include=['object']).copy()
    frequent_object = get_frequent_object(csv_reader,cat_csv_reader)
    csv_reader = modify_nan_dtype_object(csv_reader,cat_csv_reader,frequent_object)
    # handling missing data for numerical Variable
    int_float_modifier = csv_reader.mean(axis = 0)
    csv_reader = modift_nan_flaot_int(csv_reader,int_float_modifier)
    return csv_reader,int_float_modifier,frequent_object,cat_csv_reader

In [163]:
def get_test_data(csv_reader,int_float_modifier,frequent_object):
    csv_reader=csv_reader.drop('variable18', axis=1)
    # handling missing data for categorical Variable
    cat_csv_reader = csv_reader.select_dtypes(include=['object']).copy()
    csv_reader = modify_nan_dtype_object(csv_reader,cat_csv_reader,frequent_object)
    # handling missing data for numerical Variable
    csv_reader = modift_nan_flaot_int(csv_reader,int_float_modifier)
    return csv_reader

In [180]:
train_dataFrame,int_float_modifier,frequent_object,cat_csv_reader = get_train_data(train_df)
test_dataFrame = get_test_data(test_df,int_float_modifier,frequent_object)
train_dataFrame,test_dataFrame = get_replace_map_binay_encoder (train_dataFrame,test_dataFrame,cat_csv_reader)

## *5* - Normalize the data :

In [181]:
#def normalize_data(train_dataFrame,test_dataFrame,indx):
indx = [1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0]
y_train = train_dataFrame["classLabel"].values
y_test = test_dataFrame["classLabel"].values
train_data_all =  train_dataFrame.values 
test_data_all = test_dataFrame.values
x_mean = train_dataFrame.mean()
x_std = train_dataFrame.std()
for i in range(0,train_data_all.shape[1]):
    if (indx[i]==0):
        train_data_all[:,i] = (train_data_all[:,i] - x_mean[i])/ (x_std[i]*x_std[i] + 0.000000001)
        train_data_all[:,i] = (train_data_all[:,i] - x_mean[i])/ (x_std[i]*x_std[i] + 0.000000001)

train_data_all =  train_dataFrame.values 
test_data_all = test_dataFrame.values
x_train = train_data_all[:,:-1]
x_test = test_data_all[:,:-1]

In [None]:

lb = preprocessing.LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.fit_transform(y_test)

## *6* - Use Machine Learning to model the data :

In [191]:
suffled_indx = np.arange(x_train.shape[0])
random.shuffle(suffled_indx)
x_train=x_train[suffled_indx]
y_train=y_train[suffled_indx]

print("shape of train data : ",x_train.shape," shape of test : ",x_test.shape)
print("ones equal ",sum(y_train == 1))
print("zeros equal ",sum(y_train == 0))


clf = tree.DecisionTreeClassifier(max_depth=2) #max_depth=5
print()
clf = clf.fit(x_train, y_train.ravel())
print("tree")
print(clf.score(x_train, y_train.ravel()))
print(clf.score(x_test, y_test.ravel()))
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))

clf3 = RandomForestClassifier() #max_depth=5
clf3 = clf3.fit(x_train, y_train.ravel())
print("random forest")
print(clf3.score(x_train, y_train.ravel()))
print(clf3.score(x_test, y_test.ravel()))
predictions = clf3.predict(x_test)
print(classification_report(y_test, predictions))


print("SVM")
clf2 = svm.SVC(kernel='poly',degree=2,max_iter=50).fit(x_train, y_train) #kernel='rbf', C=3
print(clf2.score(x_train, y_train.ravel()))
print(clf2.score(x_test, y_test.ravel()))
predictions = clf2.predict(x_test)
print(classification_report(y_test, predictions))

print("Logestic Regression")
lr2 = LogisticRegression(max_iter=200) 
lr2.fit(x_train, y_train.ravel()) 
print(lr2.score(x_train, y_train.ravel()))
print(lr2.score(x_test, y_test.ravel()))
predictions = lr2.predict(x_test)
print(classification_report(y_test, predictions))

print("xgboost")
xg = XGBClassifier(max_iter=50)
xg.fit(x_train, y_train)
print(xg.score(x_train, y_train.ravel()))
print(xg.score(x_test, y_test.ravel()))
predictions = xg.predict(x_test)
print(classification_report(y_test, predictions))

print("Keras")
model = Sequential()
# For first layer, input shape must be supplied
layer1 = Dense(units = 300, activation = 'relu', input_dim = x_train.shape[1])
model.add(layer1)
model.add(Dropout(0.50))

layer2 = Dense(units = 1000, activation = 'relu')
model.add(layer2)

layer3 = Dense(units = 1000, activation = 'relu')
model.add(layer3)
layer5 = Dense(units = 500, activation = 'relu')
model.add(layer5)

layer4 = Dense(units = 1, activation = 'sigmoid')
model.add(layer4)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train, epochs=10)
nn_score = model.evaluate(x_test, y_test)[1]
print(nn_score)

shape of train data :  (3700, 30)  shape of test :  (200, 30)
ones equal  [3424]
zeros equal  [276]

tree
0.9454054054054054
0.79
              precision    recall  f1-score   support

           0       0.97      0.63      0.76       107
           1       0.69      0.98      0.81        93

    accuracy                           0.79       200
   macro avg       0.83      0.80      0.79       200
weighted avg       0.84      0.79      0.79       200

random forest
1.0
0.84
              precision    recall  f1-score   support

           0       0.93      0.76      0.84       107
           1       0.77      0.94      0.84        93

    accuracy                           0.84       200
   macro avg       0.85      0.85      0.84       200
weighted avg       0.86      0.84      0.84       200

SVM
0.9256756756756757
0.47
              precision    recall  f1-score   support

           0       1.00      0.01      0.02       107
           1       0.47      1.00      0.64        93

 