# Loading libraries and data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('train.csv')#sample(frac=1, replace=True, random_state=2)

#making the same proportion
data0 = data[data.target == 0].sample(frac=0.43, replace=True, random_state=2)
data1 = data[data.target == 1]

frac0 = data0.shape[0]/data.shape[0]
frac1 = data1.shape[0]/data.shape[0]

print("Fraction of 0: ",frac0, "\nFraction of 1: ", frac1)

data = pd.concat([data0, data1]).sample(frac=1.0, replace=True, random_state=2)

data = data.reset_index(drop=True)


Fraction of 0:  0.29847 
Fraction of 1:  0.30588


# Data characteristics

In [3]:
print("Shape: ", data.shape)
data.head()

Shape:  (181305, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,273783,0,0,1,T,Y,Red,Trapezoid,Snake,India,...,6e9491509,1,Novice,Freezing,i,Q,Bd,4,9,0
1,36912,0,0,1,F,N,Green,Star,Axolotl,Russia,...,26e51923f,2,Novice,Lava Hot,d,L,qo,3,3,1
2,20838,1,0,1,F,N,Red,Circle,Hamster,Canada,...,ccd9b1eba,1,Master,Warm,g,P,aO,2,10,1
3,281645,0,0,0,F,Y,Red,Star,Dog,China,...,8036de9b5,1,Novice,Boiling Hot,g,X,sD,1,1,1
4,236929,0,1,0,F,N,Red,Circle,Snake,Canada,...,e8b921583,2,Grandmaster,Warm,j,Q,Zq,4,2,1


In [None]:
data.nunique()

In [None]:
data.dtypes

# Data preprocessing and cleaning

In [4]:
#columns that we can use directly
ready_columns = ['bin_0', 'bin_1', 'bin_2']

#categorical columns with string data
str_categorical_columns = ['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'] 

#ordinal columns
int_ordinal_columns = ['ord_0']
str_ordinal_columns = ['ord_1', 'ord_2', 'ord_3', 'ord_4']



#categorical columns with integer data
int_categorical_columns = ['day', 'month']

#columns wiht integer alues
real_columns = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

#column ord_5 is processed alone.

YY = data['target']

Creating a new dataframe for ML

In [26]:
#function to process the data
import string

def make_good_data_frame(in_data, n_pca = 0):
    
    lst_df = []
    
    #ready dataframe RD
    temp = in_data[ready_columns].copy()
    lst_df.append(temp)
    
    
    #String categorical dataframe SCD
    temp = pd.get_dummies(in_data[str_categorical_columns], drop_first=True)
    lst_df.append(temp)
    
    
    #ordinal columns
    lst_df.append(in_data['ord_0'])
    mapping = {"ord_1": {'Grandmaster':4, 'Expert':2, 'Novice':0, 'Contributor':1, 'Master':3},
               "ord_2": {'Cold':1, 'Hot':3, 'Lava Hot':5, 'Boiling Hot':4, 'Freezing':0, 'Warm':2},
               "ord_3": {'a':1, 'c':3, 'j':10, 'g':7, 'l':12, 'i':9, 'h':8, 'o':15, 'k':11, 'd':4, 'e':5, 'b':2, 'f':6, 'n':14, 'm':13},
               "ord_4": {'B':2, 'I':9, 'F':6, 'Z':26, 'Q':17, 'U':21, 'X':24, 'G':7, 'E':5, 'A':1, 'S':19, 'P':16, 'D':4, 'J':10, 'W':23, 'R':18, 'K':11, 'H':8,
                        'O':15, 'L':12, 'V':22, 'Y':25, 'T':20, 'M':13, 'C':3, 'N':14}}
    temp = in_data[str_ordinal_columns].replace(mapping).copy()
    lst_df.append(temp)
    
    
    #Integer categorical dataframe ICD
    temp = pd.DataFrame(index = in_data.index)
    for name in int_categorical_columns:
        tt = pd.get_dummies(in_data[name], drop_first=True)
        temp = pd.merge(temp, tt, left_index=True, right_index=True)
    lst_df.append(temp)
    
    
    #real columns
    temp = pd.DataFrame(index = in_data.index)
    for c in real_columns:
        temp[c] = in_data[c].apply(lambda x : int(x, 16))
    lst_df.append(temp)
    
    
    #ord_5
    temp = in_data['ord_5'].apply(lambda x : string.ascii_letters.index(x[0])*52 + string.ascii_letters.index(x[1])).copy()
    lst_df.append(temp)
    
    
    
    #mergin all the dataframes
    result = pd.DataFrame(index = in_data.index)
    for x in lst_df:
        result = pd.merge(result, x, left_index=True, right_index=True)
    
    #normalization
    result = ((result - result.min())/(result.max() - result.min()) - 0.5)*2.0
    
    #PCA
    if n_pca == 0:
        return result
    
    pca = PCA(n_components=n_pca)
    principalComponents = pca.fit_transform(result)
    result = pd.DataFrame(principalComponents)
    
    return result
   

XX = make_good_data_frame(data, 0)
XX.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3_T,bin_4_Y,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,...,9,10,11,12,nom_5,nom_6,nom_7,nom_8,nom_9,ord_5
0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,-1.0,-0.344325,0.17815,0.401933,0.620118,-0.136146,0.043059
1,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-0.437942,0.226999,-0.32702,0.449616,-0.696254,-0.373422
2,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,-1.0,1.0,-1.0,-1.0,0.936982,-0.941429,-0.827289,-0.778288,0.600425,-0.971789
3,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-0.120732,0.440385,-0.436127,-0.386356,0.001636,-0.285078
4,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.119677,0.744468,-0.797221,0.543121,0.818207,0.979213


In [None]:
XX.nunique()

New training and test dataframes

In [28]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(XX, YY, test_size = 0.2, random_state = 3)

# ML section

In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

1) Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter = 1000)
LR_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
lr_z_train = LR_clf.predict(X_train)
print("LR score training = ", round(LR_clf.score(X_train, y_train),3))
confusion_matrix(y_train, lr_z_train)

LR score training =  0.684


array([[48017, 23566],
       [22299, 51162]])

In [32]:
lr_z_test = LR_clf.predict(X_test)
print("LR score test = ", round(LR_clf.score(X_test, y_test),3))
confusion_matrix(y_test, lr_z_test)

LR score test =  0.681


array([[12032,  5969],
       [ 5588, 12672]])

In [33]:
print(cross_val_score(LR_clf, XX, YY, cv=5))

[0.68200871 0.68359946 0.68238603 0.68346157 0.68667954]


2) K Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k_clf = KNeighborsClassifier(n_neighbors=3)
k_clf.fit(X_train, y_train)

In [None]:
k_z_train = k_clf.predict(X_train)
print("K-N score training = ", round(k_clf.score(X_train, y_train),3))
confusion_matrix(y_train, k_z_train)

In [None]:
k_z_test = k_clf.predict(X_test)
print("K-N score test = ", round(k_clf.score(X_test, y_test),3))
confusion_matrix(y_test, k_z_test)

3) Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [19]:
rf_z_train = rf_clf.predict(X_train)
print("RF score training = ", round(rf_clf.score(X_train, y_train),3))
confusion_matrix(y_train, rf_z_train)

RF score training =  0.643


array([[42438, 29253],
       [22469, 50884]])

In [20]:
rf_z_test = rf_clf.predict(X_test)
print("RF score test = ", round(rf_clf.score(X_test, y_test),3))
confusion_matrix(y_test, rf_z_test)

RF score test =  0.606


array([[ 9976,  7917],
       [ 6356, 12012]])

In [21]:
print(cross_val_score(rf_clf, XX, YY, cv=5))

[0.60316585 0.60365131 0.60461653 0.60083837 0.60435742]



4) Support vector machine

In [None]:
from sklearn import svm

svm_clf = svm.LinearSVC(max_iter = 1000)
svm_clf.fit(X_train, y_train)

In [None]:
sv_z_train = svm_clf.predict(X_train)
print("SV score training = ", round(svm_clf.score(X_train, y_train),3))
confusion_matrix(y_train, sv_z_train)

In [None]:
sv_z_test = svm_clf.predict(X_test)
print("SV score test = ", round(svm_clf.score(X_test, y_test),3))
confusion_matrix(y_test, sv_z_test)

In [None]:
print(cross_val_score(svm_clf, XX, YY, cv=5))

5) Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [23]:
nb_z_train = nb_clf.predict(X_train)
print("NB score training = ", round(nb_clf.score(X_train, y_train),3))
confusion_matrix(y_train, nb_z_train)

NB score training =  0.535


array([[32307, 39384],
       [28105, 45248]])

In [24]:
nb_z_test = nb_clf.predict(X_test)
print("NB score test = ", round(nb_clf.score(X_test, y_test),3))
confusion_matrix(y_test, nb_z_test)

NB score test =  0.535


array([[ 8057,  9836],
       [ 7043, 11325]])

In [25]:
print(cross_val_score(nb_clf, XX, YY, cv=5))

[0.53171364 0.53054246 0.53476187 0.53616834 0.53546608]


# Submission

In [None]:
test_data = pd.read_csv('test.csv')

XXt = make_good_data_frame(test_data, 0)

XXt.head()

In [None]:
xxt_prediction = rf_clf.predict(XXt)
xxt_prediction

In [None]:
xxt_prediction = pd.DataFrame(xxt_prediction)
xxt_prediction.columns = ['target']

xxt_prediction['target'] = pd.to_numeric(xxt_prediction['target'], downcast='float')

result = pd.merge(test_data, xxt_prediction, left_index=True, right_index=True)

result[['id','target']].head()


In [None]:
result[['id','target']].to_csv("submission.csv", index=False)