# Loading libraries and data

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [17]:
data = pd.read_csv('train.csv').sample(frac=1, replace=True, random_state=2)
data = data.reset_index(drop=True)

# Data characteristics

In [18]:
print("Shape: ", data.shape)
data.head()

Shape:  (300000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,100879,0,0,1,T,Y,Green,Circle,Lion,Russia,...,e1b66f5f2,1,Novice,Boiling Hot,c,B,ri,1,3,0
1,203245,0,0,0,F,Y,Blue,Trapezoid,Cat,Finland,...,506b8e5dd,1,Expert,Freezing,j,I,ZS,1,3,0
2,95816,0,0,1,F,Y,Blue,Square,Snake,India,...,602bcc56f,1,Novice,Freezing,g,F,tM,1,4,1
3,84434,0,0,0,T,N,Green,Triangle,Dog,China,...,97616a1ac,1,Grandmaster,Cold,l,Z,Uu,1,2,1
4,33867,0,0,0,F,Y,Red,Trapezoid,Snake,Russia,...,34058a17c,3,Master,Hot,a,Q,CM,4,1,0


In [4]:
data.nunique()

id        189775
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5        222
nom_6        522
nom_7       1219
nom_8       2211
nom_9      11849
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        192
day            7
month         12
target         2
dtype: int64

In [5]:
data.dtypes

id         int64
bin_0      int64
bin_1      int64
bin_2      int64
bin_3     object
bin_4     object
nom_0     object
nom_1     object
nom_2     object
nom_3     object
nom_4     object
nom_5     object
nom_6     object
nom_7     object
nom_8     object
nom_9     object
ord_0      int64
ord_1     object
ord_2     object
ord_3     object
ord_4     object
ord_5     object
day        int64
month      int64
target     int64
dtype: object

# Data preprocessing and cleaning

In [19]:
#columns that we can use directly
ready_columns = ['bin_0', 'bin_1', 'bin_2']

#categorical columns with string data
str_categorical_columns = ['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'] 

#ordinal columns
int_ordinal_columns = ['ord_0']
str_ordinal_columns = ['ord_1', 'ord_2', 'ord_3', 'ord_4']



#categorical columns with integer data
int_categorical_columns = ['day', 'month']

#columns wiht integer alues
real_columns = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

#column ord_5 is processed alone.

YY = data['target']

Creating a new dataframe for ML

In [20]:
#function to process the data
import string

def make_good_data_frame(in_data, n_pca = 0):
    
    lst_df = []
    
    #ready dataframe RD
    temp = in_data[ready_columns].copy()
    lst_df.append(temp)
    
    
    #String categorical dataframe SCD
    temp = pd.get_dummies(in_data[str_categorical_columns], drop_first=True)
    lst_df.append(temp)
    
    
    #ordinal columns
    lst_df.append(in_data['ord_0'])
    mapping = {"ord_1": {'Grandmaster':4, 'Expert':2, 'Novice':0, 'Contributor':1, 'Master':3},
               "ord_2": {'Cold':1, 'Hot':3, 'Lava Hot':5, 'Boiling Hot':4, 'Freezing':0, 'Warm':2},
               "ord_3": {'a':1, 'c':3, 'j':10, 'g':7, 'l':12, 'i':9, 'h':8, 'o':15, 'k':11, 'd':4, 'e':5, 'b':2, 'f':6, 'n':14, 'm':13},
               "ord_4": {'B':2, 'I':9, 'F':6, 'Z':26, 'Q':17, 'U':21, 'X':24, 'G':7, 'E':5, 'A':1, 'S':19, 'P':16, 'D':4, 'J':10, 'W':23, 'R':18, 'K':11, 'H':8,
                        'O':15, 'L':12, 'V':22, 'Y':25, 'T':20, 'M':13, 'C':3, 'N':14}}
    temp = in_data[str_ordinal_columns].replace(mapping).copy()
    lst_df.append(temp)
    
    
    #Integer categorical dataframe ICD
    temp = pd.DataFrame(index = in_data.index)
    for name in int_categorical_columns:
        tt = pd.get_dummies(in_data[name], drop_first=True)
        temp = pd.merge(temp, tt, left_index=True, right_index=True)
    lst_df.append(temp)
    
    
    #real columns
    temp = pd.DataFrame(index = in_data.index)
    for c in real_columns:
        temp[c] = in_data[c].apply(lambda x : int(x, 16))
    lst_df.append(temp)
    
    
    #ord_5
    temp = in_data['ord_5'].apply(lambda x : string.ascii_letters.index(x[0])*52 + string.ascii_letters.index(x[1])).copy()
    lst_df.append(temp)
    
    
    
    #mergin all the dataframes
    result = pd.DataFrame(index = in_data.index)
    for x in lst_df:
        result = pd.merge(result, x, left_index=True, right_index=True)
    
    #normalization
    result = ((result - result.min())/(result.max() - result.min()) - 0.5)*2.0
    
    #PCA
    if n_pca == 0:
        return result
    
    pca = PCA(n_components=n_pca)
    principalComponents = pca.fit_transform(result)
    result = pd.DataFrame(principalComponents)
    
    return result
   

XX = make_good_data_frame(data, 0)
XX.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3_T,bin_4_Y,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,...,9,10,11,12,nom_5,nom_6,nom_7,nom_8,nom_9,ord_5
0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.976434,-0.540579,0.077072,0.620118,0.763431,-0.339272
1,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-0.056445,-0.246672,-0.856991,-0.353292,-0.3718,1.0
2,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.150796,0.042395,-0.718599,-0.905704,-0.248732,-0.239792
3,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.41966,-0.250595,-0.375844,0.517312,0.182643,0.789161
4,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.150796,0.64821,0.964605,0.271891,-0.59369,0.107647


In [None]:
XX.nunique()

New training and test dataframes

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(XX, YY, test_size = 0.2, random_state = 2)

# ML section

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

1) Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter = 1000)
LR_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
lr_z_train = LR_clf.predict(X_train)
print("LR score training = ", round(LR_clf.score(X_train, y_train),3))
confusion_matrix(y_train, lr_z_train)

LR score training =  0.736


array([[151590,  15314],
       [ 48036,  25060]])

In [25]:
lr_z_test = LR_clf.predict(X_test)
print("LR score test = ", round(LR_clf.score(X_test, y_test),3))
confusion_matrix(y_test, lr_z_test)

LR score test =  0.735


array([[37650,  3850],
       [12060,  6440]])

In [26]:
print(cross_val_score(LR_clf, XX, YY, cv=5))

[0.73408777 0.735      0.73588333 0.73583333 0.73714562]


2) K Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k_clf = KNeighborsClassifier(n_neighbors=3)
k_clf.fit(X_train, y_train)

In [None]:
k_z_train = k_clf.predict(X_train)
print("K-N score training = ", round(k_clf.score(X_train, y_train),3))
confusion_matrix(y_train, k_z_train)

In [None]:
k_z_test = k_clf.predict(X_test)
print("K-N score test = ", round(k_clf.score(X_test, y_test),3))
confusion_matrix(y_test, k_z_test)

3) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rf_clf.fit(X_train, y_train)

In [None]:
rf_z_train = rf_clf.predict(X_train)
print("RF score training = ", round(rf_clf.score(X_train, y_train),3))
confusion_matrix(y_train, rf_z_train)

In [None]:
rf_z_test = rf_clf.predict(X_test)
print("RF score test = ", round(rf_clf.score(X_test, y_test),3))
confusion_matrix(y_test, rf_z_test)


4) Support vector machine

In [None]:
from sklearn import svm

svm_clf = svm.SVC(gamma='scale')
svm_clf.fit(X_train, y_train)

In [None]:
sv_z_train = svm_clf.predict(X_train)
print("SV score training = ", round(svm_clf.score(X_train, y_train),3))
confusion_matrix(y_train, sv_z_train)

In [None]:
sv_z_test = svm_clf.predict(X_test)
print("SV score test = ", round(svm_clf.score(X_test, y_test),3))
confusion_matrix(y_test, sv_z_test)

# Submission

In [27]:
test_data = pd.read_csv('test.csv')

XXt = make_good_data_frame(test_data, 0)

XXt.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3_T,bin_4_Y,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,...,9,10,11,12,nom_5,nom_6,nom_7,nom_8,nom_9,ord_5
0,-1.0,-1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,1.0,-1.0,-0.936089,0.226999,-0.353307,0.226973,-0.529102,-0.959911
1,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.309258,-0.790145,-0.855617,-0.448047,-0.684951,0.688938
2,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,-0.318,-0.750736,-0.691678,0.432343,-0.131334,-0.237565
3,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,0.560093,0.300584,-0.621728,-0.910999,0.420855,-0.612472
4,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,1.0,-1.0,0.826024,0.186299,0.281435,0.946266,0.175671,-0.35709


In [28]:
xxt_prediction = LR_clf.predict(XXt)
xxt_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
xxt_prediction = pd.DataFrame(xxt_prediction)
xxt_prediction.columns = ['target']

xxt_prediction['target'] = pd.to_numeric(xxt_prediction['target'], downcast='float')

result = pd.merge(test_data, xxt_prediction, left_index=True, right_index=True)

result[['id','target']].head()

Unnamed: 0,id,target
0,300000,0.0
1,300001,0.0
2,300002,0.0
3,300003,0.0
4,300004,1.0


In [30]:
result[['id','target']].to_csv("submission.csv", index=False)