# Loading libraries and data

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [88]:
# Load the full training set unchanged.
# The trailing/commented code is a disabled experiment: bootstrap-resampling
# the rows (sampling with replacement yields duplicate index labels, hence the
# accompanying reset_index).
data = pd.read_csv('train.csv')#.sample(frac=1.0, replace=True, random_state=2)
#data = data.reset_index(drop=True)


# Data characteristics

In [89]:
# Report the frame's dimensions, then preview its first rows.
print("Shape: ", data.shape)
data.head()

Shape:  (300000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [44]:
# Count distinct values per column to tell low- from high-cardinality features.
# NOTE(review): the stored output lists 189,775 distinct ids although the frame
# has 300,000 rows - it looks like it was produced while the bootstrap
# resampling in the load cell was enabled; re-run to refresh.
data.nunique()

id        189775
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5        222
nom_6        522
nom_7       1219
nom_8       2211
nom_9      11849
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        192
day            7
month         12
target         2
dtype: int64

In [45]:
# Inspect column dtypes; object columns hold strings and must be encoded
# numerically before they can be fed to a model.
data.dtypes

id         int64
bin_0      int64
bin_1      int64
bin_2      int64
bin_3     object
bin_4     object
nom_0     object
nom_1     object
nom_2     object
nom_3     object
nom_4     object
nom_5     object
nom_6     object
nom_7     object
nom_8     object
nom_9     object
ord_0      int64
ord_1     object
ord_2     object
ord_3     object
ord_4     object
ord_5     object
day        int64
month      int64
target     int64
dtype: object

# Data preprocessing and cleaning

In [90]:
# Integer-typed binary columns that are used unchanged.
ready_columns = ['bin_0', 'bin_1', 'bin_2']

# Low-cardinality columns holding strings; these get one-hot encoded.
str_categorical_columns = ['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'ord_1', 'ord_2', 'ord_3', 'ord_4']

# Low-cardinality columns holding integers; these get one-hot encoded as well.
int_categorical_columns = ['ord_0', 'day', 'month']

# High-cardinality columns holding hexadecimal strings; each value is parsed
# as a base-16 integer and then min-max scaled.
real_columns = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

# ord_5 is deliberately absent from the lists above; it gets its own encoding.

# Prediction target.
YY = data['target']

Creating a new dataframe for ML

In [105]:
#function to process the data

def make_good_data_frame(in_data, n_pca = 0):
    """Turn a raw competition frame into an all-numeric feature frame.

    The output columns are, in order: the columns in ``ready_columns`` copied
    unchanged, one-hot indicators for ``str_categorical_columns`` and
    ``int_categorical_columns`` (first level of each dropped), the
    ``real_columns`` parsed as base-16 integers and min-max scaled, and a
    min-max scaled numeric encoding of ``ord_5``.

    Parameters
    ----------
    in_data : pandas.DataFrame
        Frame containing every column named in the module-level column lists
        plus ``ord_5``.
    n_pca : int, default 0
        When 0 the encoded frame is returned as is; otherwise it is projected
        onto ``n_pca`` principal components.

    Returns
    -------
    pandas.DataFrame
        Encoded features indexed like ``in_data``. With ``n_pca != 0`` the
        columns are the component numbers ``0 .. n_pca - 1``.

    Notes
    -----
    The indicator columns and the min/max used for scaling are derived from
    ``in_data`` alone, so frames encoded by separate calls only line up when
    they contain the same category levels and value ranges.
    """
    def _min_max(frame):
        # Scale every column to [0, 1]. A constant column has zero range, so
        # divide by 1 there instead of 0 (gives 0.0 rather than NaN).
        low = frame.min()
        span = (frame.max() - low).replace(0, 1)
        return (frame - low) / span

    # Columns that need no transformation.
    ready_part = in_data[ready_columns].copy()

    # One-hot encode the string categoricals (indicator names are prefixed
    # with the source column automatically).
    str_part = pd.get_dummies(in_data[str_categorical_columns], drop_first=True)

    # One-hot encode the integer categoricals. Passing `columns=` forces the
    # encoding of these numeric columns and prefixes each indicator with its
    # source column (e.g. day_2), so integer levels that occur in more than
    # one of these columns no longer produce clashing column labels.
    int_part = pd.get_dummies(
        in_data[int_categorical_columns],
        columns=int_categorical_columns,
        drop_first=True,
    )

    # Hexadecimal strings -> integers -> [0, 1].
    real_part = _min_max(pd.DataFrame(
        {c: in_data[c].apply(lambda x: int(x, 16)) for c in real_columns}
    ))

    # ord_5: sum of the code points of its first two characters, scaled to [0, 1].
    # NOTE(review): strings with equal character sums (e.g. 'ab' and 'ba')
    # collapse to the same value - confirm this encoding is intended.
    ord_5_part = _min_max(
        in_data['ord_5'].apply(lambda x: ord(x[0]) + ord(x[1])).to_frame('ord_5')
    )

    # Every piece shares in_data's index, so place them side by side in one
    # step; unlike chained index merges this does not multiply rows when the
    # index contains duplicate labels.
    result = pd.concat(
        [ready_part, str_part, int_part, real_part, ord_5_part], axis=1
    )

    #PCA
    if n_pca == 0:
        return result

    pca = PCA(n_components=n_pca)
    principalComponents = pca.fit_transform(result)
    # Keep the input's index so the projected rows stay aligned with the target.
    return pd.DataFrame(data=principalComponents, index=result.index)

# Encode the training frame without PCA and preview the result.
XX = make_good_data_frame(in_data=data, n_pca=0)
XX.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3_T,bin_4_Y,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,...,9,10,11,12,nom_5,nom_6,nom_7,nom_8,nom_9,ord_5
0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0.318918,0.229711,0.409083,0.763774,0.184708,0.818182
1,0,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0.70983,0.984441,0.230843,0.300073,0.969706,0.336364
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0.197992,0.035706,0.650502,0.86956,0.681276,0.381818
3,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0.952962,0.316061,0.923618,0.29234,0.509517,0.572727
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0.358966,0.124632,0.017261,0.793971,0.692946,0.563636


New training and test dataframes

In [106]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    XX,
    YY,
    test_size=0.1,
    random_state=2,
)

# ML section

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

1) Logistic Regression

In [107]:
# Fit the logistic-regression model; max_iter is raised to 1000 for the
# lbfgs solver.
LR_clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_train)
LR_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [108]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that LR_clf.score(X_train, y_train)
# would obtain by running the same prediction a second time.
lr_z_train = LR_clf.predict(X_train)
print("LR score training = ", round((y_train == lr_z_train).mean(), 3))
confusion_matrix(y_train, lr_z_train)

LR score training =  0.74


array([[169581,  17909],
       [ 52347,  30163]])

In [109]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that LR_clf.score(X_test, y_test)
# would obtain by running the same prediction a second time.
lr_z_test = LR_clf.predict(X_test)
print("LR score test = ", round((y_test == lr_z_test).mean(), 3))
confusion_matrix(y_test, lr_z_test)

LR score test =  0.739


array([[18808,  1938],
       [ 5898,  3356]])

2) K Neighbors Classifier

In [229]:
# Fit a 7-nearest-neighbours classifier on the training split.
k_clf = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
k_clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

In [192]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that k_clf.score(X_train, y_train)
# would obtain by running the same prediction a second time.
# NOTE(review): the stored confusion matrix below totals 22,500 rows, far
# fewer than the logistic-regression training matrix - it appears to be left
# over from a run on a subsample; re-run to refresh.
k_z_train = k_clf.predict(X_train)
print("K-N score training = ", round((y_train == k_z_train).mean(), 3))
confusion_matrix(y_train, k_z_train)

K-N score training =  0.752


array([[14714,   860],
       [ 4709,  2217]])

In [193]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that k_clf.score(X_test, y_test)
# would obtain by running the same prediction a second time.
# NOTE(review): the stored confusion matrix below totals 7,500 rows, far
# fewer than the logistic-regression test matrix - it appears to be left over
# from a run on a subsample; re-run to refresh.
k_z_test = k_clf.predict(X_test)
print("K-N score test = ", round((y_test == k_z_test).mean(), 3))
confusion_matrix(y_test, k_z_test)

K-N score test =  0.686


array([[4740,  476],
       [1880,  404]])

3) Random Forest

In [97]:
# Fit a 100-tree random forest with depth capped at 10; seeded for repeatability.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)
rf_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [98]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that rf_clf.score(X_train, y_train)
# would obtain by running the same prediction a second time.
rf_z_train = rf_clf.predict(X_train)
print("RF score training = ", round((y_train == rf_z_train).mean(), 3))
confusion_matrix(y_train, rf_z_train)

RF score training =  0.709


array([[184683,   2807],
       [ 75797,   6713]])

In [99]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that rf_clf.score(X_test, y_test)
# would obtain by running the same prediction a second time.
rf_z_test = rf_clf.predict(X_test)
print("RF score test = ", round((y_test == rf_z_test).mean(), 3))
confusion_matrix(y_test, rf_z_test)

RF score test =  0.697


array([[20334,   412],
       [ 8680,   574]])

4) Support vector machine

In [None]:
# Fit a support-vector classifier on the training split.
# NOTE(review): this cell and the two SVM evaluation cells carry no execution
# count or stored output, i.e. they were not run in the saved notebook.
# Fitting a kernel SVC on all of X_train is presumably very slow - confirm it
# finishes (or fit on a subsample) before relying on Run All.
svm_clf = SVC(gamma='auto')
svm_clf.fit(X_train, y_train)

In [None]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that svm_clf.score(X_train, y_train)
# would obtain by running the same prediction a second time.
sv_z_train = svm_clf.predict(X_train)
print("SV score training = ", round((y_train == sv_z_train).mean(), 3))
confusion_matrix(y_train, sv_z_train)

In [None]:
# Predict once and reuse the labels for both metrics: the mean of
# (truth == prediction) is the accuracy that svm_clf.score(X_test, y_test)
# would obtain by running the same prediction a second time.
sv_z_test = svm_clf.predict(X_test)
print("SV score test = ", round((y_test == sv_z_test).mean(), 3))
confusion_matrix(y_test, sv_z_test)

# Submission

In [110]:
test_data = pd.read_csv('test.csv')

# Encode the test rows with the same routine as the training rows, then force
# the result onto the training feature layout: the encoder derives its
# indicator columns from whatever frame it is given, so a category level
# missing from (or new in) the test set would otherwise shift the columns the
# fitted model sees. Missing indicators are filled with 0, unseen ones dropped.
XXt = make_good_data_frame(test_data, 0).reindex(columns=XX.columns, fill_value=0)

XXt.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3_T,bin_4_Y,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,...,9,10,11,12,nom_5,nom_6,nom_7,nom_8,nom_9,ord_5
0,0,0,1,1,1,0,0,0,0,0,...,0,0,1,0,0.031956,0.613499,0.323347,0.613486,0.235449,0.618182
1,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0.654629,0.104927,0.072192,0.275976,0.157525,0.281818
2,1,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0.341,0.124632,0.154161,0.716172,0.434333,0.590909
3,0,0,1,1,1,0,1,0,0,1,...,0,0,0,0,0.780047,0.650292,0.189136,0.044501,0.710428,0.7
4,0,1,1,0,0,0,1,0,0,0,...,0,0,1,0,0.913012,0.593149,0.640717,0.973133,0.587835,0.518182


In [112]:
# Predict hard class labels for the test rows with the logistic-regression model.
# NOTE(review): if the submission is scored with a ranking metric (e.g. AUC),
# class probabilities from predict_proba would be the better output - confirm.
xxt_prediction = LR_clf.predict(XXt)
xxt_prediction

array([0, 1, 0, ..., 0, 0, 0])

In [113]:
# Wrap the predicted labels in a single-column frame named like the
# submission field.
xxt_prediction = pd.DataFrame(xxt_prediction)
xxt_prediction.columns = ['target']

# Store the labels as the smallest float type that can hold them.
xxt_prediction['target'] = pd.to_numeric(xxt_prediction['target'], downcast='float')

# Attach the predictions to the test rows by position in the shared index.
result = test_data.merge(xxt_prediction, left_index=True, right_index=True)

result[['id', 'target']].head()

Unnamed: 0,id,target
0,300000,0.0
1,300001,1.0
2,300002,0.0
3,300003,0.0
4,300004,1.0


In [115]:
# Write only the id/target pair to the submission file, without the index column.
result[['id','target']].to_csv("submission.csv", index=False)