# Loading libraries and data

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [45]:
data = pd.read_csv('train.csv').sample(frac=1.0, replace=True, random_state=2)
data = data.reset_index(drop=True)


# Data characteristics

In [46]:
print("Shape: ", data.shape)
data.head()

Shape:  (300000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,100879,0,0,1,T,Y,Green,Circle,Lion,Russia,...,e1b66f5f2,1,Novice,Boiling Hot,c,B,ri,1,3,0
1,203245,0,0,0,F,Y,Blue,Trapezoid,Cat,Finland,...,506b8e5dd,1,Expert,Freezing,j,I,ZS,1,3,0
2,95816,0,0,1,F,Y,Blue,Square,Snake,India,...,602bcc56f,1,Novice,Freezing,g,F,tM,1,4,1
3,84434,0,0,0,T,N,Green,Triangle,Dog,China,...,97616a1ac,1,Grandmaster,Cold,l,Z,Uu,1,2,1
4,33867,0,0,0,F,Y,Red,Trapezoid,Snake,Russia,...,34058a17c,3,Master,Hot,a,Q,CM,4,1,0


In [47]:
data.nunique()

id        189775
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5        222
nom_6        522
nom_7       1219
nom_8       2211
nom_9      11849
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        192
day            7
month         12
target         2
dtype: int64

In [48]:
data.dtypes

id         int64
bin_0      int64
bin_1      int64
bin_2      int64
bin_3     object
bin_4     object
nom_0     object
nom_1     object
nom_2     object
nom_3     object
nom_4     object
nom_5     object
nom_6     object
nom_7     object
nom_8     object
nom_9     object
ord_0      int64
ord_1     object
ord_2     object
ord_3     object
ord_4     object
ord_5     object
day        int64
month      int64
target     int64
dtype: object

# Data preprocessing and cleaning

In [49]:
#columns that we can use directly
ready_columns = ['bin_0', 'bin_1', 'bin_2']

#categorical columns with string data
str_categorical_columns = ['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'ord_1', 'ord_2', 'ord_3', 'ord_4']

#categorical columns with integer data
int_categorical_columns = ['ord_0', 'day', 'month']

#columns wiht integer alues
real_columns = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

#column ord_5 is processed alone.

YY = data['target']

Creating a new dataframe for ML

In [50]:
#function to process the data

def make_good_data_frame(in_data, n_pca = 0):
    #ready dataframe RD
    RD = in_data[ready_columns].copy()

    #String categorical dataframe SCD
    SCD = pd.get_dummies(in_data[str_categorical_columns], drop_first=True) 

    #Integer categorical dataframe ICD
    ICD = pd.get_dummies(in_data[int_categorical_columns[0]], drop_first=True)
    for i in range(1, len(int_categorical_columns)):
        temp_df = pd.get_dummies(in_data[int_categorical_columns[i]], drop_first=True)
        ICD = pd.merge(ICD, temp_df, left_index=True, right_index=True)
        
    #real columns
    realD = pd.DataFrame()
    for c in real_columns:
        realD[c] = in_data[c].apply(lambda x : int(x, 16))
    #realD = (realD - realD.mean())/realD.std()
    realD = (realD - realD.min())/(realD.max() - realD.min())
    
    #ord_5
    ord_5_D = pd.DataFrame()
    ord_5_D['ord_5'] = in_data['ord_5'].apply(lambda x : ord(x[0]) + ord(x[1]))
    #ord_5_D = (ord_5_D - ord_5_D.mean())/ord_5_D.std()
    ord_5_D = (ord_5_D - ord_5_D.min())/(ord_5_D.max() - ord_5_D.min())

    result = pd.merge(RD, SCD, left_index=True, right_index=True)
    result = pd.merge(result, ICD, left_index=True, right_index=True)
    result = pd.merge(result, realD, left_index=True, right_index=True)
    result = pd.merge(result, ord_5_D, left_index=True, right_index=True)
    
    #PCA
    if n_pca == 0:
        return result
    
    pca = PCA(n_components=n_pca)
    principalComponents = pca.fit_transform(result)
    result = pd.DataFrame(principalComponents)
    
    return result

XX = make_good_data_frame(data, 3)
XX.head()

Unnamed: 0,0,1,2
0,-0.549101,-0.685457,-0.65343
1,0.16198,0.253001,0.562874
2,-0.620076,0.127584,-0.73681
3,0.761128,-0.562084,0.847519
4,0.184292,0.844665,-0.319058


New training and test dataframes

In [51]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(XX, YY, test_size = 0.2, random_state = 2)

# ML section

In [52]:
from sklearn.metrics import confusion_matrix

1) Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter = 1000)
LR_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
lr_z_train = LR_clf.predict(X_train)
print("LR score training = ", round(LR_clf.score(X_train, y_train),3))
confusion_matrix(y_train, lr_z_train)

LR score training =  0.695


array([[166904,      0],
       [ 73096,      0]])

In [55]:
lr_z_test = LR_clf.predict(X_test)
print("LR score test = ", round(LR_clf.score(X_test, y_test),3))
confusion_matrix(y_test, lr_z_test)

LR score test =  0.692


array([[41500,     0],
       [18500,     0]])

2) K Neighbors Classifier

In [56]:
from sklearn.neighbors import KNeighborsClassifier

k_clf = KNeighborsClassifier(n_neighbors=3)
k_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [57]:
k_z_train = k_clf.predict(X_train)
print("K-N score training = ", round(k_clf.score(X_train, y_train),3))
confusion_matrix(y_train, k_z_train)

K-N score training =  0.877


array([[154947,  11957],
       [ 17540,  55556]])

In [58]:
k_z_test = k_clf.predict(X_test)
print("K-N score test = ", round(k_clf.score(X_test, y_test),3))
confusion_matrix(y_test, k_z_test)

K-N score test =  0.727


array([[34368,  7132],
       [ 9254,  9246]])

3) Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [60]:
rf_z_train = rf_clf.predict(X_train)
print("RF score training = ", round(rf_clf.score(X_train, y_train),3))
confusion_matrix(y_train, rf_z_train)

RF score training =  0.706


array([[166367,    537],
       [ 69922,   3174]])

In [61]:
rf_z_test = rf_clf.predict(X_test)
print("RF score test = ", round(rf_clf.score(X_test, y_test),3))
confusion_matrix(y_test, rf_z_test)

RF score test =  0.697


array([[41219,   281],
       [17895,   605]])


4) Support vector machine

In [62]:
from sklearn import svm

svm_clf = svm.SVC(gamma='scale')
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
sv_z_train = svm_clf.predict(X_train)
print("SV score training = ", round(svm_clf.score(X_train, y_train),3))
confusion_matrix(y_train, sv_z_train)

In [35]:
sv_z_test = svm_clf.predict(X_test)
print("SV score test = ", round(svm_clf.score(X_test, y_test),3))
confusion_matrix(y_test, sv_z_test)

SV score test =  0.712


array([[402,  19],
       [154,  25]])

# Submission

In [63]:
test_data = pd.read_csv('test.csv')

XXt = make_good_data_frame(test_data, 3)

XXt.head()

Unnamed: 0,0,1,2
0,-0.558026,0.33695,-0.644392
1,0.451451,0.70001,-0.689224
2,0.251651,0.13349,-0.679956
3,0.379444,0.744108,-0.664869
4,0.948338,0.582886,-0.632522


In [42]:
xxt_prediction = svm_clf.predict(XXt)
xxt_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
xxt_prediction = pd.DataFrame(xxt_prediction)
xxt_prediction.columns = ['target']

xxt_prediction['target'] = pd.to_numeric(xxt_prediction['target'], downcast='float')

result = pd.merge(test_data, xxt_prediction, left_index=True, right_index=True)

result[['id','target']].head()

Unnamed: 0,id,target
0,300000,0.0
1,300001,0.0
2,300002,0.0
3,300003,0.0
4,300004,0.0


In [31]:
result[['id','target']].to_csv("submission.csv", index=False)