# Model

In [1]:
!ls

Analysis.ipynb        [31mtest.csv[m[m
[31msample_submission.csv[m[m [31mtrain.csv[m[m


In [281]:
import pandas as pd

df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [282]:
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [289]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin


nom_cols = [x for x in df.columns if 'nom' in x]
bin_cols = [x for x in df.columns if 'bin' in x]




# Create a custom Mixin 
def transform_df(X):
    X = X[X.columns]  #Create a copy ? 
    X['bin_3'] = X['bin_3'].map(lambda x : 1 if x=='T' else 0)
    X['bin_4'] = X['bin_4'].map(lambda x : 1 if x=='Y' else 0)    
    
    temps = {
        'Freezing': 0,
        'Cold': 1,
        'Warm': 2, 
        'Hot': 3,
        'Boiling Hot': 4,
        'Lava Hot': 5
    }
    
    X['ord_2'] = X['ord_2'].map(lambda x : temps[x])
    X['ord_3'] = X['ord_3'].map(lambda x : ord(x))
    X['ord_4'] = X['ord_4'].map(lambda x : ord(x))
    X['ord_5.1'] = X['ord_5'].map(lambda x : ord(x[0]))
    X['ord_5.2'] = X['ord_5'].map(lambda x : ord(x[1]))
    
    
    # decrease the cardinality of the nominal categories with high cardinality 
    vals_6 = X['nom_6'].value_counts().index[:10]
    X['nom_6'] = X['nom_6'].map(lambda x : x if x in vals_6 else 'other')
    vals_7 = X['nom_7'].value_counts().index[:10]
    X['nom_7'] = X['nom_7'].map(lambda x : x if x in vals_7 else 'other')
    vals_8 = X['nom_8'].value_counts().index[:10]
    X['nom_8'] = X['nom_8'].map(lambda x : x if x in vals_8 else 'other')
    vals_9 = X['nom_9'].value_counts().index[:10]
    X['nom_9'] = X['nom_9'].map(lambda x : x if x in vals_8 else 'other')
    
    # Dummy Encode the Categorical Values
    X_dummies = pd.get_dummies(X[nom_cols + ['ord_1']], drop_first=True)

    # Create the transformed DF
    res = X[bin_cols+['ord_0', 'ord_2', 'ord_3', 'ord_4', 'ord_5.1', 'ord_5.2']]
    res = pd.concat([res, X_dummies], axis=1)
    
    if('target' in X.columns):
        res['target'] = X['target']
    
    return res

    

a= transform_df(df)
a.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_2,ord_3,ord_4,ord_5.1,...,nom_8_c720f85ca,nom_8_d69adef8b,nom_8_db3352558,nom_8_fdcd0dada,nom_8_other,ord_1_Expert,ord_1_Grandmaster,ord_1_Master,ord_1_Novice,target
0,0,0,0,1,1,2,1,104,68,107,...,0,0,0,0,1,0,1,0,0,0
1,0,1,0,1,1,1,3,97,65,98,...,0,0,0,0,1,0,1,0,0,0
2,0,0,0,0,1,1,5,104,82,74,...,0,0,0,0,1,1,0,0,0,0
3,0,1,0,0,1,1,4,105,68,107,...,0,0,0,0,1,0,1,0,0,1
4,0,0,0,0,0,1,0,97,82,113,...,0,0,0,0,1,0,1,0,0,0


In [290]:
# Create Train-Test Split
from sklearn.model_selection import train_test_split

X = a.drop(columns='target')
y = a['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [309]:
# Train Model

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#model = RandomForestClassifier(random_state=0, n_estimators=100, verbose=2, max_depth=12, max_features='sqrt').fit(X_train, y_train)
model = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)



In [310]:
# Baseline. Our data has an imbalance towards 0's.
# If we guess 0 always, we end up with accuracy ~70%
base_acc = y_train.sum()/len(y_train)
print("Baseline model accuracy : {}".format(1-base_acc))

# Evaluate Model
y_pred = model.predict(X_test)

# Accuracy of our model
acc = (y_pred == y_test).sum()/len(y_test)
print("The resulting Accuracy is : {}".format(acc))

Baseline model accuracy : 0.6936069651741293
The resulting Accuracy is : 0.724020202020202


In [225]:
# Predict on the Test set

In [314]:
sub_df = pd.read_csv('data/test.csv')
id_values = sub_df['id']
sub_df = transform_df(sub_df)
sub_df = sub_df.reindex(columns = X_train.columns).fillna(value=0)

pred_values = model.predict(sub_df)

In [315]:
# Create and save submission DF
import numpy as np 

pred_df = pd.DataFrame({'id': id_values, 'target': pred_values})
pred_df.to_csv('data/submit.csv', index=False)