# Simple models

# Loading the data

In [1]:
import pandas as pd
# Load both data splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Show (rows, columns) of the training frame as the cell output.
train.shape

(100000, 52)

In [2]:
# Predictors: every training column except `id` and the label column.
x_train = train.drop(['id', 'target'], axis=1)
# Label the models are fitted against.
y_train = train.loc[:, 'target']
# Only `id` is removed from the test frame.
x_test = test.drop('id', axis=1)

In [3]:
from sklearn.model_selection import KFold
# Fix the shuffle seed: with shuffle=True and no random_state, each
# cross_validate call draws different folds, so scores are neither reproducible
# nor comparable between models. A fixed seed gives every model the same 5 splits.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Submit

In [4]:
def submit(model, filename):
    """Fit ``model`` on the full training set and write a submission CSV.

    Uses the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``test``
    frames. The written file has an ``id`` column followed by one probability
    column per entry of ``model.classes_``; the DataFrame index is not written.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict_proba`` and ``classes_``.
    filename
        Path of the CSV file to write.
    """
    model.fit(x_train, y_train)
    test_pred = model.predict_proba(x_test)
    submission = pd.DataFrame(test_pred, columns=model.classes_)
    # Insert the ids by position. `submission` has a fresh 0..n-1 index, so
    # passing the Series itself would align on index labels and produce NaN ids
    # whenever `test` carries any other index.
    submission.insert(0, 'id', test['id'].to_numpy())
    submission.to_csv(filename, index=False)

# LightGBM model

In [19]:
from lightgbm.sklearn import LGBMClassifier
# LightGBM classifier with library-default hyper-parameters.
# NOTE(review): n_jobs=-2 presumably means "all cores but one" (scikit-learn
# convention) — confirm for the installed LightGBM version.
lgbm = LGBMClassifier(n_jobs=-2)

In [20]:
from sklearn.model_selection import cross_validate
# Cross-validate the default LightGBM model on the `kfold` splits, scored by
# negative log-loss; train-fold scores are also returned for comparison.
scores = cross_validate(
    lgbm,
    X=x_train,
    y=y_train,
    scoring='neg_log_loss',
    cv=kfold,
    return_train_score=True,
)

In [21]:
# Display the per-fold fit/score times and train/test scores from cross_validate.
scores

{'fit_time': array([4.71974301, 6.03473926, 5.37232757, 5.76984   , 6.35233402]),
 'score_time': array([0.29277897, 0.3637979 , 0.36307979, 0.43104625, 0.42544055]),
 'test_score': array([-1.09765585, -1.09592907, -1.09481363, -1.09219572, -1.09732808]),
 'train_score': array([-1.03985489, -1.04039264, -1.0398559 , -1.04090371, -1.0399794 ])}

In [22]:
# Mean held-out score over the folds (negative log-loss, so closer to 0 is better).
scores['test_score'].mean()

-1.0955844697774881

In [23]:
# Refit the default LightGBM model on all training rows and write its
# test-set class probabilities to 'lgbm_default_nocat.csv'.
submit(lgbm, 'lgbm_default_nocat.csv')

# Default CatBoost

In [24]:
from catboost import CatBoostClassifier
# CatBoost classifier with otherwise default settings: three worker threads,
# and silent=True to keep training logs out of the notebook.
cbc = CatBoostClassifier(
    silent=True,
    thread_count=3,
)

In [25]:
from sklearn.model_selection import cross_validate
# Cross-validate the default CatBoost model on the `kfold` splits, scored by
# negative log-loss; train-fold scores are also returned for comparison.
scores = cross_validate(
    cbc,
    X=x_train,
    y=y_train,
    scoring='neg_log_loss',
    cv=kfold,
    return_train_score=True,
)

In [26]:
# Display the per-fold fit/score times and train/test scores from cross_validate.
scores

{'fit_time': array([ 58.86771584,  65.66978788,  67.56323862,  72.43018174,
        107.42154026]),
 'score_time': array([0.36634088, 0.1797657 , 0.17595959, 0.31515527, 0.26018238]),
 'test_score': array([-1.09425944, -1.09578639, -1.08726956, -1.10238732, -1.09471942]),
 'train_score': array([-1.01005738, -1.00937368, -1.01047957, -1.00859205, -1.01059842])}

In [27]:
# Mean held-out score over the folds (negative log-loss, so closer to 0 is better).
scores['test_score'].mean()

-1.094884427783186

In [28]:
# Refit the default CatBoost model on all training rows and write its
# test-set class probabilities to 'cbc_default_nocat.csv'.
submit(cbc, 'cbc_default_nocat.csv')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, using all available cores.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without it the forest and its scores differ on every run.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
from sklearn.model_selection import cross_validate
# Cross-validate the random forest on the `kfold` splits, scored by negative
# log-loss; train-fold scores are also returned for comparison.
scores = cross_validate(
    rfc,
    X=x_train,
    y=y_train,
    scoring='neg_log_loss',
    cv=kfold,
    return_train_score=True,
)

In [None]:
# Display the per-fold fit/score times and train/test scores from cross_validate.
scores

In [None]:
# Mean held-out score over the folds (negative log-loss, so closer to 0 is better).
scores['test_score'].mean()

Worse than baseline!

# LightGBM with all variables categorical

In [10]:
# Every feature column is passed to CategoricalTransform as categorical.
cat_cols = x_train.columns

In [11]:
from categorical_transform import CategoricalTransform
from lightgbm.sklearn import LGBMClassifier
# Fresh default LightGBM estimator to use as the final step of the categorical pipeline.
# NOTE(review): n_jobs=-2 presumably means "all cores but one" (scikit-learn
# convention) — confirm for the installed LightGBM version.
lgbm = LGBMClassifier(n_jobs=-2)

In [12]:
from sklearn.pipeline import Pipeline
# Two-step pipeline: the project-local CategoricalTransform runs on `cat_cols`,
# then LightGBM is fitted on its output.
# NOTE(review): presumably the transform casts those columns to a categorical
# dtype — verify in categorical_transform.py.
lightgbm_pipe = Pipeline(steps=[
    ('cat', CategoricalTransform(cat_cols=cat_cols)),
    ('lgbm', lgbm),
])

In [14]:
from sklearn.model_selection import cross_validate
# Cross-validate the categorical LightGBM pipeline on the `kfold` splits, scored
# by negative log-loss; train-fold scores are also returned for comparison.
scores = cross_validate(
    lightgbm_pipe,
    X=x_train,
    y=y_train,
    scoring='neg_log_loss',
    cv=kfold,
    return_train_score=True,
)

In [15]:
# Display the per-fold fit/score times and train/test scores from cross_validate.
scores

{'fit_time': array([5.95164013, 5.67020464, 5.68722725, 6.24658966, 6.88659215]),
 'score_time': array([0.66452026, 0.66873789, 0.74573088, 0.73980093, 0.77645874]),
 'test_score': array([-1.09508857, -1.09097307, -1.09722968, -1.08958342, -1.10916769]),
 'train_score': array([-1.01818553, -1.0196123 , -1.01780299, -1.01861981, -1.01460178])}

In [18]:
# Mean held-out score over the folds (negative log-loss, so closer to 0 is better).
scores['test_score'].mean()

-1.0964084875116744

# CatBoost with all variables categorical

In [36]:
# All feature column names as a plain Python list; it is handed to CatBoost
# through its `cat_features` argument.
cat_cols = x_train.columns.tolist()

In [37]:
from min_zero_transform import MinZeroTransform
# CatBoost classifier that treats every column named in `cat_cols` as
# categorical; three worker threads, training logs silenced.
cbc = CatBoostClassifier(
    cat_features=cat_cols,
    silent=True,
    thread_count=3,
)

In [38]:
from sklearn.pipeline import Pipeline
# Two-step pipeline: the project-local MinZeroTransform runs first, then
# CatBoost is fitted on its output.
# NOTE(review): presumably the transform shifts values so each column's minimum
# is zero — verify in min_zero_transform.py.
catboost_pipe = Pipeline(steps=[
    ('cat', MinZeroTransform()),
    ('cbc', cbc),
])

In [None]:
from sklearn.model_selection import cross_validate
# Cross-validate the categorical CatBoost pipeline on the `kfold` splits, scored
# by negative log-loss; train-fold scores are also returned for comparison.
scores = cross_validate(
    catboost_pipe,
    X=x_train,
    y=y_train,
    scoring='neg_log_loss',
    cv=kfold,
    return_train_score=True,
)

In [None]:
# Display the per-fold fit/score times and train/test scores from cross_validate.
scores

In [None]:
# Mean held-out score over the folds (negative log-loss, so closer to 0 is better).
scores['test_score'].mean()