In [39]:
import random
import pandas as pd
import numpy as np
import tensorflow as tf

from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression

# Seed Python's built-in `random` module so the `random.choice` draws made
# later in this notebook are repeatable on a fresh Restart & Run All.
# This does not seed NumPy; the scikit-learn objects below are given their
# own explicit `random_state`.
SEED = 42
random.seed(SEED)

In [40]:
# Read the two input tables. `train` supplies the `target` column used for
# fitting; `test` supplies the `id` column used when building the submission.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [41]:
# Stack train on top of test so that every encoding below is fitted on the
# union of both splits, then drop the identifier and label columns and give
# the result a fresh 0..n-1 index.
#
# The two frames' columns are not aligned (presumably because `test` has no
# `target` column -- confirm), so a bare `pd.concat` emits a FutureWarning
# about column sorting. Passing sort=False opts into the future behaviour and
# silences it. The later cells pick columns by name (`ohe_cols`,
# `columns.difference(...)`), so the column order of `all_df` is irrelevant.
all_df = (
    pd.concat([train, test], sort=False)
    .drop(['id', 'target'], axis=1)
    .reset_index(drop=True)
)

# Explicit value -> integer tables for the string-coded binary flags and the
# two ordinal columns whose levels have a natural order. Applied in the order
# listed; values not present in a table are left untouched by `replace`.
value_codes = [
    ('bin_3', {"F": 0, "T": 1}),
    ('bin_4', {"N": 0, "Y": 1}),
    ('ord_1', {
        "Novice": 0,
        "Contributor": 1,
        "Expert": 2,
        "Master": 3,
        "Grandmaster": 4,
    }),
    ('ord_2', {
        "Freezing": 0,
        "Cold": 1,
        "Warm": 2,
        "Hot": 3,
        "Boiling Hot": 4,
        "Lava Hot": 5,
    }),
]
for column, codes in value_codes:
    all_df[column] = all_df[column].replace(codes)

# ord_3 / ord_4 get integer codes from LabelEncoder (codes follow the sorted
# order of the distinct values).
for column in ('ord_3', 'ord_4'):
    all_df[column] = LabelEncoder().fit_transform(all_df[column])

# Fold four specific ord_5 values into one shared 'GG' bucket before any
# further processing (presumably rare values -- confirm).
merged_ord_5 = dict.fromkeys(['Cl', 'Kf', 'eg', 'xP'], 'GG')
all_df['ord_5'] = all_df['ord_5'].replace(merged_ord_5)

# Derive two extra features from the first and second character of ord_5.
# This must happen before ord_5 itself is replaced by integer codes below.
all_df['ord_5a'] = all_df['ord_5'].str.get(0)
all_df['ord_5b'] = all_df['ord_5'].str.get(1)

# ord_5a and ord_5 become integer codes; ord_5b is left as raw characters
# here (it is listed in `ohe_cols` further down).
for column in ('ord_5a', 'ord_5'):
    all_df[column] = LabelEncoder().fit_transform(all_df[column])

# Mean training target for every nom_9 category.
nom_9_df = train.groupby('nom_9')['target'].mean().reset_index()
nom_9_df.columns = ['nom_9', 'mean_target']

# Categories whose training rows are all-positive (mean 1) or all-negative
# (mean 0); these are the ones that get re-assigned below.
to_rep = nom_9_df.loc[
    (nom_9_df['mean_target'] == 1) | (nom_9_df['mean_target'] == 0),
    'nom_9',
].tolist()

# Pool of replacement categories (hand-picked; the selection criteria are not
# recorded in this notebook).
target_rep = ['ff4ccc205', '1a25c6368', 'e35015401', 'c0b71d6ef', 'b958a2001', 'eeaf55dc2', '55bc7f2fd']

# Draw each replacement from a generator seeded right here rather than from
# the global `random` state. On a fresh run this yields the same draws as
# `random.seed(42)` followed by `random.choice`, but re-running this cell on
# its own now reproduces the same mapping instead of continuing the global
# random stream.
rng = random.Random(42)
rep_map = {category: rng.choice(target_rep) for category in to_rep}

# Rewrite only the affected rows; the membership mask is computed once.
needs_rep = all_df['nom_9'].isin(to_rep)
all_df.loc[needs_rep, 'nom_9'] = all_df.loc[needs_rep, 'nom_9'].map(rep_map)
    
# to_drop = nom_9_df.loc[nom_9_df['mean_target'] == 0.5, 'nom_9'].sample(frac=0.5, random_state=42).tolist()
# all_train_df = all_df.iloc[:300000].loc[~all_df['nom_9'].isin(to_drop)]
# all_test_df = all_df.iloc[300000:]
# all_df = pd.concat([all_train_df, all_test_df])

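FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.
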
  """Entry point for launching an IPython kernel.


In [42]:
# Columns that are expanded into one indicator column per distinct value.
ohe_cols = (
    ['day', 'month']
    + ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
       'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    + ['ord_5b']
)

# Fit the encoder on the stacked train+test frame and keep the encoded
# matrix in `ohc` for the feature-assembly cell.
one_hot = OneHotEncoder()
ohc = one_hot.fit_transform(all_df[ohe_cols])

In [43]:
# Assemble the final feature matrix: every column that is neither one-hot
# encoded nor the excluded `bin_0` is passed through as-is, followed by the
# one-hot block. `Index.difference` returns the remaining names sorted.
passthrough_cols = all_df.columns.difference(ohe_cols + ['bin_0'])
passthrough_block = csr_matrix(all_df[passthrough_cols].values)

# Convert to CSR so the matrix can be row-sliced into train/test afterwards.
full_sparse_df = hstack([passthrough_block, ohc]).tocsr()

In [47]:
# Split the stacked feature matrix back into its train and test parts.
# The boundary is derived from the frames themselves instead of a hard-coded
# row count, so the cell keeps working if the input files change size.
n_train = len(train)
assert full_sparse_df.shape[0] == n_train + len(test), \
    "feature matrix rows do not match len(train) + len(test)"

X_train = full_sparse_df[:n_train]   # rows that came from `train`
y_train = train['target']            # labels, same row order as X_train
X_test = full_sparse_df[n_train:]    # rows that came from `test`

In [51]:
# Stratified K-fold cross-validation of the logistic-regression model.
# Prints the ROC AUC of each fold and, at the end, the mean over all folds.
N_SPLITS = 5
skf = StratifiedKFold(N_SPLITS, shuffle=True, random_state=42)

fold_scores = []
for train_index, test_index in skf.split(X_train, y_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_dev, X_val = X_train[train_index], X_train[test_index]
    # split() yields positional row numbers, so index the target by position
    # (.iloc) rather than by label; the two only coincide for a default index.
    y_dev, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = LogisticRegression(solver='lbfgs', max_iter=5000, C=0.11, random_state=42).fit(X_dev, y_dev)
    val_pred = model.predict_proba(X_val)
    # Column 1 of predict_proba is the probability of the second class.
    val_score = roc_auc_score(y_val, val_pred[:, 1])
    print('ROCAUC score: {}'.format(val_score))
    fold_scores.append(val_score)

# `score` is kept as the running total of the fold scores, as before.
score = sum(fold_scores)

print("===================")
# Average over the folds actually evaluated instead of a hard-coded 5.
print('ROCAUC score: {}'.format(score / len(fold_scores)))

TRAIN: [     0      1      2 ... 299995 299997 299998] TEST: [     3      5      7 ... 299994 299996 299999]
ROCAUC score: 0.805894586244043
TRAIN: [     1      2      3 ... 299996 299997 299999] TEST: [     0     14     28 ... 299975 299985 299998]
ROCAUC score: 0.8037606934148611
TRAIN: [     0      2      3 ... 299997 299998 299999] TEST: [     1      4     26 ... 299988 299991 299995]
ROCAUC score: 0.8028039674436462
TRAIN: [     0      1      3 ... 299996 299998 299999] TEST: [     2      8      9 ... 299981 299986 299997]
ROCAUC score: 0.8070600793088858
TRAIN: [     0      1      2 ... 299997 299998 299999] TEST: [     6     11     12 ... 299987 299989 299992]
ROCAUC score: 0.8022060475316799
ROCAUC score: 0.8043450747886232


In [52]:
# Fit the final model on all training rows, using the same settings as the
# cross-validation loop.
final_lr_params = dict(solver='lbfgs', max_iter=5000, C=0.11, random_state=42)
lr_model = LogisticRegression(**final_lr_params).fit(X_train, y_train)

In [53]:
# Score the test rows: keep column 1 of predict_proba (the probability of
# the second class) and pair it with the test ids in submission layout.
test_proba = lr_model.predict_proba(X_test)[:, 1]
pred_test = pd.DataFrame({'id': test['id'], 'target': test_proba})

In [54]:
# public score 0.80754
# Write the single-model submission (no index column).
lr_submission_path = '../submissions/lr-oh-od-no0-c11-ord5-submission.csv'
pred_test.to_csv(lr_submission_path, index=False)

In [55]:
# blend with oh my cat and glmnet -> 0.80824
# Load the two previously written submissions that are averaged with this
# notebook's predictions in the next cell.
omc, glmnet = (
    pd.read_csv(submission_csv)
    for submission_csv in (
        '../submissions/submission-oh-my-cat.csv',
        '../submissions/submission_glmnet.csv',
    )
)

In [56]:
# Equal-weight average of the three submissions' `target` predictions.
#
# Each submission is aligned to `pred_test`'s ids explicitly instead of
# adding the frames row by row: positional addition silently mixes up
# predictions if any file is ordered differently, and it also sums and
# divides the `id` column itself.
blend_ids = pred_test['id']
aligned_targets = []
for submission in (glmnet, omc, pred_test):
    # reindex raises on duplicate ids and yields NaN for ids that are absent.
    target_by_id = submission.set_index('id')['target'].reindex(blend_ids)
    if target_by_id.isna().any():
        raise ValueError("a submission is missing predictions for some test ids")
    aligned_targets.append(target_by_id.values)

blend_out = pd.DataFrame({
    'id': blend_ids.values,
    'target': sum(aligned_targets) / len(aligned_targets),
})
# Keep ids as integers in the written file.
blend_out['id'] = blend_out['id'].astype(int)

In [57]:
# Write the blended submission (no index column).
blend_submission_path = '../submissions/submission-blend-lr-omc-glmnet-ord5.csv'
blend_out.to_csv(blend_submission_path, index=False)