In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping


Using TensorFlow backend.


In [2]:
# Read the training and test tables from their CSV files.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
]

In [3]:
# Stack train and test into one frame so each encoder below is fitted on the
# values of both. `sort=False` is passed explicitly: pandas otherwise emits a
# FutureWarning when the concatenated frames' columns are not aligned, and
# the column order is irrelevant to the encodings applied here.
all_df = (
    pd.concat([train, test], sort=False, ignore_index=True)
    .drop(columns=['id', 'target'])
)

# Letter-coded binary flags -> 0/1.
all_df['bin_3'] = all_df['bin_3'].replace({"F": 0, "T": 1})
all_df['bin_4'] = all_df['bin_4'].replace({"N": 0, "Y": 1})

# Named levels -> integer ranks, in the order given by each mapping.
all_df['ord_1'] = all_df['ord_1'].replace({
    "Novice": 0,
    "Contributor": 1,
    "Expert": 2,
    "Master": 3,
    "Grandmaster": 4
})
all_df['ord_2'] = all_df['ord_2'].replace({
    "Freezing": 0,
    "Cold": 1,
    "Warm": 2,
    "Hot": 3,
    "Boiling Hot": 4,
    "Lava Hot": 5
})

# LabelEncoder assigns integer codes following the sorted order of the
# distinct values in each column.
all_df['ord_3'] = LabelEncoder().fit_transform(all_df['ord_3'])
all_df['ord_4'] = LabelEncoder().fit_transform(all_df['ord_4'])

# Split 'ord_5' by character: the first character is label-encoded into
# 'ord_5a'; the second character is stored unencoded in 'ord_5b'.
all_df['ord_5a'] = all_df['ord_5'].str[0]
all_df['ord_5a'] = LabelEncoder().fit_transform(all_df['ord_5a'])

all_df['ord_5b'] = all_df['ord_5'].str[1]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
# Columns that are expanded into one-hot indicator columns.
ohe_cols = (
    ['day', 'month']
    + ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
       'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    + ['ord_5', 'ord_5b']
)

# Fit the encoder on the selected columns of the combined frame and transform
# them in one step.
ohc = OneHotEncoder()
ohc = ohc.fit_transform(all_df[ohe_cols])

In [5]:
# Every column that is neither one-hot encoded nor the excluded 'bin_0' is
# carried over with its current values and placed in front of the one-hot block.
passthrough_cols = all_df.columns.difference(ohe_cols + ['bin_0'])
passthrough_block = csr_matrix(all_df[passthrough_cols].values)

# Join both blocks column-wise and convert to CSR so rows can be sliced.
full_sparse_df = hstack([passthrough_block, ohc])
full_sparse_df = full_sparse_df.tocsr()

In [6]:
# The combined matrix was built from train rows followed by test rows, so the
# boundary between them is the number of training rows. Deriving it from
# `train` avoids a hard-coded row count that silently mis-splits the data if
# the input files change.
n_train = len(train)

X_train = full_sparse_df[:n_train]
y_train = train['target']
X_test = full_sparse_df[n_train:]

# Sanity checks: features and labels/ids must line up row for row.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(test)

In [8]:
# 5-fold stratified cross-validation of the logistic-regression model; the
# ROC AUC on the held-out fold is printed for every split.
skf = StratifiedKFold(5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X_train, y_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_dev, X_val = X_train[train_index], X_train[test_index]
    # `split` yields positional row indices, so the target Series must be
    # indexed by position (.iloc); plain [] does label lookup and is only
    # correct when the Series happens to carry a default 0..n-1 index.
    y_dev, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = LogisticRegression(solver='lbfgs', max_iter=5000, C=0.1, random_state=42).fit(X_dev, y_dev)
    val_pred = model.predict_proba(X_val)
    # Column 1 of predict_proba is the probability of the positive class.
    print('ROCAUC score: {}'.format(roc_auc_score(y_val, val_pred[:,1])))

TRAIN: [     0      1      2 ... 299995 299997 299998] TEST: [     3      5      7 ... 299994 299996 299999]
ROCAUC score: 0.8046976605482206
TRAIN: [     1      2      3 ... 299996 299997 299999] TEST: [     0     14     28 ... 299975 299985 299998]
ROCAUC score: 0.8026895024751906
TRAIN: [     0      2      3 ... 299997 299998 299999] TEST: [     1      4     26 ... 299988 299991 299995]
ROCAUC score: 0.8016492765133283
TRAIN: [     0      1      3 ... 299996 299998 299999] TEST: [     2      8      9 ... 299981 299986 299997]
ROCAUC score: 0.8064780716965907
TRAIN: [     0      1      2 ... 299997 299998 299999] TEST: [     6     11     12 ... 299987 299989 299992]
ROCAUC score: 0.8009751487619332


In [9]:
# Final model: same hyper-parameters as used in cross-validation, fitted on
# the complete training matrix.
lr_model = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    max_iter=5000,
    random_state=42,
)
# `fit` returns the estimator itself; assigning the result keeps the cell
# from displaying the model repr.
lr_model = lr_model.fit(X_train, y_train)

In [10]:
# Class probabilities for the test rows; column 1 is the positive class.
test_proba = lr_model.predict_proba(X_test)

# Submission frame: the test ids paired with the positive-class probability.
pred_test = pd.DataFrame({
    'id': test['id'],
    'target': test_proba[:, 1],
})

In [12]:
# public score 0.80811
# Write the predictions without the DataFrame index column.
submission_path = '../submissions/lr-oh-od-no0-submission.csv'
pred_test.to_csv(submission_path, index=False)