In [None]:
import datetime

import pandas as pd
import numpy as np

np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import f1_score,mean_squared_error

import lightgbm as lgb

from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin

from tqdm import tqdm

In [None]:
# Training frames.
# NOTE(review): the variable names and the file names look crossed here --
# `train_df` is read from the '..._blanced' file while `train_df_balanced` is
# read from 'train_df_final.csv'. Confirm this is intentional before relying on
# either name.
train_df = pd.read_csv('train_df_final_blanced.csv')
train_df_balanced = pd.read_csv('train_df_final.csv')

# Test features to predict on, and the submission template that receives the
# predictions.
test_df = pd.read_csv('test_df_final.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Replace every missing value with 0 in the three frames used for modelling.
train_df, train_df_balanced, test_df = (
    frame.fillna(0) for frame in (train_df, train_df_balanced, test_df)
)

In [None]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function reporting micro-averaged F1.

    Parameters
    ----------
    y_hat : array-like
        Raw model outputs for the evaluated rows; they are rounded to the
        nearest integer before scoring.
    data : object exposing ``get_label()``
        Dataset being evaluated; its labels are used as the ground truth.

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, metric value, and a flag
        telling LightGBM that higher values are better.

    Notes
    -----
    With plain 0/1 labels, micro-averaged F1 is numerically the same as
    accuracy.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, not probabilities.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions, average='micro')
    return 'f1', score, True

# Classifier

In [None]:
# The model inputs are exactly the columns present in the test frame.
features = test_df.columns

# Feature matrices, all restricted to that same column set.
df_train_X, df_train_balanced_X, df_test_X = (
    frame[features] for frame in (train_df, train_df_balanced, test_df)
)

# Targets taken from the 'label' column of each training frame.
y, y_balanced = (
    frame['label'].values for frame in (train_df, train_df_balanced)
)

In [None]:
# LightGBM training parameters passed to every `lgb.train` call.
params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'seed': 0,
    # 'f1' is not one of LightGBM's built-in metrics. The string 'None' tells
    # LightGBM to register no built-in metric, so evaluation (and early
    # stopping) is driven only by the custom function supplied via `feval`.
    'metric': 'None',
}

In [None]:
# Stratified 3-fold cross-validation: one LightGBM model is trained per fold.
# Each model is scored on its held-out fold and on the whole
# `train_df_balanced` frame, and its rounded test-set predictions are summed
# into `preds`.
validation_scores, balanced_scores, models = [], [], []
folds = 3
skf = StratifiedKFold(n_splits=folds)
preds = np.zeros(len(df_test_X))  # running sum of rounded per-fold predictions

for train_index, test_index in skf.split(df_train_X, y):
    X_train = df_train_X.iloc[train_index, :]
    X_test = df_train_X.iloc[test_index, :]
    y_train = y[train_index]
    y_test = y[test_index]

    train = lgb.Dataset(X_train, y_train)
    valid = lgb.Dataset(X_test, y_test)
    evals_result = {}  # handed to lgb.train; not read again in this cell

    # NOTE(review): the `early_stopping_rounds`, `evals_result` and
    # `verbose_eval` keyword arguments were removed from `lgb.train` in
    # LightGBM 4.0 (replaced by callbacks) -- confirm the installed version.
    model = lgb.train(
        params,
        train,
        num_boost_round=1000,
        early_stopping_rounds=50,
        valid_sets=valid,
        feval=lgb_f1_score,
        evals_result=evals_result,
        verbose_eval=True,
    )

    # Score on the held-out fold.
    fold_valid_labels = np.round(model.predict(X_test))
    validation_scores.append(f1_score(y_test, fold_valid_labels, average='micro'))

    # Score on the complete `train_df_balanced` frame.
    fold_balanced_labels = np.round(model.predict(df_train_balanced_X))
    balanced_scores.append(f1_score(y_balanced, fold_balanced_labels, average='micro'))

    models.append(model)

    # Add this fold's rounded test predictions to the running total.
    preds += np.round(model.predict(df_test_X))

In [None]:
# Mean, standard deviation and minimum of the per-fold validation scores.
tuple(summarise(validation_scores) for summarise in (np.mean, np.std, np.min))

In [None]:
# Mean, standard deviation and minimum of the per-fold scores on `train_df_balanced`.
tuple(summarise(balanced_scores) for summarise in (np.mean, np.std, np.min))

In [None]:
# Keep the fold model with the highest validation score (first one on ties).
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]

# Draw that model's feature importances on a 40x60 inch figure.
fig, ax = plt.subplots(figsize=(40, 60))
lgb.plot_importance(best_model, ax=ax)

In [None]:
# `preds` is the sum of the rounded test-set predictions of every fold model.
# A row is labelled 1 only when that sum reaches the number of fold models,
# i.e. when all of them voted 1. With the 3 folds used in this notebook this is
# the same cut-off as the previous hard-coded `preds > 2.0`, but it no longer
# goes stale when the number of folds is changed.
# NOTE(review): if a simple majority vote was intended instead, use
# `preds > len(models) / 2`.
required_votes = len(models)
submission_df['prediction'] = np.where(preds >= required_votes, 1, 0)

# Fraction of test rows predicted as 1 (shown as the cell output).
submission_df['prediction'].sum() / submission_df.shape[0]

In [None]:
# Save the submission frame as CSV, leaving out the row index.
submission_df.to_csv(path_or_buf='submission.csv', index=False)