In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

import pandas_profiling as pdp

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# All competition input files live in one folder on the mounted Google Drive.
INPUT_DIR = '/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input'

train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
# sample_submit.csv has no header line: its first line is already a data row
# (test_00000, ...). Reading it with the default header would silently turn
# that row into column names, so it is read with header=None.
sub = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submit.csv'), header=None)

In [None]:
class cfg:
  """Run-wide settings read by the cross-validation / training cells."""

  # Seed handed to the fold splitter and to LightGBM (`random_state`).
  seed = 123

  # Number of stratified cross-validation folds.
  n_splits = 5

  # Boosting rounds trained per fold (`num_boost_round`).
  num_round = 100

In [None]:
# LightGBM parameters shared by every fold's lgb.train call.
#
# The number of boosting rounds is deliberately NOT set here. It is controlled
# by cfg.num_round through lgb.train(num_boost_round=...). A misspelled
# 'm_estimators' entry is not a LightGBM parameter and has no effect, while a
# correctly spelled 'n_estimators' entry would take precedence over
# num_boost_round, so neither belongs in this dict.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'random_state': cfg.seed,
    # NOTE(review): 'importance_type' is an argument of the scikit-learn wrapper
    # classes; confirm whether lgb.train makes any use of it.
    'importance_type': 'gain',
}

In [None]:
# Stratified K-fold cross-validation of a LightGBM binary classifier.
# Produces: x_train / y_train / id_train, categorical_features, cv (fold indices),
# metrics (array of [fold, train F1, valid F1]), imp (per-fold gain importance)
# and `model`, which after the loop is the booster trained on the LAST fold.

# .copy() makes x_train an independent frame, so the in-place encoding below
# does not assign into a slice of `train` (and `train` itself stays untouched).
x_train = train[['goal', 'country', 'duration', 'category1', 'category2']].copy()
y_train = train['state']
id_train = train['id']
categorical_features = ['goal', 'country', 'category1', 'category2']

cv = list(StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed).split(x_train, y_train))

metrics = []
imp_per_fold = []

# Integer-encode every categorical column and store it as pandas `category`
# dtype so LightGBM treats it as a categorical feature.
for column in categorical_features:
  le = LabelEncoder()
  label_encoded_column = le.fit_transform(x_train[column])
  # Build the Series on x_train's own index: column assignment aligns on index
  # labels, so a fresh 0..n-1 index is only correct for a default RangeIndex.
  x_train[column] = pd.Series(label_encoded_column, index=x_train.index).astype('category')

for nfold, (idx_tr, idx_va) in enumerate(cv):
  print('-'*20, nfold, '-'*20)
  # StratifiedKFold.split yields POSITIONAL indices, hence .iloc (not .loc).
  x_tr, y_tr = x_train.iloc[idx_tr], y_train.iloc[idx_tr]
  x_va, y_va = x_train.iloc[idx_va], y_train.iloc[idx_va]
  print(x_tr.shape, y_tr.shape)
  print(x_va.shape, y_va.shape)

  lgb_tr = lgb.Dataset(x_tr, y_tr)
  # Tie the validation set to the training set so both share the same binning.
  lgb_va = lgb.Dataset(x_va, y_va, reference=lgb_tr)

  model = lgb.train(params, lgb_tr, num_boost_round=cfg.num_round,
                    valid_names=['train', 'valid'],
                    valid_sets=[lgb_tr, lgb_va])

  # Predicted probabilities are turned into hard 0/1 labels at a 0.5 threshold.
  y_tr_pred = np.where(model.predict(x_tr) > 0.5, 1, 0)
  y_va_pred = np.where(model.predict(x_va) > 0.5, 1, 0)
  f1_tr = f1_score(y_tr, y_tr_pred)
  f1_va = f1_score(y_va, y_va_pred)
  print('[f1 score] tr:{:.2f}, va:{:.2f}'.format(f1_tr, f1_va))
  metrics.append([nfold, f1_tr, f1_va])

  # lgb.train returns a Booster, which exposes feature_importance() rather than
  # the scikit-learn wrapper's `feature_importances_` attribute.
  imp_per_fold.append(pd.DataFrame({'col': x_train.columns,
                                    'imp': model.feature_importance(importance_type='gain'),
                                    'nfold': nfold}))

# Concatenate once instead of growing a DataFrame inside the loop.
imp = pd.concat(imp_per_fold, axis=0, ignore_index=True)

print('='*20, 'result', '='*20)
metrics = np.array(metrics)
print(metrics)

-------------------- 0 --------------------
(7832, 5) (7832,)
(1959, 5) (1959,)
[1]	train's binary_logloss: 0.661627	valid's binary_logloss: 0.662542
[2]	train's binary_logloss: 0.63548	valid's binary_logloss: 0.637984
[3]	train's binary_logloss: 0.613162	valid's binary_logloss: 0.616619
[4]	train's binary_logloss: 0.593663	valid's binary_logloss: 0.598985
[5]	train's binary_logloss: 0.577127	valid's binary_logloss: 0.583821
[6]	train's binary_logloss: 0.562677	valid's binary_logloss: 0.570152
[7]	train's binary_logloss: 0.549847	valid's binary_logloss: 0.558687
[8]	train's binary_logloss: 0.538632	valid's binary_logloss: 0.549084
[9]	train's binary_logloss: 0.528585	valid's binary_logloss: 0.540962
[10]	train's binary_logloss: 0.519945	valid's binary_logloss: 0.533629
[11]	train's binary_logloss: 0.512085	valid's binary_logloss: 0.526808
[12]	train's binary_logloss: 0.505135	valid's binary_logloss: 0.521006
[13]	train's binary_logloss: 0.499115	valid's binary_logloss: 0.516195
[14]	tr

In [None]:
# Load the sample submission with header=None so that every line of the file,
# including the first one, is kept as a data row; then display the frame.
sub = pd.read_csv(
    '/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/sample_submit.csv',
    header=None,
)
sub

Unnamed: 0,0,1
0,test_00000,1
1,test_00001,0
2,test_00002,0
3,test_00003,0
4,test_00004,1
...,...,...
9795,test_09795,0
9796,test_09796,0
9797,test_09797,1
9798,test_09798,0


In [None]:
# Encode the test features the same way as the training features, predict with
# `model`, and write the submission file (id, predicted label; no header/index).
# NOTE(review): `model` here is whatever the training loop left bound, i.e. the
# booster of the last fold only - confirm that this is the intended predictor.

# .copy() so the column assignments below do not write into a slice of `test`.
x_test = test[['goal', 'country', 'duration', 'category1', 'category2']].copy()
id_test = test['id']

for column in categorical_features:
  # Fit the encoder on the TRAINING column: a label must get the same integer
  # code the model saw during training. An encoder fitted on the test column
  # assigns codes from the test set's own sorted labels, which differ as soon
  # as the two label sets are not identical.
  le = LabelEncoder()
  le.fit(train[column])
  # Position of each test label within the training classes; labels that never
  # occurred in training get code -1.
  codes = pd.Categorical(x_test[column], categories=le.classes_).codes
  # from_codes turns -1 into a missing value; categories 0..k-1 mirror the
  # integer categories produced for the training frame.
  x_test[column] = pd.Series(
      pd.Categorical.from_codes(codes, categories=np.arange(len(le.classes_))),
      index=x_test.index)

# Predicted probabilities are turned into hard 0/1 labels at a 0.5 threshold.
y_test_pred = model.predict(x_test)
y_test_pred = np.where(y_test_pred > 0.5, 1, 0)
print(id_test)
print(y_test_pred)
df_submit = pd.DataFrame({'id':id_test, 'pred':y_test_pred})
df_submit.to_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Output/submit_1.csv', index=False, header=False)

0       test_00000
1       test_00001
2       test_00002
3       test_00003
4       test_00004
           ...    
9795    test_09795
9796    test_09796
9797    test_09797
9798    test_09798
9799    test_09799
Name: id, Length: 9800, dtype: object
[1 1 1 ... 0 0 0]
