In [118]:
import numpy as np
import pandas as pd
import os
import pickle
import gc
import re

import pandas_profiling as pdp

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [119]:
# All competition input files live in one Google Drive folder.
input_dir = '/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input'

train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
# sample_submit.csv has no header row (its first line is already a data row such as
# "test_00000,1"), so it must be read with header=None; otherwise that first row is
# consumed as column names and silently dropped from the frame.
sub = pd.read_csv(os.path.join(input_dir, 'sample_submit.csv'), header=None)

In [120]:
class cfg:
  """Run-wide settings read by the cross-validation and training cells."""
  n_splits = 5     # number of StratifiedKFold splits
  seed = 123       # random_state for the CV split and the LightGBM params
  num_round = 100  # num_boost_round handed to lgb.train

In [121]:
# LightGBM parameters handed to lgb.train.
# The number of boosting rounds is set by cfg.num_round (num_boost_round in lgb.train).
# The former 'm_estimators' entry was a misspelling that LightGBM does not recognise,
# so it had no effect and is removed. It is deliberately NOT replaced by
# 'n_estimators': that alias of the iteration count would take precedence over
# cfg.num_round and request 100000 rounds with no early stopping.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.02,
    'num_leaves': 16,
    'random_state': cfg.seed,
    'importance_type': 'gain',
}

In [122]:
def cleaning(texts):
  """Apply remove_tag to every element of `texts` and return the results as a list.

  The output list has one entry per input element, in the same order.
  """
  return [remove_tag(raw_text) for raw_text in texts]

def remove_tag(x):
  """Return the string `x` with every `<...>` span deleted.

  A span starts at '<' and ends at the next '>'; a '<' with no following '>' is
  left untouched.
  """
  return re.sub(r"<[^>]*?>", '', x)

In [123]:
# Load the saved prediction array (file name suggests out-of-fold predictions of the
# DeBERTa run -- TODO confirm) and wrap it in a DataFrame with default integer columns.
oof_pred_path = "/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Oututput/test-2-bert-deberta-base-epoch10/preds/oof_pred.npy"
oof_pred = np.load(oof_pred_path)
fold = pd.DataFrame(oof_pred)
print(fold.shape)
fold

(9791, 4)


Unnamed: 0,0,1,2,3
0,0.943848,0.938477,0.013741,0.010490
1,0.991699,0.664551,0.018234,0.015839
2,0.978027,0.987793,0.028488,0.036438
3,0.982910,0.987305,0.036285,0.023911
4,0.865234,0.995605,0.014061,0.009857
...,...,...,...,...
9786,0.966309,0.962402,0.012825,0.014229
9787,0.994629,0.718750,0.006851,0.008255
9788,0.990723,0.940918,0.057190,0.049683
9789,0.974609,0.989746,0.035065,0.042572


In [124]:
# df_text = train[['html_content']]
# df_text['html_content'] = cleaning(df_text['html_content'])
# print(df_text.shape)

# vec = CountVectorizer(min_df=200)

# vec.fit(df_text['html_content'])

# html_content = pd.DataFrame(vec.transform(df_text['html_content']).toarray(), columns=vec.get_feature_names())
# print(html_content.shape)
# html_content

In [125]:
# Tabular features, target and ids. .copy() makes x_train an independent frame, so the
# label encoding below does not write into a slice of `train`.
x_train = train[['goal', 'country', 'duration', 'category1', 'category2']].copy()
x_train.columns = ['goal1', 'country1', 'duration1', 'category1', 'category2']
y_train = train['state']
id_train = train['id']
categorical_features = ['goal1', 'country1', 'category1', 'category2']

# Materialise the stratified folds once; each entry is (train_positions, valid_positions).
cv = list(StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed).split(x_train, y_train))


metrics = []          # one [fold, train F1, valid F1] row per fold
imp = pd.DataFrame()  # not filled in this cell; kept in case a later cell relies on it

# Replace each categorical column by its integer label codes, stored as pandas
# 'category' dtype so LightGBM treats the column as categorical.
for column in categorical_features:
  target_column = x_train[column]
  le = LabelEncoder()
  le.fit(target_column)
  label_encoded_column = le.transform(target_column)
  # Carry x_train's own index so the assignment can never misalign rows.
  x_train[column] = pd.Series(label_encoded_column, index=x_train.index).astype('category')

# print(x_train.shape)
# x_train = pd.concat([x_train, html_content], axis=1)
print(x_train.shape)
# concat(axis=1) aligns on index labels; a length mismatch would silently pad with NaN.
assert len(fold) == len(x_train), 'fold predictions and train rows differ in length'
x_train = pd.concat([x_train, fold], axis=1)
print(x_train.shape)

for nfold in range(cfg.n_splits):
  print('-'*20, nfold, '-'*20)
  idx_tr, idx_va = cv[nfold]
  # StratifiedKFold yields positional indices, so select rows by position (.iloc)
  # instead of by label (.loc).
  x_tr, y_tr = x_train.iloc[idx_tr], y_train.iloc[idx_tr]
  x_va, y_va = x_train.iloc[idx_va], y_train.iloc[idx_va]
  print(x_tr.shape, y_tr.shape)
  print(x_va.shape, y_va.shape)

  lgb_tr = lgb.Dataset(x_tr, y_tr)
  lgb_va = lgb.Dataset(x_va, y_va)

  model = lgb.train(params, lgb_tr, num_boost_round=cfg.num_round,
                    valid_names=['train', 'valid'],
                    valid_sets=[lgb_tr, lgb_va])

  # Turn predicted probabilities into hard 0/1 labels at a 0.5 threshold.
  y_tr_pred = model.predict(x_tr)
  y_va_pred = model.predict(x_va)
  y_tr_pred = np.where(y_tr_pred > 0.5, 1, 0)
  y_va_pred = np.where(y_va_pred > 0.5, 1, 0)

  print(y_tr)
  print(y_tr_pred)
  f1_tr = f1_score(y_tr, y_tr_pred)
  f1_va = f1_score(y_va, y_va_pred)
  print('[f1 score] tr:{:.2f}, va:{:.2f}'.format(f1_tr, f1_va))
  metrics.append([nfold, f1_tr, f1_va])

# NOTE: after the loop, `model` refers to the model trained on the last fold only.
print('='*20, 'result', '='*20)
metrics = np.array(metrics)
print(metrics)

(9791, 5)
(9791, 9)
-------------------- 0 --------------------
(7832, 9) (7832,)
(1959, 9) (1959,)
[1]	train's binary_logloss: 0.684985	valid's binary_logloss: 0.685458
[2]	train's binary_logloss: 0.677202	valid's binary_logloss: 0.678088
[3]	train's binary_logloss: 0.669709	valid's binary_logloss: 0.670901
[4]	train's binary_logloss: 0.662405	valid's binary_logloss: 0.663966
[5]	train's binary_logloss: 0.655408	valid's binary_logloss: 0.657417
[6]	train's binary_logloss: 0.648652	valid's binary_logloss: 0.651078
[7]	train's binary_logloss: 0.64215	valid's binary_logloss: 0.64494
[8]	train's binary_logloss: 0.635848	valid's binary_logloss: 0.639
[9]	train's binary_logloss: 0.629729	valid's binary_logloss: 0.633308
[10]	train's binary_logloss: 0.623884	valid's binary_logloss: 0.627891
[11]	train's binary_logloss: 0.618126	valid's binary_logloss: 0.622408
[12]	train's binary_logloss: 0.612517	valid's binary_logloss: 0.617039
[13]	train's binary_logloss: 0.6071	valid's binary_logloss: 0.

In [126]:
# Re-read the sample submission with header=None so the first line is kept as a data
# row and the columns get default integer names.
sample_submit_path = '/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/sample_submit.csv'
sub = pd.read_csv(sample_submit_path, header=None)
sub

Unnamed: 0,0,1
0,test_00000,1
1,test_00001,0
2,test_00002,0
3,test_00003,0
4,test_00004,1
...,...,...
9795,test_09795,0
9796,test_09796,0
9797,test_09797,1
9798,test_09798,0


In [127]:
# df_text = test[['html_content']]
# df_text['html_content'] = cleaning(df_text['html_content'])
# print(df_text.shape)

# vec = CountVectorizer(min_df=200)

# vec.fit(df_text['html_content'])

# html_content_test = pd.DataFrame(vec.transform(df_text['html_content']).toarray(), columns=vec.get_feature_names())
# print(html_content_test.shape)
# html_content_test

In [128]:
# Load the saved prediction array for the test rows (file name suggests the DeBERTa
# run's submission predictions -- TODO confirm) and wrap it in a DataFrame with default
# integer columns.
sub_pred_path = "/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Oututput/test-2-bert-deberta-base-epoch10/preds/sub_pred.npy"
sub_pred = np.load(sub_pred_path)
fold_test = pd.DataFrame(sub_pred)
print(fold_test.shape)
fold_test

(9800, 4)


Unnamed: 0,0,1,2,3
0,0.400787,0.598572,0.000198,0.000212
1,0.074722,0.924805,0.000158,0.000124
2,0.046900,0.952881,0.000107,0.000120
3,0.934082,0.065323,0.000211,0.000178
4,0.948242,0.051544,0.000181,0.000152
...,...,...,...,...
9795,0.276001,0.723633,0.000206,0.000203
9796,0.151062,0.848389,0.000207,0.000192
9797,0.729980,0.269501,0.000313,0.000279
9798,0.472534,0.526733,0.000289,0.000384


In [129]:
# Build the test feature frame with the same column names as the training frame.
# .copy() makes x_test an independent frame so the encoding below does not write into
# a slice of `test`.
x_test = test[['goal', 'country', 'duration', 'category1', 'category2']].copy()
x_test.columns = ['goal1', 'country1', 'duration1', 'category1', 'category2']
id_test = test['id']

# Renamed feature name -> raw column name shared by train.csv and test.csv.
raw_column_of = {'goal1': 'goal', 'country1': 'country'}

# Encode test categoricals with the TRAINING vocabulary. Fitting a fresh LabelEncoder
# on the test column would hand out integer codes by the sorted order of the test
# values, which need not match the codes the model was trained on.
for column in categorical_features:
  target_column = x_test[column]
  le = LabelEncoder()
  le.fit(train[raw_column_of.get(column, column)])
  # Values never seen in training become missing (NaN) instead of getting a
  # meaningless code.
  train_vocab_column = pd.Categorical(target_column, categories=le.classes_)
  # Swap each label for its integer code, mirroring the 0..K-1 categories used
  # for the training frame.
  x_test[column] = train_vocab_column.rename_categories(np.arange(len(le.classes_)))

# print(x_test.shape)
# x_test = pd.concat([x_test, html_content_test], axis=1)
print(x_test.shape)
x_test = pd.concat([x_test, fold_test], axis=1)
print(x_test.shape)

# NOTE(review): `model` is whatever the training cell left behind, i.e. the model of
# the last CV fold only -- confirm this is intended rather than averaging over folds.
y_test_pred = model.predict(x_test)
# Turn predicted probabilities into hard 0/1 labels at a 0.5 threshold.
y_test_pred = np.where(y_test_pred > 0.5, 1, 0)
print(id_test)
print(y_test_pred)
df_submit = pd.DataFrame({'id':id_test, 'pred':y_test_pred})
print(df_submit)
# Written without index or header, matching the header-less sample submission layout.
df_submit.to_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Output/submit_4.csv', index=None, header=None)

(9800, 5)
(9800, 9)
0       test_00000
1       test_00001
2       test_00002
3       test_00003
4       test_00004
           ...    
9795    test_09795
9796    test_09796
9797    test_09797
9798    test_09798
9799    test_09799
Name: id, Length: 9800, dtype: object
[1 1 1 ... 0 1 1]
              id  pred
0     test_00000     1
1     test_00001     1
2     test_00002     1
3     test_00003     0
4     test_00004     0
...          ...   ...
9795  test_09795     0
9796  test_09796     1
9797  test_09797     0
9798  test_09798     1
9799  test_09799     1

[9800 rows x 2 columns]
