In [70]:
import numpy as np
import pandas as pd
import os
import pickle
import gc
import re

import pandas_profiling as pdp

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [71]:
train = pd.read_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/train.csv')
test = pd.read_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/test.csv')
sub = pd.read_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/sample_submit.csv')

In [72]:
class cfg:
  n_splits = 5

  seed = 123

  num_round = 100

In [73]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'm_estimators': 100000,
    'random_state': cfg.seed,
    'importance_type': 'gain',
}

In [74]:
def cleaning(texts):
  clean_texts = []
  for text in texts:
    text = remove_tag(text)
    clean_texts.append(text)
  return clean_texts

def remove_tag(x):
  p = re.compile(r"<[^>]*?>")
  return p.sub('',x)

In [75]:
df_text = train[['html_content']]
df_text['html_content'] = cleaning(df_text['html_content'])
print(df_text.shape)

vec = CountVectorizer(min_df=1000)

vec.fit(df_text['html_content'])

html_content = pd.DataFrame(vec.transform(df_text['html_content']).toarray(), columns=vec.get_feature_names())
print(html_content.shape)
html_content

(9791, 1)
(9791, 249)


Unnamed: 0,000,10,able,about,after,all,along,already,also,always,...,with,without,work,working,world,would,year,years,you,your
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,2,1,2,1,2,2
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,1,0,0,0,0,2
2,0,0,0,0,0,2,0,0,0,0,...,1,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,2,0
4,0,0,1,2,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9786,0,0,0,2,0,0,0,0,0,2,...,0,0,1,0,0,0,0,1,0,0
9787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9788,0,1,0,0,0,2,1,0,4,0,...,5,0,0,1,0,2,0,1,1,5
9789,0,1,1,0,1,3,1,0,4,1,...,6,0,1,2,1,1,2,2,45,19


In [76]:
x_train = train[['goal', 'country', 'duration', 'category1', 'category2']]
x_train.columns = ['goal1', 'country1', 'duration1', 'category1', 'category2']
# x_train = train[['duration']]
y_train = train['state']
id_train = train['id']
categorical_features = ['goal1', 'country1', 'category1', 'category2']

cv = list(StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed).split(x_train, y_train))


metrics = []
imp = pd.DataFrame()

for column in categorical_features:
  target_column = x_train[column]
  le = LabelEncoder()
  le.fit(target_column)
  label_encoded_column = le.transform(target_column)
  x_train[column] = pd.Series(label_encoded_column).astype('category')

print(x_train.shape)
x_train = pd.concat([x_train, html_content], axis=1)
print(x_train.shape)

for nfold in range(cfg.n_splits):
  print('-'*20, nfold, '-'*20)
  idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
  x_tr, y_tr = x_train.loc[idx_tr], y_train.loc[idx_tr]
  x_va, y_va = x_train.loc[idx_va], y_train.loc[idx_va]
  print(x_tr.shape, y_tr.shape)
  print(x_va.shape, y_va.shape)

  lgb_tr = lgb.Dataset(x_tr, y_tr)
  lgb_va = lgb.Dataset(x_va, y_va)

  lgb_results = {}
  model = lgb.train(params, lgb_tr, num_boost_round=cfg.num_round,
                    valid_names=['train', 'valid'],
                    valid_sets=[lgb_tr, lgb_va])

  y_tr_pred = model.predict(x_tr)
  y_va_pred = model.predict(x_va)
  y_tr_pred = np.where(y_tr_pred > 0.5, 1, 0)
  y_va_pred = np.where(y_va_pred > 0.5, 1, 0)

  print(y_tr)
  print(y_tr_pred)
  f1_tr = f1_score(y_tr, y_tr_pred)
  f1_va = f1_score(y_va, y_va_pred)
  print('[f1 score] tr:{:.2f}, va:{:.2f}'.format(f1_tr, f1_va))
  metrics.append([nfold, f1_tr, f1_va])

print('='*20, 'result', '='*20)
metrics = np.array(metrics)
print(metrics)

(9791, 5)
(9791, 254)
-------------------- 0 --------------------
(7832, 254) (7832,)
(1959, 254) (1959,)
[1]	train's binary_logloss: 0.659772	valid's binary_logloss: 0.662393
[2]	train's binary_logloss: 0.631436	valid's binary_logloss: 0.635375
[3]	train's binary_logloss: 0.606627	valid's binary_logloss: 0.612573
[4]	train's binary_logloss: 0.585153	valid's binary_logloss: 0.59165
[5]	train's binary_logloss: 0.56667	valid's binary_logloss: 0.574688
[6]	train's binary_logloss: 0.550587	valid's binary_logloss: 0.560159
[7]	train's binary_logloss: 0.536244	valid's binary_logloss: 0.547989
[8]	train's binary_logloss: 0.523415	valid's binary_logloss: 0.537228
[9]	train's binary_logloss: 0.5117	valid's binary_logloss: 0.527787
[10]	train's binary_logloss: 0.500919	valid's binary_logloss: 0.518507
[11]	train's binary_logloss: 0.491369	valid's binary_logloss: 0.510763
[12]	train's binary_logloss: 0.48309	valid's binary_logloss: 0.503616
[13]	train's binary_logloss: 0.475165	valid's binary_log

In [77]:
sub = pd.read_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Input/sample_submit.csv', header=None)
sub

Unnamed: 0,0,1
0,test_00000,1
1,test_00001,0
2,test_00002,0
3,test_00003,0
4,test_00004,1
...,...,...
9795,test_09795,0
9796,test_09796,0
9797,test_09797,1
9798,test_09798,0


In [78]:
df_text = test[['html_content']]
df_text['html_content'] = cleaning(df_text['html_content'])
print(df_text.shape)

vec = CountVectorizer(min_df=200)

vec.fit(df_text['html_content'])

html_content_test = pd.DataFrame(vec.transform(df_text['html_content']).toarray(), columns=vec.get_feature_names())
print(html_content_test.shape)
html_content_test

(9800, 1)
(9800, 1453)


Unnamed: 0,00,000,10,100,1000,11,12,13,14,15,...,young,your,yourself,youth,youtube,このコンテンツを表示するにはhtml5対応のブラウザが必要です,再生,動画を再生,音ありでリプレイ,音声ありで
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,2,2,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,3,2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9795,8,0,0,0,2,0,0,0,0,0,...,0,1,0,0,0,2,2,2,2,2
9796,12,0,1,0,0,0,1,0,1,0,...,4,2,0,0,0,3,3,3,3,3
9797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9798,0,0,0,0,0,0,0,0,0,0,...,0,3,1,0,0,0,0,0,0,0


In [81]:
x_test = test[['goal', 'country', 'duration', 'category1', 'category2']]
x_test.columns = ['goal1', 'country1', 'duration1', 'category1', 'category2']
id_test = test['id']

for column in categorical_features:
  target_column = x_test[column]
  le = LabelEncoder()
  le.fit(target_column)
  label_encoded_column = le.transform(target_column)
  x_test[column] = pd.Series(label_encoded_column).astype('category')

x_test = pd.concat([x_test, html_content_test], axis=1)

y_test_pred = model.predict(x_test)
y_test_pred = np.where(y_test_pred > 0.5, 1, 0)
print(id_test)
print(y_test_pred)
df_submit = pd.DataFrame({'id':id_test, 'pred':y_test_pred})
print(df_submit)
df_submit.to_csv('/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Output/submit_2.csv', index=None, header=None)

0       test_00000
1       test_00001
2       test_00002
3       test_00003
4       test_00004
           ...    
9795    test_09795
9796    test_09796
9797    test_09797
9798    test_09798
9799    test_09799
Name: id, Length: 9800, dtype: object
[1 1 1 ... 0 0 0]
              id  pred
0     test_00000     1
1     test_00001     1
2     test_00002     1
3     test_00003     0
4     test_00004     0
...          ...   ...
9795  test_09795     0
9796  test_09796     1
9797  test_09797     0
9798  test_09798     0
9799  test_09799     0

[9800 rows x 2 columns]
