# はじめに

In [138]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from  sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [162]:
train = pd.read_csv('/content/drive/MyDrive/data-comp/signate/mufg-comp/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data-comp/signate/mufg-comp/data/test.csv')
sample_submit = pd.read_csv('/content/drive/MyDrive/data-comp/signate/mufg-comp/data/sample_submit.csv')

In [163]:
print(train.shape, test.shape)

(9791, 8) (9800, 7)


In [164]:
train.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0
3,train_00003,1001-2000,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1
4,train_00004,1001-2000,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1


In [165]:
test.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content
0,test_00000,5001-6000,FR,30,dance,performances,"<div class=""contents""><div><p>Bonjour ,</p><p>..."
1,test_00001,6001-7000,GB,23,publishing,children's books,"<div class=""contents""><div><p><span class=""bol..."
2,test_00002,6001-7000,GB,30,theater,plays,"<div class=""contents""><div><p>COW is a rural t..."
3,test_00003,1001-2000,CA,14,art,digital art,"<div class=""contents""><div><p>I've been creati..."
4,test_00004,1-1000,US,30,music,hip-hop,"<div class=""contents""><div><div class=""templat..."


In [166]:
sample_submit.head()

Unnamed: 0,test_00000,1
0,test_00001,0
1,test_00002,0
2,test_00003,0
3,test_00004,1
4,test_00005,1


In [167]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9791 entries, 0 to 9790
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            9791 non-null   object
 1   goal          9791 non-null   object
 2   country       9791 non-null   object
 3   duration      9791 non-null   int64 
 4   category1     9791 non-null   object
 5   category2     9791 non-null   object
 6   html_content  9791 non-null   object
 7   state         9791 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 612.1+ KB


In [168]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            9800 non-null   object
 1   goal          9800 non-null   object
 2   country       9800 non-null   object
 3   duration      9800 non-null   int64 
 4   category1     9800 non-null   object
 5   category2     9800 non-null   object
 6   html_content  9800 non-null   object
dtypes: int64(1), object(6)
memory usage: 536.1+ KB


In [169]:
train_x = train.drop(['state'], axis=1)
train_y = train['state']
test_x = test.copy()

# 前処理

In [170]:
# country, category1, category2をラベルエンコーディングする
cat_cols = ['goal', 'country', 'category1', 'category2']
all_x = pd.concat([train_x, test_x])
for c in cat_cols:
  le = LabelEncoder()
  all_x[c] = le.fit_transform(all_x[c])
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

# モデリング

In [171]:
train_x = train_x[['goal', 'country', 'duration', 'category1', 'category2']]
test_x = test_x[['goal', 'country', 'duration', 'category1', 'category2']]

In [172]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9791 entries, 0 to 9790
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   goal       9791 non-null   int64
 1   country    9791 non-null   int64
 2   duration   9791 non-null   int64
 3   category1  9791 non-null   int64
 4   category2  9791 non-null   int64
dtypes: int64(5)
memory usage: 382.6 KB


In [173]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   goal       9800 non-null   int64
 1   country    9800 non-null   int64
 2   duration   9800 non-null   int64
 3   category1  9800 non-null   int64
 4   category2  9800 non-null   int64
dtypes: int64(5)
memory usage: 382.9 KB


In [174]:
print(train_x.shape, test_x.shape,train_y.shape)

(9791, 5) (9800, 5) (9791,)


In [175]:
def predict_cv(model, train_x, train_y, test_x):
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=71)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [176]:
def predict_cv(model, train_x, train_y, test_x):
    preds = []
    va_idxes = []

    kf = KFold(n_splits=4, shuffle=True, random_state=71)

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y)
        pred = model.predict(va_x)
        preds.append(pred)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    return pred_train

In [177]:
model_lb = LGBMClassifier()
pred_train = predict_cv(model_lb, train_x, train_y, test_x)

In [178]:
f1_score(train_y, pred_train)

0.7250108178277802

# 提出

In [179]:
model_lb.fit(train_x, train_y)
pred_test = model_lb.predict(test_x)

In [180]:
submit = pd.DataFrame({'id': test['id'], 'state': pred_test})
submit.to_csv('/content/drive/MyDrive/data-comp/signate/mufg-comp/output/submit1.csv', header=False, index=False)