# LGBMClassifier
---
1. クレンジング処理
2. StratifiedKFoldによる正例・負例の比率を保ったデータ分割
3. foldごとの学習とモデルの保存
4. 閾値の探索
5. 推論と提出

In [None]:
!pip install -q texthero

In [None]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
import torch.nn as nn 
import torch 
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os, time, pickle, json
import texthero as hero
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from typing import Dict, List, Any, Tuple, Union 
from sklearn.metrics import f1_score, classification_report
from pprint import pprint 
import warnings
from statistics import mean 
import random 
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from lightgbm import LGBMClassifier

In [None]:
# Load the raw training and test tables from the local ./raw directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./raw/train.csv", "./raw/test.csv")
)

## クレンジング処理
---

In [None]:
def _build_tf_idf_pipeline(dim: int) -> Pipeline:
    """Return an unfitted TF-IDF (1-2 grams, min_df=5) -> TruncatedSVD(dim) pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=5)),
        ('svd', TruncatedSVD(n_components=dim, random_state=0)),
    ])


def create_tf_idf(df, dim: int = 50, is_train=True, pipeline=None) -> pd.DataFrame:
    """Turn the ``title``/``abstract`` text of *df* into dense TF-IDF + SVD features.

    The two text columns are joined, cleaned with texthero, vectorised with
    TF-IDF and reduced with TruncatedSVD. The input frame is not modified.

    Args:
        df: Frame with ``title`` and ``abstract`` columns (and ``judgement``
            when ``is_train`` is true).
        dim: Number of SVD components; only used when ``pipeline`` is None.
        is_train: When true, the ``judgement`` column is copied to the output
            and a supplied ``pipeline`` is fitted on this data.
        pipeline: Optional scikit-learn pipeline to share between calls. With
            ``is_train=True`` it is fitted on *df*; with ``is_train=False`` it
            must already be fitted and is only applied. When omitted, a fresh
            pipeline is built and fitted on *df* (the original behaviour).

    Returns:
        A frame indexed like *df* with columns ``content_0 .. content_{k-1}``
        (plus ``judgement`` for training data).
    """
    # Fill both text fields before joining: a missing title would otherwise
    # turn the whole concatenation into NaN and discard the abstract.
    contents = df["title"].fillna("") + " " + df["abstract"].fillna(" ")
    clean_text = hero.clean(contents, pipeline=[
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        hero.preprocessing.remove_stopwords,
        hero.preprocessing.remove_whitespace
    ])

    if pipeline is None:
        features = _build_tf_idf_pipeline(dim).fit_transform(clean_text)
    elif is_train:
        features = pipeline.fit_transform(clean_text)
    else:
        # Reuse the vocabulary and SVD basis learned on the training data so
        # the columns mean the same thing for train and test.
        features = pipeline.transform(clean_text)

    # Name the columns after the actual output width rather than `dim`, which
    # may differ from the width of a caller-supplied pipeline.
    out_df = pd.DataFrame(
        features,
        index=df.index,
        columns=[f"content_{i}" for i in range(features.shape[1])],
    )
    if is_train:
        out_df["judgement"] = df["judgement"]

    return out_df


# Fit the vectoriser/SVD once on the training text and apply the same fitted
# transform to the test text; fitting them separately yields unrelated features.
tfidf_pipeline = _build_tf_idf_pipeline(50)
train = create_tf_idf(train, pipeline=tfidf_pipeline)
test = create_tf_idf(test, 50, False, pipeline=tfidf_pipeline)

## 交差分割
---

In [None]:
def get_train_data(train, seed: int = 0):
    """Assign each row a stratified cross-validation fold id.

    Rows are split into 5 shuffled folds by ``StratifiedKFold`` using the
    ``judgement`` column as the stratification target, and the fold in which
    a row serves as validation data is stored in a new ``fold`` column
    (dtype ``uint8``).

    Args:
        train: Training frame containing a ``judgement`` column. The ``fold``
            column is added to this frame in place.
        seed: Random state for the shuffled split.

    Returns:
        The same frame, now carrying the ``fold`` column.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # `split` yields positional indices, so collect them in a positional array
    # instead of using label-based `.loc`, which is only equivalent when the
    # frame has a default RangeIndex.
    folds = np.zeros(len(train), dtype=np.uint8)
    for n, (_, va_id) in enumerate(kf.split(train, train.judgement)):
        folds[va_id] = n
    train["fold"] = folds
    return train

train = get_train_data(train)
train.head()

## 学習と閾値探索
---

In [None]:
def train_fn(fold) -> Tuple[List[float], List[float]]:
    """Train one LightGBM model with *fold* held out and save it to disk.

    Reads the module-level ``train`` frame, fits on every row whose ``fold``
    differs from *fold*, evaluates on the held-out rows and writes the model
    to ``models/lgb{fold}.model``.

    Args:
        fold: Id of the fold to hold out for validation.

    Returns:
        A pair ``(proba, labels)``: the predicted positive-class probability
        and the true ``judgement`` value for each held-out row, in row order.
    """
    train_ds = train[train.fold != fold]
    val_ds = train[train.fold == fold]
    # Targets are passed as 1-D Series; a one-column DataFrame is a column
    # vector, which scikit-learn style estimators expect to be flattened.
    x_train, y_train = train_ds.drop(["fold", "judgement"], axis=1), train_ds["judgement"]
    x_val, y_val = val_ds.drop(["fold", "judgement"], axis=1), val_ds["judgement"]

    # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keywords
    # were removed in LightGBM 4.0 in favour of callbacks -- confirm the
    # installed version before upgrading.
    model = LGBMClassifier(random_state=0).fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)],
                                             early_stopping_rounds=100, verbose=0)
    proba = model.predict_proba(x_val)[:, 1].tolist()  # probability of the positive class
    os.makedirs("models/", exist_ok=True)
    # The scikit-learn wrapper has no `save_model`; the fitted Booster does.
    model.booster_.save_model(f"models/lgb{fold}.model")
    return proba, y_val.to_numpy().ravel().tolist()

def main() -> Dict[str, float]:
    """Train and save one model per fold, then score candidate thresholds.

    Out-of-fold probabilities from the 5 calls to ``train_fn`` are pooled and
    binarised at each threshold in ``np.arange(0.01, 0.2, 0.03)``. For every
    threshold the classification report is printed and the F1 score recorded.

    Returns:
        Mapping from ``str(threshold)`` to the F1 score at that threshold.
    """
    predict, correct = [], []
    for fold in range(5):
        proba, corr = train_fn(fold)
        predict.extend(proba)
        correct.extend(corr)
    predict = np.array(predict)

    result = {}
    for threshold in np.arange(0.01, 0.2, 0.03):
        pred = np.where(predict > threshold, 1, 0).astype(np.uint8).tolist()
        # scikit-learn metrics take (y_true, y_pred); with the arguments
        # swapped the report shows precision and recall exchanged.
        result[str(threshold)] = f1_score(correct, pred)
        # print (not pprint) so the report's newlines are rendered.
        print(classification_report(correct, pred))
    return result


result = main()
# Keep the best-scoring threshold available for the inference step.
best_key = max(result, key=result.get)
threshold = float(best_key)
print(f"best threshold: {threshold} (F1={result[best_key]})")

## 推論と提出
---

In [None]:
def test_fn(fold) -> List[float]:
    """Predict positive-class probabilities for the test set with one fold's model.

    Loads ``models/lgb{fold}.model`` and scores the feature columns of the
    module-level ``test`` frame.

    Args:
        fold: Id of the fold whose saved model should be used.

    Returns:
        One probability per test row, in row order.
    """
    # Local import: the module only imports LGBMClassifier, and a saved model
    # file is loaded through the native Booster rather than the sklearn wrapper.
    from lightgbm import Booster

    # Use the same feature columns as training (everything except the
    # bookkeeping/target columns, which the test frame normally lacks anyway).
    x_test = test.drop(columns=["fold", "judgement"], errors="ignore")
    booster = Booster(model_file=f"models/lgb{fold}.model")
    proba = np.asarray(booster.predict(x_test))
    if proba.ndim == 2:
        # Per-class output: keep the column of the positive class.
        proba = proba[:, 1]
    return proba.tolist()


def inference(threshold: float) -> List[int]:
    """Average the 5 fold models' test probabilities and binarise them.

    Args:
        threshold: A row is labelled 1 when its mean probability is strictly
            greater than this value, otherwise 0.

    Returns:
        One 0/1 label per test row.
    """
    # One row of probabilities per fold model.
    fold_probas = np.array([test_fn(fold) for fold in range(5)])
    # Mean over the fold axis gives the ensemble probability per test row.
    mean_proba = np.mean(fold_probas, axis=0)
    labels = np.where(mean_proba > threshold, 1, 0).astype(np.uint8)
    return labels.tolist()

def submittion(threshold):
    """Write the submission file for the given decision threshold.

    Runs ``inference(threshold)``, pairs the labels with the ids read from
    ``./raw/sample_submit.csv`` and writes them, without header or index, to
    ``./raw/submit0914lgb.csv``.

    Args:
        threshold: Probability cut-off forwarded to ``inference``.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows read from the sample submission file.
    """
    predict = inference(threshold)
    sub = pd.read_csv("./raw/sample_submit.csv")
    sub.columns = ["id", "dummy"]
    if len(predict) != len(sub):
        # NOTE(review): read_csv treats the first line as a header; if the
        # sample file has no header row, one id is lost here -- confirm.
        raise ValueError(
            f"got {len(predict)} predictions for {len(sub)} submission rows"
        )
    # Build a fresh frame instead of assigning onto a column slice, which
    # avoids pandas' chained-assignment warning.
    sub = pd.DataFrame({"id": sub["id"], "predict": predict})
    sub.to_csv("./raw/submit0914lgb.csv", index=False, header=False)

# NOTE(review): `threshold` is not assigned in this cell; it has to be defined
# earlier in the session (e.g. the best value from the threshold search).
submittion(threshold)
