<a href="https://colab.research.google.com/github/hashk1/nlp-100-knock-2020-rev2/blob/main/06-%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第6章: 機械学習

In [None]:
# ライブラリ読み込み
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Download the News Aggregator dataset archive (wget -c resumes a partial download).
! wget -c https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

In [None]:
# Extract the archive into NewsAggregatorDataset/ (-o overwrites existing files without prompting).
! unzip -o -d NewsAggregatorDataset NewsAggregatorDataset.zip

In [None]:
# Read the dataset description shipped in readme.txt.
! cat NewsAggregatorDataset/readme.txt

In [None]:
# Peek at the first five lines of the corpus file.
! head -n 5 NewsAggregatorDataset/newsCorpora.csv

### 50. データの入手・整形

In [None]:
# Load the raw corpus, keep only the target publishers, and encode the category.
COLUMNS = ["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
PUBLISHERS = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
CATEGORY_TO_ID = {"b": 0, "t": 1, "e": 2, "m": 3}

# The file is tab-separated without a header row. Titles can contain unbalanced
# double quotes; with the default quoting the parser would then swallow several
# lines into one field, so quote handling is disabled (3 == csv.QUOTE_NONE).
news_raw = pd.read_table(
    "NewsAggregatorDataset/newsCorpora.csv",
    header=None,
    names=COLUMNS,
    quoting=3,
)

# Select rows and columns in one .loc call and derive the encoded column with
# .assign so that a new frame is built instead of writing into a filtered
# slice (which raised SettingWithCopyWarning).
df = (
    news_raw.loc[news_raw["PUBLISHER"].isin(PUBLISHERS), ["CATEGORY", "TITLE"]]
    .assign(CATEGORY=lambda frame: frame["CATEGORY"].map(CATEGORY_TO_ID))
)
df.head()

In [None]:
# Stratified 80% / 10% / 10% split into train / validation / test.
# X deliberately keeps the CATEGORY column so each saved file holds
# "<category>\t<title>" rows.
X, y = df, df["CATEGORY"]
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=0
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rest, y_rest, train_size=0.5, stratify=y_rest, random_state=0
)

# Write each split as a header-less, tab-separated file.
for path, frame in (("train.txt", X_train), ("valid.txt", X_valid), ("test.txt", X_test)):
    frame.to_csv(path, sep="\t", index=False, header=None)

In [None]:
# Count the lines in each split file.
! wc -l train.txt valid.txt test.txt

### 51. 特徴量抽出

In [None]:
# Reload the three splits from disk and tag every row with the split it came
# from, so the splits can be vectorized together and separated again later.
split_columns = ["CATEGORY", "TITLE"]
X_train = pd.read_table("train.txt", header=None, names=split_columns).assign(DATA="TRAIN")
X_valid = pd.read_table("valid.txt", header=None, names=split_columns).assign(DATA="VALID")
X_test = pd.read_table("test.txt", header=None, names=split_columns).assign(DATA="TEST")

# Stack the splits into one frame with a fresh 0..n-1 index.
X = pd.concat([X_train, X_valid, X_test], ignore_index=True)
X.head()

In [None]:
# Bag-of-words vectorization of TITLE.
# The token pattern matches runs of one or more word characters, so
# single-character tokens are kept as features.
vectorizer = CountVectorizer(token_pattern="(?u)\\b\\w+\\b")

# Learn the vocabulary from the training rows only. Fitting on all rows would
# leak words that occur only in the validation/test titles into the feature
# space the model is trained on.
vectorizer.fit(X.loc[X["DATA"] == "TRAIN", "TITLE"])
bow = vectorizer.transform(X["TITLE"])

# Rebuild X from its three metadata columns plus the count columns. Selecting
# the metadata columns explicitly keeps this cell idempotent: re-running it
# does not append a second copy of the feature columns.
X = pd.concat(
    [X[["CATEGORY", "TITLE", "DATA"]], pd.DataFrame(bow.toarray(), index=X.index)],
    axis=1,
)
X.head()

In [None]:
# Show the vocabulary learned by the vectorizer
# (a dict mapping each token to its column index in the bag-of-words matrix).
vocabulary_ = vectorizer.vocabulary_
vocabulary_

In [None]:
# Persist vocabulary_ to disk so it can be restored later with joblib.load.
joblib.dump(vocabulary_, "vocabulary_.joblib")

In [None]:
# Separate the combined frame back into its splits, keeping only the
# bag-of-words count columns.
meta_columns = ["CATEGORY", "TITLE", "DATA"]
X_train = X[X["DATA"] == "TRAIN"].drop(columns=meta_columns)
X_valid = X[X["DATA"] == "VALID"].drop(columns=meta_columns)
X_test = X[X["DATA"] == "TEST"].drop(columns=meta_columns)

# Save each feature matrix as a header-less, tab-separated file.
for path, features in (
    ("train.feature.txt", X_train),
    ("valid.feature.txt", X_valid),
    ("test.feature.txt", X_test),
):
    features.to_csv(path, sep="\t", index=False, header=None)

### 52. 学習

以下で使うので、まとめて書いておく

In [None]:
def _read_labels(path):
    """Return the first column (the encoded category) of a saved split file."""
    return pd.read_table(path, header=None).iloc[:, 0]


# The feature matrices X_train / X_valid / X_test are still in memory from the
# feature-extraction section; they could instead be reloaded from
# train.feature.txt / valid.feature.txt / test.feature.txt with
# pd.read_table(..., header=None).

# Training labels
y_train = _read_labels("train.txt")

# Validation labels
y_valid = _read_labels("valid.txt")

# Evaluation (test) labels
y_test = _read_labels("test.txt")

In [None]:
# Build the logistic-regression model and fit it on the training features.
# max_iter is set to 10000 (presumably so the solver has room to converge on
# this feature matrix — TODO confirm); random_state is fixed for reproducibility.
clf = LogisticRegression(max_iter=10000, random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Learned coefficients of the fitted model.
clf.coef_

In [None]:
# Persist the fitted model to disk so it can be restored later with joblib.load.
joblib.dump(clf, "logistic-regression-model.joblib")

### 53. 予測

In [None]:
# Predicted class probabilities for the training data.
# Label encoding used when the data was prepared: {"b": 0, "t": 1, "e": 2, "m": 3}
y_train_pred_prob = clf.predict_proba(X_train)
y_train_pred_prob

### 54. 正解率の計測

In [None]:
# Predicted labels for the training and evaluation (test) data.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Accuracy on each split.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("accuracy for train: {}".format(train_accuracy))
print("accuracy for test: {}".format(test_accuracy))

### 55. 混同行列の作成

In [None]:
# Confusion matrices for the training and evaluation (test) predictions.
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
print("confusion matrix for train: \n{}".format(cm_train))
print("confusion matrix for test: \n{}".format(cm_test))

### 56. 適合率、再現率、F1スコアの計測

In [None]:
# Precision, recall and F1 on the evaluation (test) data.
# The per-class scores are now computed on the same split as the micro/macro
# averages, so the averages actually summarize the per-class values printed
# above them (previously the per-class scores came from the training split).
scorers = [("precision", precision_score), ("recall", recall_score), ("F1", f1_score)]

for metric_name, scorer in scorers:
    # average=None returns one score per class, ordered by class label.
    per_class = scorer(y_test, y_test_pred, average=None)
    micro = scorer(y_test, y_test_pred, average="micro")
    macro = scorer(y_test, y_test_pred, average="macro")
    print("{} for test: {}".format(metric_name, per_class))
    print("micro averaged {} for test: {}".format(metric_name, micro))
    print("Macro averaged {} for test: {}".format(metric_name, macro))

### 57. 特徴量の重みの確認

In [None]:
# Inspect which tokens carry the largest / smallest weights for each category.
# clf and the vocabulary are taken from memory; they can alternatively be
# restored with joblib.load from "logistic-regression-model.joblib" and
# "vocabulary_.joblib".

# Row i of clf.coef_ is paired with category[i], following the label encoding
# {"b": 0, "t": 1, "e": 2, "m": 3}.
category = ["b", "t", "e", "m"]

# vocabulary_ maps token -> column index, and iterating the dict yields tokens
# in insertion order, NOT in column order. Sort the tokens by their index so
# that feature_names[j] really is the token behind coefficient column j.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

for i, cat in enumerate(category):
    print("category: {}".format(cat))
    weights = dict(zip(feature_names, clf.coef_[i]))
    # Both rankings use the absolute weight: top-10 are the most influential
    # tokens (either sign), bottom-10 are the tokens with weight closest to zero.
    top10 = sorted(weights.items(), key=lambda x: -abs(x[1]))[:10]
    bottom10 = sorted(weights.items(), key=lambda x: abs(x[1]))[:10]
    print("top-10:{}".format(top10))
    print("bottom-10:{}".format(bottom10))

### 58. 正則化パラメータの変更

In [None]:
# Sweep the regularization parameter C and record the accuracy on every split.
cs = [0.01, 0.1, 1, 10, 100]

train_accs, valid_accs, test_accs = [], [], []
# (features, labels, list that collects the accuracies) for each split
evaluations = (
    (X_train, y_train, train_accs),
    (X_valid, y_valid, valid_accs),
    (X_test, y_test, test_accs),
)

for c in cs:
    clf = LogisticRegression(C=c, max_iter=10000, random_state=0)
    clf.fit(X_train, y_train)
    for features, labels, history in evaluations:
        history.append(accuracy_score(labels, clf.predict(features)))

# One accuracy curve per split, with C on a logarithmic axis.
for history, curve_label in (
    (train_accs, "accuracy for train"),
    (valid_accs, "accuracy for valid"),
    (test_accs, "accuracy for test"),
):
    plt.plot(cs, history, label=curve_label)
plt.xscale("log")
plt.xlabel("C")
plt.ylabel("accuracy")
plt.legend()

### 59. ハイパーパラメータの探索

In [None]:
# Hyper-parameter search: pick the algorithm/parameter with the best accuracy
# on the VALIDATION data, then report that single model's accuracy on the
# evaluation (test) data. Selecting on the test split, as was done before,
# makes the reported test accuracy optimistically biased.

# Candidate models, keyed by a human-readable description.
candidates = {}

# Logistic Regression
for c in [0.01, 0.1, 1, 10, 100]:
    candidates["Logistic Regression with C={}".format(c)] = LogisticRegression(
        C=c, max_iter=10000, random_state=0
    )

# Random Forest
for depth in [2, 4, 6, 8, 10]:
    candidates["Random Forest with max_depth={}".format(depth)] = RandomForestClassifier(
        max_depth=depth, random_state=0
    )

# Validation accuracy of every candidate (each is trained on the training data).
accs = {}
for name, model in candidates.items():
    model.fit(X_train, y_train)
    accs[name] = accuracy_score(y_valid, model.predict(X_valid))

# max() returns the first best entry, so ties resolve in insertion order.
best_model = max(accs, key=accs.get)
best_valid_acc = accs[best_model]

# The test split is touched only once, for the selected model.
clf = candidates[best_model]
best_acc = accuracy_score(y_test, clf.predict(X_test))

print("best on valid: {} (accuracy for valid: {})".format(best_model, best_valid_acc))
print("{}: {}".format(best_model, best_acc))