## 13.1 回帰の予測性能評価

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the cinema dataset used throughout the regression section.
df = pd.read_csv("data/cinema.csv")

# Impute missing values with the mean of each column.
# numeric_only=True keeps this working on pandas >= 2.0 even if the CSV holds
# a non-numeric column (DataFrame.mean() would raise TypeError there); for an
# all-numeric file the result is the same as a plain df.mean().
df = df.fillna(df.mean(numeric_only=True))

# Features: every column from "SNS1" through "original" (label slices are inclusive).
x = df.loc[:, "SNS1":"original"]
# Target: the "sales" column.
t = df["sales"]

# Fit a linear regression on the whole dataset (no train/test split here).
model = LinearRegression()
model.fit(x, t)

In [2]:
# Compute the mean squared error (MSE) of the fitted model on the data it was fitted on.
from sklearn.metrics import mean_squared_error

pred = model.predict(x)
# scikit-learn's metric signature is (y_true, y_pred). Passing both by keyword
# keeps the observed values and the predictions from being swapped.
# Whether the resulting number is large or small cannot be judged from the
# value alone; that takes domain knowledge about the data.
mse = mean_squared_error(y_true=t, y_pred=pred)
mse

151986.03957624518

In [3]:
# Root mean squared error (RMSE): the square root of the MSE computed above.
import math

# In the recorded run this is about 389.85, i.e. the predictions are off by
# roughly that much in a root-mean-square sense.
rmse = math.sqrt(mse)
rmse

389.85386951554705

In [4]:
from sklearn.metrics import mean_absolute_error


def show_rmse_and_mae(y_true, y_pred):
    """Print the RMSE and MAE of ``y_pred`` against ``y_true``.

    Arguments follow scikit-learn's (y_true, y_pred) order.

    Parameters:
        y_true: sequence of observed values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        The mean squared error from which the printed RMSE was derived.
    """
    mse = mean_squared_error(y_true, y_pred)
    print("rmse:{}".format(math.sqrt(mse)))
    print("mae:{}".format(mean_absolute_error(y_true, y_pred)))
    return mse


# Baseline: predictions (yosoku) are moderately off from the actual values (target).
yosoku = [2, 3, 5, 7, 11, 13]
target = [3, 5, 8, 11, 16, 19]
mse = show_rmse_and_mae(target, yosoku)

# Same data plus one badly wrong prediction, to compare how strongly each
# metric reacts to an outlier.
print("外れ値の混入")
yosoku = [2, 3, 5, 7, 11, 13, 46]  # the actual value is 23 but 46 was predicted
target = [3, 5, 8, 11, 16, 19, 23]
mse = show_rmse_and_mae(target, yosoku)


rmse:3.8944404818493075
mae:3.5
外れ値の混入
rmse:9.411239481143202
mae:6.285714285714286


## 13.2 分類の予測性能評価

In [5]:
df = pd.read_csv("data/Survived.csv")
# Impute only the "Age" column with its own mean. Passing a bare scalar to
# df.fillna() would write the mean age into the missing cells of every
# column, not just "Age".
# NOTE(review): assumes "Pclass" and "Survived" have no missing values; confirm against the CSV.
df = df.fillna({"Age": df["Age"].mean()})

# Features: passenger class and age. Target: the survival label.
x = df[["Pclass", "Age"]]
t = df["Survived"]

In [6]:
from sklearn import tree

# Depth-2 decision tree with a fixed seed, fitted on the full data.
# fit() returns the estimator itself, so it can be chained and then displayed.
model = tree.DecisionTreeClassifier(random_state=0, max_depth=2).fit(x, t)
model

In [7]:
from sklearn.metrics import classification_report

# Predict on the same rows the tree was fitted on, then print the plain-text
# report (precision / recall / f1-score / support per class).
pred = model.predict(x)
out_put = classification_report(t, pred)
print(out_put)

              precision    recall  f1-score   support

           0       0.78      0.65      0.71       549
           1       0.56      0.70      0.62       342

    accuracy                           0.67       891
   macro avg       0.67      0.68      0.67       891
weighted avg       0.69      0.67      0.68       891


In [8]:
# Request the same report as a nested dict instead of formatted text ...
out_put = classification_report(t, pred, output_dict=True)
# ... and turn that dict into a DataFrame for tabular display.
pd.DataFrame(out_put)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.778742,0.55814,0.672278,0.668441,0.694066
recall,0.653916,0.701754,0.672278,0.677835,0.672278
f1-score,0.710891,0.621762,0.672278,0.666326,0.67668
support,549.0,342.0,0.672278,891.0,891.0


## 13.3 K分割交差検証

In [11]:
# Reload the cinema dataset for the cross-validation section.
df = pd.read_csv("data/cinema.csv")
# Mean-impute missing values. numeric_only=True keeps this working on
# pandas >= 2.0 if the CSV holds a non-numeric column (a plain df.mean()
# would raise TypeError there); all-numeric data gives the same result.
df = df.fillna(df.mean(numeric_only=True))
# Features: columns "SNS1" through "original" (inclusive). Target: "sales".
x = df.loc[:, "SNS1":"original"]
t = df["sales"]

In [12]:
from sklearn.model_selection import KFold

# Splitter for 3-fold cross-validation. Rows are shuffled before splitting,
# and the fixed seed makes the fold assignment repeatable.
kf = KFold(
    random_state=0,
    shuffle=True,
    n_splits=3,
)

In [13]:
from sklearn.model_selection import cross_validate

# Evaluate a new linear regression with R^2 on the folds produced by `kf`.
# return_train_score=True adds the training-split scores next to the test
# scores, so the two can be compared.
model = LinearRegression()
result = cross_validate(
    estimator=model,
    X=x,
    y=t,
    scoring="r2",
    cv=kf,
    return_train_score=True,
)
print(result)

{'fit_time': array([0.0024941 , 0.00170088, 0.00151277]), 'score_time': array([0.00122786, 0.0008738 , 0.00084519]), 'test_score': array([0.72465051, 0.71740834, 0.75975591]), 'train_score': array([0.76928501, 0.76368104, 0.75780074])}


In [14]:
# Average the per-fold test scores into a single cross-validated R^2.
test_scores = result["test_score"]
sum(test_scores) / len(test_scores)

0.7339382541774341