## 12.1 ロジスティック回帰

In [1]:
# 分類木も予測結果を確率で表示できる
import pandas as pd
from sklearn import tree

df = pd.read_csv("data/KvsT.csv")
x = df.loc[:, "体重":"年代"]
t = df["派閥"]

model = tree.DecisionTreeClassifier(max_depth=1, random_state=0)
model.fit(x, t)

data = [[65, 20]]  # 予測結果用の未知のデータ
print(model.predict(data))  # 予測派閥
print(model.predict_proba(data))  # 派閥の確率
# 0.6 がきのこ派である確率、0.4 がたけのこ派である確率

['きのこ']
[[0.6 0.4]]




In [2]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/iris.csv")
df.head()

Unnamed: 0,がく片長さ,がく片幅,花弁長さ,花弁幅,種類
0,0.22,0.63,0.08,0.04,Iris-setosa
1,0.17,0.42,0.35,0.04,Iris-setosa
2,0.11,0.5,0.13,0.04,Iris-setosa
3,0.08,0.46,0.26,0.04,Iris-setosa
4,0.19,0.67,0.44,0.04,Iris-setosa


In [3]:
# 本来なら訓練&検証データ、テストデータに分割するが割愛
df_mean = df.loc[:, :"花弁幅"].mean()
train2 = df.fillna(df_mean)

x = train2.loc[:, :"花弁幅"]
t = train2["種類"]

# 特徴量の標準化
# ロジスティック回帰は、特徴量を標準化しないと予測性能が良くなりにくい
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
new = sc.fit_transform(x)


In [4]:
x_train, x_val, y_train, y_val = train_test_split(new, t, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression

# C=0.1 は正規化項の定数、正規化項の影響力を調整するための重み定数
# Cは小さい方が、過学習を防げる（回帰のalphaとは逆）
# multi_classは、３グループ以上の分類の場合はautoにする
# solverは、最適化アルゴリズムの指定
model = LogisticRegression(random_state=0, C=0.1, multi_class='auto', solver='lbfgs')

In [5]:
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
model.score(x_val, y_val)

0.8666666666666667


0.8333333333333334

In [6]:
model.coef_
# 0は、virginiaの式の系数
# 1は、versicolorの式の系数
# 2は、setonaの式の系数

array([[-0.53209541,  0.48584036, -0.52629135, -0.83192326],
       [ 0.09494378, -0.44720771, -0.00110969, -0.04413366],
       [ 0.43715163, -0.03863265,  0.52740105,  0.87605692]])

In [7]:
x_new = [[1, 2, 3, 4]]  # 新規データ
model.predict(x_new)

array(['Iris-virginica'], dtype=object)

In [8]:
# 確率による予測結果をみる
model.predict_proba(x_new)

array([[4.03394997e-05, 3.02965489e-03, 9.96930006e-01]])

## 12.2 ランダムフォレスト

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline

df = pd.read_csv("data/Survived.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,,S
887,888,1,1,female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,male,26.0,0,0,111369,30.0000,C148,C


In [10]:
jo1 = df['Pclass'] == 1
jo2 = df["Survived"] == 0
jo3 = df["Age"].isnull()
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 43

jo2 = df["Survived"] == 1
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 35

jo1 = df['Pclass'] == 2
jo2 = df["Survived"] == 0
jo3 = df["Age"].isnull()
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 26

jo2 = df["Survived"] == 1
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 20

jo1 = df['Pclass'] == 3
jo2 = df["Survived"] == 0
jo3 = df["Age"].isnull()
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 43

jo2 = df["Survived"] == 1
df.loc[(jo1) & (jo2) & (jo3), "Age"] = 35

In [11]:
col = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

x = df[col]
t = df["Survived"]

# Sex列は文字の列なので、ダミー変数化
dummy = pd.get_dummies(df["Sex"], drop_first=True, dtype="uint8")
x = pd.concat([x, dummy], axis=1)
x.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [12]:
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(x, t, test_size=0.2, random_state=0)
model = RandomForestClassifier(n_estimators=200, random_state=0)

In [13]:
model.fit(x_train, y_train)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

0.9887640449438202
0.8715083798882681


In [14]:
from sklearn import tree

model2 = tree.DecisionTreeClassifier(random_state=0)
model2.fit(x_train, y_train)

print(model2.score(x_train, y_train))
print(model2.score(x_test, y_test))

0.9887640449438202
0.8156424581005587


In [15]:
importance = model.feature_importances_
pd.Series(importance, index=x.columns)

Pclass    0.079546
Age       0.323012
SibSp     0.045682
Parch     0.032854
Fare      0.265573
male      0.253334
dtype: float64

## 12.3 アダブースト

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

x_train, x_test, y_train, y_test = train_test_split(x, t, test_size=0.2, random_state=0)

# 最大の深さ５の決定木を何個も作っていく
base_model = DecisionTreeClassifier(random_state=0, max_depth=5)
# 決定木を500個作成
model = AdaBoostClassifier(n_estimators=500, random_state=0, base_estimator=base_model)
model.fit(x_train, y_train)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))




0.9887640449438202
0.8156424581005587


In [17]:
# ランダムフォレストやアダブーストで回帰

df = pd.read_csv("data/cinema.csv")
df = df.fillna(df.mean())
x = df.loc[:, "SNS1":"original"]
t = df["sales"]

x_train, x_test, y_train, y_test = train_test_split(x, t, test_size=0.2, random_state=0)

# ランダムフォレスト回帰
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.5563347234627347

In [18]:
# アダブースト回帰
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor(random_state=0, max_depth=3)
model = AdaBoostRegressor(n_estimators=100, random_state=0, base_estimator=base)
model.fit(x_train, y_train)
model.score(x_test, y_test)



0.6748482902800903