In [1]:
% matplotlib inline
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [2]:
# 演習①：乳がんのデータセット
bc = load_breast_cancer()

In [3]:
# sklearnのdatasetはdictionary。keysで取り出せる
bc.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [4]:
# Description
bc.DESCR



In [5]:
# 説明変数のcolumn名
bc.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='<U23')

In [6]:
# 目的変数の種類。データとしては下記にあるように、0 or 1で１つのcolumnになっている
bc.target_names

array(['malignant', 'benign'],
      dtype='<U9')

In [7]:
features_df = pd.DataFrame(bc.data, columns=bc.feature_names)
features_df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [8]:
target_df = pd.DataFrame(bc.target, columns=['target'])
target_df.describe()

Unnamed: 0,target
count,569.0
mean,0.627417
std,0.483918
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [9]:
# target_dfにNaNがあるかどうかを判定する
target_df.isnull().any().any() # FalseでNaNは含まれていないので、既にクレジング済のデータである。

False

In [10]:
# SVCを使って、乳がんを予測する
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [11]:
bc_X = pd.DataFrame(bc.data)
bc_y = bc.target

In [13]:
# trainを前から5/4、testを後の1/5として、予測精度を比較
len(bc_X) * 4 / 5

455.2

In [14]:
svc = SVC()
svc.fit(bc_X.iloc[:455, :], bc_y[:455])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
accuracy_score(svc.predict(bc_X.iloc[455:, :]), bc_y[455:])

0.77192982456140347

In [41]:
# 交差検定をするにあたり、一番簡便なcross_val_scoreを使う。５分割する
cross_val_score(SVC(), bc_X, bc_y, cv=5)

array([ 0.62608696,  0.62608696,  0.62831858,  0.62831858,  0.62831858])

In [16]:
# RandomForrestを使ってみる
from sklearn.ensemble import RandomForestClassifier

In [17]:
rfc = RandomForestClassifier()
rfc.fit(bc_X.iloc[:455, :], bc_y[:455])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
accuracy_score(rfc.predict(bc_X.iloc[455:, :]), bc_y[455:]) # 非常に高い精度で予測できた

0.97368421052631582

In [45]:
# cross_val_scoreを用いて、交差検定を行う
cross_val_score(RandomForestClassifier(), bc_X, bc_y, cv=5)

array([ 0.92173913,  0.93913043,  0.97345133,  0.96460177,  0.97345133])

In [47]:
# RandomForestClassifierのパラメータを変更し、交差検定の平均値をとる
# RandomForestClassifierのパラメータ詳細についてはdocumentを参照する
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
cross_val_score(RandomForestClassifier(max_depth=10), bc_X, bc_y, cv=3).mean()

0.96484730344379466

In [19]:
# 演習②：タイタニックのデータで生存者の予測を行う
import seaborn as sns

In [20]:
# NaNを除外するために、dropnaを使う
titanic = sns.load_dataset('titanic').dropna()
titanic[['sex', 'age', 'fare', 'embark_town']].head()

Unnamed: 0,sex,age,fare,embark_town
1,female,38.0,71.2833,Cherbourg
3,female,35.0,53.1,Southampton
6,male,54.0,51.8625,Southampton
10,female,4.0,16.7,Southampton
11,female,58.0,26.55,Southampton


In [21]:
titanic.survived.head()

1     1
3     1
6     0
10    1
11    1
Name: survived, dtype: int64

In [22]:
# sex, embark_townは質的データであり、そのまま説明変数にできないため、dummy変数にする
dummies = pd.get_dummies(titanic[['sex', 'embark_town']])

In [23]:
X = pd.concat([titanic[['age', 'fare']], dummies], axis=1)

In [24]:
y = titanic.survived

In [26]:
# だめな例。trainとtestが同じだと、当然高い精度になる
rfc1 = RandomForestClassifier()
rfc1.fit(X, y)
accuracy_score(rfc1.predict(X), y)

0.97252747252747251

In [27]:
# trainingを最初から4/5に、testを最後の1/5にする
train_number = int(len(X) * 4 / 5)
rfc2 = RandomForestClassifier()
rfc2.fit(X.iloc[:train_number, :], y[:train_number])
accuracy_score(rfc2.predict(X.iloc[train_number:, :]), y[train_number:])
# ただしこの方法だと、データの順番に偏りがある場合に、精度が悪くなってしまう（例えば生存者、死亡者の順番に名簿が並んでいた場合など）

0.81081081081081086

In [28]:
# train_test_splitを使うと、ランダムにtraining dataとtest dataに分けることが出来る
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
rfc3 = RandomForestClassifier()
rfc3.fit(X_train, y_train)
accuracy_score(y_test, rfc3.predict(X_test))

0.72972972972972971

In [31]:
# 交差検定をする。一番簡易的なcross_val_scoreを使う
cross_val_score(SVC(), X, y)

array([ 0.67213115,  0.68852459,  0.66666667])

In [32]:
cross_val_score(RandomForestClassifier(), X, y, cv=5)

array([ 0.78378378,  0.78378378,  0.83783784,  0.55555556,  0.82857143])

In [48]:
# cross_val_scoreを使わずに交差検定を自分で実装する
# KFoldでtrain, testのデータセットを作れる
from sklearn.model_selection import KFold

In [49]:
# 簡易的にage, fareのみを説明変数とする
# indexで指定するために、[3, 6]となってしまっている点は可読性が悪い
rf = RandomForestClassifier()
kf = KFold(n_splits=5)
for tr_idx, te_idx in kf.split(titanic.index):
    X_tr = titanic.iloc[tr_idx, [3, 6]]
    X_te = titanic.iloc[te_idx, [3, 6]]
    y_tr = titanic.iloc[tr_idx, 0]
    y_te = titanic.iloc[te_idx, 0]
    svc.fit(X_tr, y_tr)
    print(accuracy_score(y_te, svc.predict(X_te)))

0.513513513514
0.675675675676
0.666666666667
0.75
0.694444444444


In [50]:
# 演習③：手書き数字の判別を行う
# このデータセットでは既に画像処理がされていて、8×8ピクセルに変換されている
# 詳細はdocument参照
from sklearn.datasets import load_digits

In [51]:
ld = load_digits()
d_X = ld.data
d_X

array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]])

In [52]:
d_y = ld.target
d_y

array([0, 1, 2, ..., 8, 9, 8])

In [55]:
# 複数のモデルで交差検定する
from sklearn.linear_model import LogisticRegression

for model in [RandomForestClassifier(), LogisticRegression(), SVC()]:
    r = cross_val_score(model, d_X, d_y, cv=5)
    print(r.mean(), r.std())

0.906061517707 0.0211722686811
0.921018811336 0.0302462780159
0.448786800616 0.0372083256436


In [36]:
cross_val_score(SVC(), d_X, d_y)

array([ 0.39368771,  0.41068447,  0.45973154])

In [37]:
cross_val_score(RandomForestClassifier(), d_X, d_y)

array([ 0.89368771,  0.91986644,  0.89261745])

In [38]:
svc=SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [39]:
svc.fit(X, y)
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
# 演習④：レストランのチップのデータから、喫煙者かどうかを予測する
tips = sns.load_dataset('tips') 
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [57]:
X = pd.concat([tips[['total_bill', 'tip', 'size']],
              pd.get_dummies(tips[['sex', 'day', 'time']])], axis=1)
y = tips.smoker

In [58]:
# 演習③で行った3つのモデルでの交差検定を、メソッド化する
def prediction(X, y):
    for model in [RandomForestClassifier(), LogisticRegression(), SVC()]:
        r = cross_val_score(model, X, y, cv=5)
        print(r.mean(), r.std())

In [59]:
prediction(X, y)

0.585744897959 0.0870277976722
0.660068027211 0.118104899543
0.65556462585 0.0497023667243


In [60]:
# 昼と夜のチップで有意差があるか、t検定を行う
from scipy.stats import ttest_ind

In [61]:
tips.time.unique()

[Dinner, Lunch]
Categories (2, object): [Dinner, Lunch]

In [62]:
ttest_ind(tips[tips.time == 'Dinner'].tip, tips[tips.time == 'Lunch'].tip)

Ttest_indResult(statistic=1.9062569301202392, pvalue=0.05780153475171558)