# スマホのセンサーデータから行動を予測しよう

Human Activity Recognition Using Smartphones Data Set

In [2]:
# 基本的なライブラリ
import os
import numpy as np
import pandas as pd
import sklearn

# 今回使うサポートベクトルマシンと線形回帰
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# 検証に使う関数
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# モデルの評価に使う関数
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# 訓練したモデルを保存するための関数
from sklearn.externals import joblib

In [4]:
import pandas as pd
df = pd.read_table('train/X_train.txt', header=None, delim_whitespace=True)
df.to_csv('train/X_train.csv', index=False, header=False)

df = pd.read_table('test/X_test.txt', header=None, delim_whitespace=True)
df.to_csv('test/X_test.csv', index=False, header=False)

df = pd.read_table('train/y_train.txt', header=None, delim_whitespace=True)
df.to_csv('train/y_train.csv', index=False, header=False)

df = pd.read_table('test/y_test.txt', header=None, delim_whitespace=True)
df.to_csv('test/y_test.csv', index=False, header=False)

In [5]:
# データセットの読み込み
# X_train = pd.read_csv("Train/X_train.csv", sep = " ", header = None)
# y_train = pd.read_csv("Train/y_train.csv", sep = " ", header = None)
# X_test = pd.read_csv("Test/X_test.csv", sep = " ", header = None)
# y_test = pd.read_csv("Test/y_test.csv", sep = " ", header = None)
X_train = pd.read_csv("train/X_train.csv", header = None)
y_train = pd.read_csv("train/y_train.csv", header = None)
X_test = pd.read_csv("test/X_test.csv",  header = None)
y_test = pd.read_csv("test/y_test.csv",  header = None)

# ランダムサーチでモデルを訓練

In [6]:
# ランダムサーチのハイパーパラメータ群を用意
param_dist_1 = {"C":[0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0], "kernel":['linear','poly','rbf','sigmoid'], 
                "degree":[3, 4, 5, 6, 7], }

param_dist_2 = {"penalty":['l2', 'l1', 'elasticnet'], "alpha":[0.0001, 0.0002, 0.0003, 0.0004], "l1_ratio":[0.11, 0.12, 0.13,
                 0.14, 0.15, 0.16, 0.17], }

In [7]:
# モデルのインスタンスを作成
svm_clf = SVC()
sgd_clf = SGDClassifier()

In [8]:
# ランダムサーチのインスタンスを作成
random_search_1 = RandomizedSearchCV(svm_clf, cv = 5, scoring = 'accuracy', param_distributions = param_dist_1, n_iter = 20)
random_search_2 = RandomizedSearchCV(sgd_clf, cv = 5, scoring = 'accuracy', param_distributions = param_dist_2, n_iter = 20)

In [9]:
# ランダムサーチでモデルを訓練
random_search_1.fit(X_train, y_train.values.ravel())
random_search_2.fit(X_train, y_train.values.ravel())



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SGDClassifier(alpha=0.0001, average=False,
                                           class_weight=None,
                                           early_stopping=False, epsilon=0.1,
                                           eta0=0.0, fit_intercept=True,
                                           l1_ratio=0.15,
                                           learning_rate='optimal',
                                           loss='hinge', max_iter=1000,
                                           n_iter_no_change=5, n_jobs=None,
                                           penalty='l2', power_t=0.5,
                                           random_state=None, shuffle=True,
                                           tol=0.001, validation_fraction=0.1,
                                           verbose=0, warm_start=False),
                   iid='warn', n_iter=20, n_jobs=None,
                   param_distr

# ベストな予測器のインスタンスを作成

In [10]:
# ベストな予測器のインスタンスを作成
svm_clf_best = random_search_1.best_estimator_
sgd_clf_best = random_search_2.best_estimator_

In [11]:
# ベストな予測器を訓練セットで訓練
svm_clf_best.fit(X_train, y_train)
sgd_clf_best.fit(X_train, y_train)

#ベストな予測器を使ってテストセットの予測
y_predict_svm = svm_clf_best.predict(X_test)
y_predict_sgd = sgd_clf_best.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# ベストな予測器の評価

In [12]:
# 各モデルの正確度accuracyを比較
print('SVM accuracy:', accuracy_score(y_test, y_predict_svm))
print('SGD accuracy:', accuracy_score(y_test, y_predict_sgd))

# 混同行列を表示
conf_mx_svm = confusion_matrix(y_test, y_predict_svm)
conf_mx_sgd = confusion_matrix(y_test, y_predict_sgd)
print(conf_mx_svm,"\n")
print(conf_mx_sgd)

# 適合率、再現率、F値、インスタンス数を表示
prfs_svm = precision_recall_fscore_support(y_test, y_predict_svm)
prfs_sgd = precision_recall_fscore_support(y_test, y_predict_sgd)
print(prfs_svm)
print("\n")
print(prfs_sgd)

SVM accuracy: 0.9633525619273838
SGD accuracy: 0.9616559212758737
[[492   1   3   0   0   0]
 [ 18 451   2   0   0   0]
 [  4   6 410   0   0   0]
 [  0   2   0 434  55   0]
 [  0   0   0  17 515   0]
 [  0   0   0   0   0 537]] 

[[492   2   2   0   0   0]
 [ 17 453   1   0   0   0]
 [  3  11 406   0   0   0]
 [  0   3   0 443  45   0]
 [  0   0   0  29 503   0]
 [  0   0   0   0   0 537]]
(array([0.95719844, 0.98043478, 0.98795181, 0.96230599, 0.90350877,
       1.        ]), array([0.99193548, 0.95753715, 0.97619048, 0.88391039, 0.96804511,
       1.        ]), array([0.97425743, 0.9688507 , 0.98203593, 0.92144374, 0.93466425,
       1.        ]), array([496, 471, 420, 491, 532, 537], dtype=int64))


(array([0.9609375 , 0.96588486, 0.99266504, 0.93855932, 0.91788321,
       1.        ]), array([0.99193548, 0.96178344, 0.96666667, 0.90224033, 0.94548872,
       1.        ]), array([0.97619048, 0.96382979, 0.97949337, 0.92004154, 0.93148148,
       1.        ]), array([496, 471, 420, 

In [13]:
# ベストな予測器の内訳を表示
print(svm_clf_best)
print("\n")
print(sgd_clf_best)

SVC(C=0.8, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=6, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


SGDClassifier(alpha=0.0002, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.16, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)


In [14]:
# ベストなモデルをファイルとして保存
joblib.dump(svm_clf_best, "model_svm.pkl", compress=True)
joblib.dump(sgd_clf_best, "model_sgd.pkl", compress=True)

['model_sgd.pkl']