In [60]:
# Import pandas and load the data set.
import pandas as pd

# Read the CSV and discard every row that contains a missing value.
df_forAgePred = pd.read_csv('forAgePred.csv').dropna()


In [61]:
# Estimate age per honorific title (Mr., Mrs., etc.).

# Express Age as an integer number of hundredths of a year (Age * 100).
# The product is rounded before the integer cast: plain int() truncates, so a
# value that lands just below a whole number because of binary floating point
# (e.g. 0.57 * 100 == 56.99999999999999) would otherwise lose a unit.
# NOTE: this cell rebinds df_forAgePred with the scaled column, so running it
# a second time scales Age again; reload the data before re-running.
x = (df_forAgePred.Age * 100).round().astype('int64')
x.name = 'Age'
# Drop the float column and append the integer one (Age ends up as the last column).
df_forAgePred = pd.concat((df_forAgePred.drop('Age', axis=1), x), axis=1)


# Feature columns (X) used for the estimate: the numerically encoded title column.
l_pred = ['TitlesNum']

In [62]:
#試すモデルを指定

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Candidate estimators, each paired with the label used when reporting results.
l_models = [
    ("LogisticRegression", LogisticRegression()),
    ("SVC", SVC()),
    ("LinearSVC", LinearSVC()),
    ("KNeighbors", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("MLPClassifier", MLPClassifier(solver='lbfgs', random_state=0)),
]

# Report labels, in the same order as the estimators above.
names = [label for label, _ in l_models]


In [63]:
#機械学習モデル

def evaluate_models(df_forAgePred, l_pred, l_models, test_size=0.25, random_state=None):
    """Fit every candidate model on one train/test split and score it.

    Parameters
    ----------
    df_forAgePred : pandas.DataFrame
        Data set. The columns listed in ``l_pred`` are used as features and
        the ``Age`` column as the target.
    l_pred : list of str
        Names of the feature columns.
    l_models : iterable of (str, estimator)
        ``(name, model)`` pairs. Each model must provide ``fit`` and
        ``predict``; the passed objects themselves are fitted (no copy is made).
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        was previously hard-coded.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour of not seeding the split; pass a fixed value to
        make the split repeatable.

    Returns
    -------
    (list of str, list of float)
        Model names and the mean squared error of each model on the test
        part of the split, in the order of ``l_models``.
    """
    # One split is shared by all models so their scores are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        df_forAgePred[l_pred],
        df_forAgePred.Age,
        test_size=test_size,
        random_state=random_state,
    )

    names = []
    results = []
    for name, model in l_models:
        model.fit(X_train, y_train)
        res_pred = model.predict(X_test)
        names.append(name)
        results.append(mean_squared_error(y_test, res_pred))

    return names, results

In [64]:
# Run the age-estimation test with each machine-learning model.

import statistics

result_list = []

# Ten evaluation rounds; each round adds one [model name, mean squared error]
# entry per model to result_list.
for i in range(10):
    print('test {0}'.format(i))
    name, res = evaluate_models(df_forAgePred, l_pred, l_models)
    result_list.extend([model_name, mse] for model_name, mse in zip(name, res))

print('done')

test 0
test 1
test 2
test 3
test 4
test 5
test 6
test 7
test 8
test 9
done


In [65]:
# Show the results.

print('mean squared error results by model')
print()

# Collect the scores of every round under their model name.
scores_by_model = {}
for entry in result_list:
    scores_by_model.setdefault(entry[0], []).append(entry[1])

# Report the average and the median score per model, in the order of `names`.
for n in names:
    print(n)
    r = scores_by_model.get(n, [])
    print('avg: {0:,.2f} / median: {1:,.2f}'.format(sum(r)/len(r), statistics.median(r)))

mean squared error results by model

LogisticRegression
avg: 6,703.49 / median: 6,831.40
SVC
avg: 6,703.49 / median: 6,831.40
LinearSVC
avg: 6,703.49 / median: 6,831.40
KNeighbors
avg: 7,255.81 / median: 7,063.95
DecisionTree
avg: 6,703.49 / median: 6,831.40
RandomForest
avg: 6,703.49 / median: 6,831.40
MLPClassifier
avg: 6,703.49 / median: 6,831.40


In [66]:
#ランダムフォレストのGrid Search Cross Validation

def applyGSCV(model, param, X, Y, cv=3):
    """Run a grid search with cross-validation and return the fitted search.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param : dict
        Parameter grid handed to ``GridSearchCV``.
    X, Y
        Features and target passed to ``GridSearchCV.fit``.
    cv : optional
        Cross-validation setting forwarded to ``GridSearchCV``. Defaults to 3,
        the value that was previously hard-coded.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on it.
    """
    # Imported inside the function so the function does not rely on another cell.
    from sklearn.model_selection import GridSearchCV

    res = GridSearchCV(model, param, cv=cv)
    res.fit(X, Y)

    return res

In [67]:
# Train the selected model.

from sklearn.ensemble import RandomForestClassifier

# Feature columns and target column taken from the prepared data set.
df_feature = df_forAgePred[l_pred]
df_answer = df_forAgePred["Age"]

# Hyper-parameter grid explored by the grid search.
obj_param = dict(
    n_estimators=[5, 10, 20, 30, 50, 100, 300],
    max_depth=[3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
    random_state=[0],
)

# Estimator to tune, then the grid search over the whole data set.
model_selected = RandomForestClassifier()
model_AgePred = applyGSCV(model_selected, obj_param, df_feature, df_answer)

print('done')

done


In [68]:
# Save the trained model.

import pickle
filename = 'age_pred.sav'
# Write through a context manager so the file handle is closed (and its
# contents flushed) as soon as the dump finishes, instead of leaving an
# unclosed handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(model_AgePred, model_file)