In [422]:
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, metrics
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# Regression metrics: mean squared error and the R^2 score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import numpy as np
np.set_printoptions(threshold=np.inf)
import pandas as pd

In [389]:
def input_data(dr):
    """Load ``wetherInfo.csv`` from directory ``dr`` and keep its last four columns.

    Parameters
    ----------
    dr : str or os.PathLike
        Directory that contains ``wetherInfo.csv``. A trailing path
        separator is optional.

    Returns
    -------
    pandas.DataFrame
        The last four columns of the file (every column when the file has
        fewer than four), with all rows containing a missing value removed.
    """
    # Function-scope import so this cell does not depend on another cell
    # having imported `os`.
    import os

    # os.path.join only inserts a separator when `dr` lacks one, so both
    # './dir/' and './dir' resolve to the same file.
    path = os.path.join(dr, 'wetherInfo.csv')
    frame = pd.read_csv(path)

    # Keep only the trailing four columns. A negative slice start is used
    # instead of `len(columns) - 4`, which becomes negative for frames with
    # fewer than four columns and then selects the wrong slice.
    frame = frame.iloc[:, -4:]

    # Drop rows that contain missing values.
    return frame.dropna()

In [390]:
# Feature extraction and conversion to a NumPy feature matrix
def data_fe(data):
    """Append row-to-row difference features and return a NumPy matrix.

    For each of the ``temp``, ``wind`` and ``humidity`` columns a
    ``<name>_dfr`` column is appended holding the difference between the
    current row and the previous row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``temp``, ``wind`` and ``humidity`` columns. The frame
        is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with the original columns followed by ``temp_dfr``,
        ``wind_dfr`` and ``humidity_dfr``. Rows containing NaN are removed,
        which always includes the first row because it has no predecessor.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    features = data.copy()

    # Difference from the previous row (one hour earlier if rows are hourly
    # -- TODO confirm the sampling interval of the CSV).
    for column in ('temp', 'wind', 'humidity'):
        features[column + '_dfr'] = features[column].diff()

    # The first row has NaN differences; drop it (and any other NaN rows).
    features = features.dropna()

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    return features.to_numpy()

In [391]:
def NN_label_vec(input_data, threshold=5):
    """Build binary "rain in the next row" labels for the classifier.

    Position ``i`` of the result is 1 when ``rain`` in row ``i + 1`` is at
    least ``threshold`` and 0 otherwise. The last position is always 0
    because the final row has no following row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Any object exposing a ``rain`` attribute that supports ``>=``
        comparison (e.g. a DataFrame with a ``rain`` column).
    threshold : number, optional
        Rainfall cutoff, default 5.
        NOTE(review): the previous comment described the cutoff as 10 mm
        while the code compared against 5 -- confirm which is intended.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the same length as ``input_data.rain``.
    """
    is_rainy = np.array(input_data.rain >= threshold, dtype='int')

    # Shift left by one so that position i holds the flag of row i + 1.
    label_vec = np.roll(is_rainy, -1)

    # np.roll wraps the first flag around to the end; overwrite it with 0.
    # The size check avoids an IndexError on empty input.
    if label_vec.size:
        label_vec[-1] = 0

    return label_vec

def NN(x_train, y_train, x_test, y_test):
    """Train an MLP classifier and print its classification report on the test set.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``MLPClassifier.fit``.
    x_test, y_test
        Test features to predict on and the true labels to compare against.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    # Two hidden layers of 100 units each; fixed seed for repeatable runs.
    classifier = MLPClassifier(random_state=1, hidden_layer_sizes=(100, 100))
    classifier.fit(x_train, y_train)

    # Score the trained classifier on the held-out data.
    report = metrics.classification_report(y_test, classifier.predict(x_test))
    print(report)

In [392]:
def LR_label_vec(input_data):
    """Build regression targets: the ``rain`` value of the following row.

    Position ``i`` of the result holds ``rain`` from row ``i + 1``. The last
    position is set to 0 because the final row has no following row.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Any object exposing a ``rain`` attribute convertible to a NumPy
        array (e.g. a DataFrame with a ``rain`` column). It is not modified.

    Returns
    -------
    numpy.ndarray
        1-D array with the same length as ``input_data.rain``.
    """
    rain = np.array(input_data.rain)

    # Shift left by one so that position i holds the value of row i + 1.
    label_vec = np.roll(rain, -1)

    # np.roll wraps the first value around to the end; overwrite it with 0.
    # The size check avoids an IndexError on empty input.
    if label_vec.size:
        label_vec[-1] = 0

    return label_vec

def LR(x_train, y_train, x_test, y_test):
    """Fit a linear regression and print MSE and R^2 for train and test data.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used to fit the model.
    x_test, y_test
        Test features and targets used for evaluation.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # The `normalize` argument was removed from LinearRegression in
    # scikit-learn 1.2 and now raises TypeError, so it is no longer passed.
    # For ordinary least squares with a full-rank design the rescaling should
    # not affect predictions -- NOTE(review): confirm if scores must match
    # an older run exactly.
    model = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=1)

    # Estimate the parameters from the training data.
    model.fit(x_train, y_train)

    # Predict with the fitted model.
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Mean squared error on both sets (smaller is better).
    print('MSE Train : %.3f, Test : %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred),
    ))
    # R^2 on both sets (closer to 1 is better).
    print('R^2 Train : %.3f, Test : %.3f' % (
        model.score(x_train, y_train),
        model.score(x_test, y_test),
    ))


In [419]:
def MLPR(x_train, y_train, x_test, y_test):
    """Train an MLP regressor and print its score on the test set.

    Parameters
    ----------
    x_train, y_train
        Training features and targets passed to ``MLPRegressor.fit``.
    x_test, y_test
        Test features and targets passed to ``MLPRegressor.score``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    # Five hidden layers; fixed seed for repeatable runs.
    layer_sizes = (2000, 1000, 500, 1000, 2000)
    regressor = MLPRegressor(random_state=42, hidden_layer_sizes=layer_sizes)
    regressor.fit(x_train, y_train)

    # Value returned by the estimator's own score() on the held-out data.
    test_score = regressor.score(x_test, y_test)
    print(test_score)
    

In [423]:
def RFR(x_train, y_train, x_test, y_test, random_state=None):
    """Fit a random-forest regressor and print MSE and R^2 for train and test data.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used to fit the forest.
    x_test, y_test
        Test features and targets used for evaluation.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestRegressor``. The default ``None``
        keeps the previous unseeded behaviour; pass an int for repeatable
        results.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # All other hyper-parameters are left at the library defaults.
    forest = RandomForestRegressor(random_state=random_state)
    forest.fit(x_train, y_train)

    # Predictions on both sets.
    y_train_pred = forest.predict(x_train)
    y_test_pred = forest.predict(x_test)

    # Mean squared error (smaller is better).
    print('MSE train : %.3f, test : %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred),
    ))
    # R^2 (closer to 1 is better). This line was previously mislabelled
    # 'MSE' although it prints r2_score values.
    print('R^2 train : %.3f, test : %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    ))


In [425]:
if __name__ == '__main__':

    # *_train -> data used for fitting
    # *_test  -> data used for evaluation

    # Directories that contain the CSV files. To take them from the command
    # line instead, import sys and use:
    #args = sys.argv
    #dr_train = args[1]
    #dr_test = args[2]
    dr_train = './kochi_train2015/'
    dr_test = './kochi_test2016/'

    # Load the CSVs and drop rows with missing values.
    input_train = input_data(dr_train)
    input_test = input_data(dr_test)

    # Feature extraction and conversion to NumPy matrices.
    x_train = data_fe(input_train)
    x_test = data_fe(input_test)

    # Labels.
    # The previous code read `data_train` / `data_test`, which are never
    # defined in this notebook (NameError on a fresh kernel). Labels are now
    # derived from the loaded frames. data_fe() drops the first row (it has
    # no previous row to diff against), so the first label is dropped too in
    # order to keep features and labels aligned.
    # NOTE(review): confirm this matches the data the recorded scores used.

    # Classification labels (use together with NN below):
    #y_train = NN_label_vec(input_train)[1:]
    #y_test = NN_label_vec(input_test)[1:]

    # Regression labels (linear regression, MLP regression, random forest):
    y_train = LR_label_vec(input_train)[1:]
    y_test = LR_label_vec(input_test)[1:]

    assert len(x_train) == len(y_train), 'train features/labels misaligned'
    assert len(x_test) == len(y_test), 'test features/labels misaligned'

    # Training and evaluation -- enable the model to run.

    # Neural-network classifier
    #NN(x_train, y_train, x_test, y_test)

    # Linear regression
    #LR(x_train, y_train, x_test, y_test)

    # Neural-network regression
    #MLPR(x_train, y_train, x_test, y_test)

    # Random-forest regression
    RFR(x_train, y_train, x_test, y_test)


MSE train : 0.448, test : 1.751
MSE train : 0.864, test : 0.307
