## DAY2 宿題
- Kickstarter Projectsの分類

◇ DAY_2の範囲でやったこと
- 外れ値除去
- 標準化
- K_Fold法による評価

◆ やってないこと
- カテゴリ変数を説明変数にする
　> これはやったほうがいい
- 正則化
　> K_Fold法の訓練正答率とテスト正答率を比べると
　　そこまで差が出ていなので少なくとも過学習はしていないと思います
- SVM
　> キックスターターが成功するかどうかは確率で予測したいので
　　ロジスティック回帰のままで進めます

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

# DAY_1 4_linear_regression_multi_real_data
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# DAY_1 6_logistic_regression_real_data
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

# DAY2 1_how_to_validation
from sklearn.model_selection import KFold # 交差検証法に関する関数

In [2]:
# https://qiita.com/Sasagawa0185/items/1185933dd0e560a26b07
import datetime

# 与えられた文字列型の時刻をdatetime型へ変換する関数
def str2datetime(t, format):
    return datetime.datetime.strptime(t, format)

# 差を秒に直したものを一日当たりの秒数でわったものを出力します
# 単位：日
def dateDiff(t):
    t1, t2 = t
    delta = t2 - t1
    return delta.total_seconds()/(60*60*24)

In [3]:
def standardize(df):
    return (df-df.mean())/df.std(ddof=True)
def normalize(df):
    return (df - df.min())/(df.max()-df.min())

### 1. データの読み込み

In [4]:
df_house = pd.read_csv("kickstarter-projects/ks-projects-201801.csv", index_col=0)
# 日付を期間に変換し列" dateDiff" に格納
df_house.launched = df_house.launched.apply(str2datetime, format='%Y-%m-%d %H:%M:%S')
df_house.deadline = df_house.deadline.apply(str2datetime, format='%Y-%m-%d')
df_house["dateDiff"] = df_house[["launched", "deadline"]].apply(dateDiff, axis=1)

# 連続量の特徴量（列名）
continuousCols = ["dateDiff", "goal"]

# 離散ラベルの特徴量
quantizedCols = ["category", "main_category", "currency", "country"]

# 固有の列
uniqueCols = ["name"]

# クラウドファウンディング前に利用できないと特徴量
unavailableCols = ["pledged", "backers", "usd pledged", "usd_pledged_real", "usd_goal_real"]

# 目的変数の列を数値に変換し、列 "stateValue" に格納
states = ["successful", "failed", "canceled"]
df_house["stateValue"] = (states[0] == df_house["state"]).astype(float)

In [5]:
requiredCols = continuousCols + ["stateValue"]

### 2. データの前処理

In [6]:
df_dataset = df_house[requiredCols]

# 欠損値を確認
df_dataset.isnull().sum()

dateDiff      0
goal          0
stateValue    0
dtype: int64

In [7]:
# 要約統計量の表示
df_dataset.describe()

Unnamed: 0,dateDiff,goal,stateValue
count,378661.0,378661.0,378661.0
mean,33.9154,49080.79,0.353762
std,65.913304,1183391.0,0.478137
min,0.005058,0.01,0.0
25%,29.097639,2000.0,0.0
50%,29.689259,5200.0,0.0
75%,36.649815,16000.0,1.0
max,16738.958333,100000000.0,1.0


In [8]:
# 期間（dateDiff）が[長すぎる／短すぎる]データがあるので、
# ひとまず期間が[1年以内／一週間以上]のプロジェクトという条件を追加する
df_dataset = df_dataset[df_dataset.dateDiff < 366]
df_dataset = df_dataset[df_dataset.dateDiff >= 7]
df_dataset.describe()

Unnamed: 0,dateDiff,goal,stateValue
count,375346.0,375346.0,375346.0
mean,33.888758,48847.05,0.353186
std,12.574616,1172615.0,0.477961
min,7.001424,0.01,0.0
25%,29.10704,2000.0,0.0
50%,29.707396,5500.0,0.0
75%,37.009225,16250.0,1.0
max,91.96265,100000000.0,1.0


In [9]:
# goalはどの範囲が確からしいか判断できなかったが、
# 100ドル以上, 1,000,000ドル未満という条件でデータセットを構築した
df_dataset = df_dataset[df_dataset.goal <= 1000000]
df_dataset = df_dataset[df_dataset.goal >= 100]
df_dataset.describe()

Unnamed: 0,dateDiff,goal,stateValue
count,370308.0,370308.0,370308.0
mean,33.908929,22112.54038,0.352593
std,12.551322,66070.659117,0.477778
min,7.001424,100.0,0.0
25%,29.108481,2000.0,0.0
50%,29.710926,5500.0,0.0
75%,37.048354,16500.0,1.0
max,91.96265,1000000.0,1.0


In [10]:
# 標準化
df_dataset[requiredCols[:-1]] = df_dataset[requiredCols[:-1]].apply(standardize)
df_dataset.describe()

Unnamed: 0,dateDiff,goal,stateValue
count,370308.0,370308.0,370308.0
mean,-5.271156e-14,1.138033e-15,0.352593
std,1.0,1.0,0.477778
min,-2.143799,-0.3331667,0.0
25%,-0.3824655,-0.3044096,0.0
50%,-0.334467,-0.251436,0.0
75%,0.250127,-0.08494755,1.0
max,4.625307,14.80063,1.0


### 3. データ統計量の確認

In [11]:
# # 【重い】散布図行列を書いてみる
# pd.plotting.scatter_matrix(df_standardized, figsize=(10,10))
# plt.show()

In [12]:
# 相関係数を確認
df_dataset.corr()

Unnamed: 0,dateDiff,goal,stateValue
dateDiff,1.0,0.088725,-0.116963
goal,0.088725,1.0,-0.132707
stateValue,-0.116963,-0.132707,1.0


In [13]:
y = df_dataset["stateValue"].values
X = df_dataset.drop("stateValue", axis=1).values
n_split = 5 # グループ数を設定（今回は5分割）

cross_valid_accuracy = 0
split_num = 1

# テスト役を交代させながら学習と評価を繰り返す
for train_idx, test_idx in KFold(n_splits=n_split, shuffle=True, random_state=1234).split(X, y):
    X_train, y_train = X[train_idx], y[train_idx] #学習用データ
    X_test, y_test = X[test_idx], y[test_idx]     #テスト用データ
    
    # 学習用データを使ってロジスティック回帰モデルを学習
    clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
    clf.fit(X_train, y_train)

    # テストデータに対する予測を実行（比較用に訓練データに対する予測も行う）
    y_est = clf.predict(X_test)
    y_est_train = clf.predict(X_train)
    
    print("Fold %s"%split_num)
    
    # テストデータに対する対数尤度を表示
    print('対数尤度 = {:.3f}'.format(- log_loss(y_test, y_est)))
    # 正答率を表示（カッコ内は訓練時の正答率）
    accuracy = accuracy_score(y_test, y_est)
    train_accuracy = accuracy_score(y_train, y_est_train)
    print('正答率 = {:.3f}% ({:.3f}%)'.format(100 * accuracy, 100 * train_accuracy))    
    # テストデータに対する予測と正解のクロス集計
    conf_mat = pd.DataFrame(confusion_matrix(y_test, y_est), 
                            index=['正解 = False, Canceled', '正解 = Success'], 
                            columns=['予測 = False, Canceled', '予測 = Success'])
    display(conf_mat)
    print()
    
    # 後で平均を取るために加算
    cross_valid_accuracy += accuracy
    split_num += 1

# 正答率の平均値を最終的な汎化正答率とする
final_accuracy = cross_valid_accuracy / n_split
print("Cross Validation Accuracy = %.3f" % (final_accuracy*100))

Fold 1
対数尤度 = -12.091
正答率 = 64.994% (64.961%)


Unnamed: 0,"予測 = False, Canceled",予測 = Success
"正解 = False, Canceled",46582,1403
正解 = Success,24523,1554



Fold 2
対数尤度 = -12.067
正答率 = 65.063% (64.985%)


Unnamed: 0,"予測 = False, Canceled",予測 = Success
"正解 = False, Canceled",46879,1120
正解 = Success,24755,1308



Fold 3
対数尤度 = -12.192
正答率 = 64.700% (64.828%)


Unnamed: 0,"予測 = False, Canceled",予測 = Success
"正解 = False, Canceled",47776,115
正解 = Success,26029,142



Fold 4
対数尤度 = -12.198
正答率 = 64.683% (64.828%)


Unnamed: 0,"予測 = False, Canceled",予測 = Success
"正解 = False, Canceled",47771,80
正解 = Success,26076,134



Fold 5
対数尤度 = -12.098
正答率 = 64.972% (64.904%)


Unnamed: 0,"予測 = False, Canceled",予測 = Success
"正解 = False, Canceled",47514,500
正解 = Success,25442,605



Cross Validation Accuracy = 64.882
