## 必要なライブラリを読み込む

In [32]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

## データ読み込み

In [5]:
! wc -l test.csv train.csv

 4577465 test.csv
 40428968 train.csv
 45006433 total


## データ用が多いので、絞って読み込み

In [33]:
train_fname = 'train.csv'
reader = pd.read_csv(train_fname, chunksize=1000)

In [34]:
df_train = reader.get_chunk(1000)

In [35]:
df_train.shape

(1000, 24)

In [36]:
df_train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [37]:
df_train.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

## データの欠損値、型、columnsを確認

In [38]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
id                  1000 non-null uint64
click               1000 non-null int64
hour                1000 non-null int64
C1                  1000 non-null int64
banner_pos          1000 non-null int64
site_id             1000 non-null object
site_domain         1000 non-null object
site_category       1000 non-null object
app_id              1000 non-null object
app_domain          1000 non-null object
app_category        1000 non-null object
device_id           1000 non-null object
device_ip           1000 non-null object
device_model        1000 non-null object
device_type         1000 non-null int64
device_conn_type    1000 non-null int64
C14                 1000 non-null int64
C15                 1000 non-null int64
C16                 1000 non-null int64
C17                 1000 non-null int64
C18                 1000 non-null int64
C19                 1000 non-null int64
C20 

## Numericalデータの統計情報

In [39]:
df_train.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,9.402135e+18,0.16,14102100.0,1005.019,0.185,1.047,0.178,17716.24,318.368,56.488,1968.148,0.789,124.672,37637.883,88.556
std,2.390389e+18,0.366789,0.0,1.033299,0.388492,0.55054,0.605543,3115.731031,9.998227,35.787632,380.270482,1.227186,232.189622,48512.364147,46.087253
min,1.004777e+16,0.0,14102100.0,1001.0,0.0,0.0,0.0,377.0,216.0,36.0,112.0,0.0,35.0,-1.0,13.0
25%,1.002866e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,15705.0,320.0,50.0,1722.0,0.0,35.0,-1.0,61.0
50%,1.006611e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,17653.0,320.0,50.0,1974.0,0.0,35.0,-1.0,79.0
75%,1.010323e+19,0.0,14102100.0,1005.0,0.0,1.0,0.0,20362.0,320.0,50.0,2277.0,2.0,39.0,100084.0,156.0
max,1.014356e+19,1.0,14102100.0,1010.0,1.0,5.0,3.0,21692.0,320.0,250.0,2497.0,3.0,1835.0,100233.0,157.0


## Categoricalデータの統計情報

In [40]:
df_train.describe(include=['O']) 

Unnamed: 0,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model
count,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,122,109,10,75,15,7,127,910,341
top,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,0489ce3f,8a4875bd
freq,341,341,373,790,826,796,870,6,73


## hour列から　月、日、曜日、時を抜き出すために関数を用意

In [41]:
from datetime import datetime as dt
# hour列から　月、日、曜日、時を抜き出すために関数を用意

def get_month_from_time(x):
    time_date = dt.strptime(str(x), "%y%m%d%H")
    return time_date.month
def get_day_from_time(x):
    time_date = dt.strptime(str(x), "%y%m%d%H")
    return time_date.day
def get_weekday_from_time(x):
    time_date = dt.strptime(str(x), "%y%m%d%H")
    return time_date.weekday()
def get_hour_from_time(x):
    time_date = dt.strptime(str(x), "%y%m%d%H")
    return time_date.hour


In [42]:
def preprocess(df_train):
    
    # 上記で作成した関数をapplyで一括適用
    df_train["hour_month"] = df_train["hour"].apply(get_month_from_time)
    df_train["hour_day"] = df_train["hour"].apply(get_day_from_time)
    df_train["hour_weekday"] = df_train["hour"].apply(get_weekday_from_time)
    df_train["hour_hour"] = df_train["hour"].apply(get_hour_from_time)
   
    # カテゴリ変数からダミー変数の作成
    X = df_train.iloc[:, 0:28]
    colnames_categorical = ['site_id', 'site_domain','site_category', 'app_id', 'app_domain', 'app_category', 'device_id','device_ip', 'device_model']
    X_dummy = pd.get_dummies(X[colnames_categorical])
    X = pd.merge(X, X_dummy, left_index=True, right_index=True)
    X = X.drop(colnames_categorical, axis=1)
    
    
    return X

In [43]:
X = preprocess(df_train)

In [44]:
X

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,...,device_model_fa61e8fe,device_model_fbc27727,device_model_fbec56bc,device_model_fce66524,device_model_fd621b1f,device_model_feb70d53,device_model_ff065cf0,device_model_ff2a3543,device_model_ff717dd1,device_model_ffe69079
0,1000009418151094273,0,14102100,1005,0,1,2,15706,320,50,...,0,0,0,0,0,0,0,0,0,0
1,10000169349117863715,0,14102100,1005,0,1,0,15704,320,50,...,0,0,0,0,0,0,0,0,0,0
2,10000371904215119486,0,14102100,1005,0,1,0,15704,320,50,...,0,0,0,0,0,0,0,0,0,0
3,10000640724480838376,0,14102100,1005,0,1,0,15706,320,50,...,0,0,0,0,0,0,0,0,0,0
4,10000679056417042096,0,14102100,1005,1,1,0,18993,320,50,...,0,0,0,0,0,0,0,0,0,0
5,10000720757801103869,0,14102100,1005,0,1,0,16920,320,50,...,0,0,0,0,0,0,0,0,0,0
6,10000724729988544911,0,14102100,1005,0,1,0,20362,320,50,...,0,0,0,0,0,0,0,0,0,0
7,10000918755742328737,0,14102100,1005,1,1,0,20632,320,50,...,0,0,0,0,0,0,0,0,0,0
8,10000949271186029916,1,14102100,1005,0,1,2,15707,320,50,...,0,0,0,0,0,0,0,0,0,0
9,10001264480619467364,0,14102100,1002,0,0,0,21689,320,50,...,0,0,0,0,0,0,0,0,0,0


## ターゲット変数と特徴量を指定してsklearnに渡せるように準備する

In [45]:
target_col = 'click'
exclude_cols = ['click', 'id', 'hour']
feature_cols = [col for col in X.columns if col not in exclude_cols]

In [46]:
feature_cols

['C1',
 'banner_pos',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'hour_month',
 'hour_day',
 'hour_weekday',
 'hour_hour',
 'site_id_02d5151c',
 'site_id_030440fe',
 'site_id_04773863',
 'site_id_070ca277',
 'site_id_09c3b4c1',
 'site_id_0a742914',
 'site_id_0aab7161',
 'site_id_0c2fe9d6',
 'site_id_0d0207b0',
 'site_id_0eb72673',
 'site_id_11944c42',
 'site_id_12fb4121',
 'site_id_17caea14',
 'site_id_18859862',
 'site_id_1b171d6d',
 'site_id_1bf0f8d0',
 'site_id_1e25e084',
 'site_id_1fbe01fe',
 'site_id_22d080df',
 'site_id_2328ee8e',
 'site_id_26fa1946',
 'site_id_29229f8e',
 'site_id_29cfa778',
 'site_id_2c4ed2f7',
 'site_id_2d7bacb1',
 'site_id_34040f5c',
 'site_id_34d1d55f',
 'site_id_38217daf',
 'site_id_395fa97c',
 'site_id_39cffaa4',
 'site_id_3d88bdd0',
 'site_id_3e183f99',
 'site_id_43d6df75',
 'site_id_44f60771',
 'site_id_4b0f0061',
 'site_id_4bf5bbe2',
 'site_id_4dd0a958',
 'site_id_5114c672',
 'site_id_518d6168',

## 説明変数（特徴量）

In [47]:
X[feature_cols].head()

Unnamed: 0,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,...,device_model_fa61e8fe,device_model_fbc27727,device_model_fbec56bc,device_model_fce66524,device_model_fd621b1f,device_model_feb70d53,device_model_ff065cf0,device_model_ff2a3543,device_model_ff717dd1,device_model_ffe69079
0,1005,0,1,2,15706,320,50,1722,0,35,...,0,0,0,0,0,0,0,0,0,0
1,1005,0,1,0,15704,320,50,1722,0,35,...,0,0,0,0,0,0,0,0,0,0
2,1005,0,1,0,15704,320,50,1722,0,35,...,0,0,0,0,0,0,0,0,0,0
3,1005,0,1,0,15706,320,50,1722,0,35,...,0,0,0,0,0,0,0,0,0,0
4,1005,1,1,0,18993,320,50,2161,0,35,...,0,0,0,0,0,0,0,0,0,0


## 被説明変数（ターゲット変数）

In [48]:
X[target_col].head()

0    0
1    0
2    0
3    0
4    0
Name: click, dtype: int64

In [49]:
y = np.array(X[target_col])
X = np.array(X[feature_cols])

In [50]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [51]:
X

array([[1005,    0,    1, ...,    0,    0,    0],
       [1005,    0,    1, ...,    0,    0,    0],
       [1005,    0,    1, ...,    0,    0,    0],
       ...,
       [1005,    0,    1, ...,    0,    0,    0],
       [1005,    0,    1, ...,    0,    0,    0],
       [1005,    0,    1, ...,    0,    0,    0]])

# モデルの作成

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [53]:
X_train, X_test, y_train, y_test = \
 train_test_split(X, y, test_size=0.3, random_state=1234)

In [54]:
X_train1, X_train2, y_train1, y_train2 = \
 train_test_split(X_train, y_train, test_size=0.3, random_state=1234)

## ランダムフォレスト

In [55]:
rf = RandomForestRegressor(random_state=1234)
rf.fit(X_train1, y_train1)
y_pred2 = rf.predict(X_train2)
rf_mse = mean_squared_error(y_train2, y_pred2)
print('Random Forest RMSE: ', np.sqrt(rf_mse))

Random Forest RMSE:  0.3592982578516602


In [56]:
print(accuracy_score(y_pred2.round(), y_train2,normalize=False))

181


## 変数増加法で変数選択をしてみる

In [57]:
def get_gfs_feature_indices(X, y, features, clf):
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X, y, test_size=0.3, random_state=1234)
    
    #どの特徴量が何列目にあるかを辞書型で保持
    feature_indices = {feature: idx for idx, feature in enumerate(features)}
   
    # 特徴量をユニークにする
    features = set(features)
    
    # 評価（MSE）の初期化
    last_mse = np.inf
    
    #選ばれた特徴量を保存するための空集合を用意
    chosen_features = set()

    while len(chosen_features) < len(features):
        mse_features = []
        
        # 集合は引き算すると差集合
        for feature in (features - chosen_features):
            candidates = chosen_features.union(set([feature]))
            indices = [feature_indices[feature] for feature in candidates]
            clf.fit(X_train_[:, indices], y_train_)
            y_pred = clf.predict(X_test_[:, indices])
            mse = mean_squared_error(y_test_, y_pred)
            mse_features += [(mse, feature)]
        mse, feature = min(mse_features, key= lambda x:x[0])
        
        if mse >= last_mse:
            break
        last_mse = mse
        print('Newly Added Feature: {},\tRMSE Score: {}'.format(feature, np.sqrt(mse)))
        chosen_features.add(feature)
    return [feature_indices[feature] for feature in chosen_features]

In [58]:
selected_feature_index = \
get_gfs_feature_indices(X=X_train, 
                        y=y_train, 
                        features=feature_cols, 
                        clf=RandomForestRegressor(random_state=1234))

Newly Added Feature: C21,	RMSE Score: 0.32586335381661247
Newly Added Feature: banner_pos,	RMSE Score: 0.3220932030332173
Newly Added Feature: device_model_be6db1d7,	RMSE Score: 0.31670396599079836
Newly Added Feature: site_id_43d6df75,	RMSE Score: 0.31415307034088463
Newly Added Feature: device_model_a5bce124,	RMSE Score: 0.3116480834997257
Newly Added Feature: device_model_2203a096,	RMSE Score: 0.3102554452608933
Newly Added Feature: device_model_3e7932d7,	RMSE Score: 0.30810638549212027
Newly Added Feature: device_model_711ee120,	RMSE Score: 0.3071100407061577
Newly Added Feature: device_ip_c6563308,	RMSE Score: 0.3064090644377387
Newly Added Feature: site_id_5b08c53b,	RMSE Score: 0.3057677505337289
Newly Added Feature: device_model_7594f139,	RMSE Score: 0.30354051573263235
Newly Added Feature: device_id_c357dbff,	RMSE Score: 0.3027983010908156
Newly Added Feature: device_ip_53650af8,	RMSE Score: 0.30197521170159136
Newly Added Feature: device_id_a99f214a,	RMSE Score: 0.301223805342

## 変数選択後の精度を確認

In [59]:
rf = RandomForestRegressor(random_state=1234)
rf.fit(X_train1[:, selected_feature_index], y_train1)
y_pred2 = rf.predict(X_train2[:, selected_feature_index])
rf_mse = mean_squared_error(y_train2, y_pred2)
print('RandomForest RMSE: ', np.sqrt(rf_mse))

RandomForest RMSE:  0.2870825347011269


In [60]:
print(accuracy_score(y_pred2.round(), y_train2,normalize=False))

188


In [61]:
rf = RandomForestRegressor(random_state=1234)

In [62]:
params = {'n_estimators': [10, 50, 100], 'max_depth': [5, 10, 50]}

In [63]:
import sklearn 
sklearn.__version__

'0.19.1'

In [64]:
#注意: scikit-learnのバージョンが、0.18の場合は、scoring='neg_mean_squared_error'とします
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                     cv=3, scoring='neg_mean_squared_error')

In [65]:
gscv.fit(X_train1[:, selected_feature_index], y_train1)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    1.8s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1234, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100], 'max_depth': [5, 10, 50]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring='neg_mean_squared_error', verbose=1)

In [66]:
gscv.best_params_

{'max_depth': 5, 'n_estimators': 100}

## パラメーターチューニング後のスコア

In [67]:
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=1234)
rf.fit(X_train1[:, selected_feature_index ], y_train1)
y_pred2 = rf.predict(X_train2[:, selected_feature_index])
rf_mse = mean_squared_error(y_train2, y_pred2)
print('RandomForest RMSE: ', np.sqrt(rf_mse))

RandomForest RMSE:  0.3179235999124238


In [68]:
print(accuracy_score(y_pred2.round(), y_train2,normalize=False))

183


## テストデータへ適用して精度を確認する

In [69]:
rf = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=1234)
rf.fit(X_train[:, selected_feature_index], y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=1234, verbose=0, warm_start=False)

In [70]:
y_pred_on_test = rf.predict(X_test[:, selected_feature_index])
rf_mse = mean_squared_error(y_test, y_pred_on_test)
print('RandomForest RMSE: ', np.sqrt(rf_mse))

RandomForest RMSE:  0.381920937525873


In [71]:
print(accuracy_score(y_pred2.round(), y_train2,normalize=False))

183


## submission用csvの作成

In [72]:
test_fname = 'test.csv'
reader_test = pd.read_csv(test_fname, chunksize=1000)
df_test_origin = reader_test.get_chunk(1000)


In [73]:
df_test = preprocess(df_test_origin)

In [115]:
selected_feature_index2 = ['id', 
'C21',
'banner_pos',
'device_model_be6db1d7',
'site_id_43d6df75',
'device_model_a5bce124',
'device_model_2203a096',
'device_model_3e7932d7',
'device_model_711ee120',
'device_ip_c6563308',
'site_id_5b08c53b',
'device_model_7594f139',
'device_id_c357dbff',
'device_ip_53650af8',
'device_id_a99f214a',
'site_category_50e219e0',
'app_id_1779deee',
'device_ip_76e48bb9',
'device_ip_65a3b9c7',
'device_ip_4ba68400',
'device_ip_1c0467d6',
'site_id_6399eda6',
'device_model_32fa85fb',
'device_ip_aa914ed8',
'device_id_175e2558',
'site_id_89a490f5',
'device_model_779d90c2',
'device_ip_0b416039',
'site_id_85f751fd',
'site_id_b7e9786d',
'device_ip_3faee6d1',
'device_ip_ca630095'
]

## カテゴリ変数を使うと、trainに入っているけど、testに入っていないものができてしまいモデルでキーエラーとなる。
#### よく考えたらそうですね。

In [117]:
submit_data =  pd.Series(rf.predict(df_test[selected_feature_index2]), name='click', index=df_test_origin['id'])
submit_data.to_csv('submit.csv', header=True)

KeyError: "['site_id_43d6df75' 'device_model_3e7932d7' 'device_ip_c6563308'\n 'device_id_c357dbff' 'device_ip_53650af8' 'app_id_1779deee'\n 'device_ip_76e48bb9' 'device_ip_65a3b9c7' 'device_ip_4ba68400'\n 'device_ip_1c0467d6' 'device_model_32fa85fb' 'device_ip_aa914ed8'\n 'device_id_175e2558' 'site_id_89a490f5' 'device_ip_0b416039'\n 'device_ip_3faee6d1' 'device_ip_ca630095'] not in index"