In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
data = pd.read_csv("../../gci_project_storage/mynavi_data.csv", encoding="SHIFT-JIS")

In [22]:
data.shape

(46469, 110)

In [23]:
#文字列以外の変数でインスタントに分析を抽出
non_str_cols = data.columns[data.dtypes!="object"]
data_instant = data[non_str_cols]

#欠損値の回避
data_instant = data_instant[data_instant.columns[data_instant.isnull().sum()<10000]]
data_instant = data_instant.dropna()
data_instant.shape

(31700, 80)

In [24]:
data_instant.columns

Index(['rent', 'area', 'deposit', 'key_money', 'flooring', 'locality',
       'buildings_height', 'parking', 'status', 'structure',
       'transaction_type', 'air_conditioner', 'auto_lock', 'bath_toilet',
       'reheating', 'wash_basin', 'washing_machine', 'top_floor',
       'corner_room', 'immediate', 'upper', 'shower', 'bath_drier', 'washlet',
       'toilet_style', 'bathhouse', 'water_heater', 'system_kitchen',
       'counter_kitchen', 'independent_kitchen', 'L_kitchen', 'refrigerator',
       'gas_stove', 'IH_stove', 'electric_stove', 'stove_num', 'elderly_0',
       'elderly_1', 'elderly_2', 'company_0', 'company_1', 'company_2',
       'single_0', 'single_1', 'single_2', 'student_0', 'student_1',
       'student_2', 'direction_1', 'direction_2', 'direction_3', 'direction_4',
       'direction_5', 'direction_6', 'direction_7', 'direction_8',
       'structure_1', 'structure_2', 'structure_3', 'structure_4',
       'structure_5', 'structure_6', 'structure_7', 'structure_8',
   

<h3>回帰とランダムフォレストで交差検証でスコア(決定係数)を計算してみる<br>

\begin{eqnarray}
R^2 = 
1-\frac{RSS}{TSS} 
\end{eqnarray}

In [25]:
#インスタントな回帰/ランダムフォレスト分析
X = data_instant[data_instant.columns[data_instant.columns!="rent"]]
y = data_instant.rent

# 交差検証に向けてインデックスをランダムに置換
print("Before permutation:\n", X.head(4).iloc[:,:5] ,"\n")
X = X.reindex(np.random.permutation(X.index))
print("After permutation:\n", X.head(4).iloc[:,:5])

Before permutation:
      area  deposit  key_money  flooring  locality
0   19.87    77000      77000         0       2.0
5   25.19    90000      90000         1       4.0
9   27.54    94000      94000         1       4.0
10  27.54    94000      94000         1       4.0 

After permutation:
         area  deposit  key_money  flooring  locality
32995  25.81        0      88500         1       5.0
23524  47.77   167000     334000         1       4.0
20819  20.00    69000      69000         1       1.0
44581  32.90        0      75000         1       1.0


In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

In [27]:
#交差検証に向けて、model の List を作る
models = [
    LinearRegression(),
    RandomForestRegressor() 
]
#スコアの計算
for model in models:
    scores = cross_val_score(model, X, y,cv=4)
    print(model.__class__.__name__, "   \tMean_score:\t", scores )

LinearRegression    	Mean_score:	 [ -9.50954460e-02  -2.47089623e-03  -1.55675045e-02  -2.24044512e+02]
RandomForestRegressor    	Mean_score:	 [-0.28407181 -0.38385221 -0.09475329 -3.1607974 ]


In [28]:
model = models[1]
model.fit(X.iloc[:30000,],y.iloc[:30000,])
model.score(X.iloc[30000:,],y.iloc[30000:,])

-2.4209151477275719

In [29]:
model = models[1]
model.fit(X,y)
model.score(X,y)

0.72621047643052028

In [30]:
X.iloc[:20000,].head()

Unnamed: 0,area,deposit,key_money,flooring,locality,buildings_height,parking,status,structure,transaction_type,...,status_4,transaction_type_1,transaction_type_2,transaction_type_3,transaction_type_4,transaction_type_5,transaction_type_6,bath_style_0,bath_style_1,bath_style_2
32995,25.81,0,88500,1,5.0,10.0,0,4.0,4,1,...,1,1,0,0,0,0,0,0,0,1
23524,47.77,167000,334000,1,4.0,5.0,1,2.0,4,6,...,0,0,0,0,0,0,1,0,0,1
20819,20.0,69000,69000,1,1.0,3.0,0,2.0,10,6,...,0,0,0,0,0,0,1,0,0,1
44581,32.9,0,75000,1,1.0,2.0,1,4.0,10,6,...,1,0,0,0,0,0,1,0,0,0
22592,18.2,67000,0,1,2.0,4.0,0,2.0,4,3,...,0,0,0,1,0,0,0,0,0,1
