In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("../../gci_project_storage/mynavi_data.csv", encoding="SHIFT-JIS")

In [3]:
data.shape

(46469, 110)

In [4]:
#文字列以外の変数でインスタントに分析を抽出
non_str_cols = data.columns[data.dtypes!="object"]
data_instant = data[non_str_cols]

#欠損値の回避
data_instant = data_instant[data_instant.columns[data_instant.isnull().sum()<10000]]
data_instant = data_instant.dropna()

<h3>回帰とランダムフォレストで交差検証でスコア(決定係数)を計算してみる<br>

\begin{eqnarray}
R^2 = 
1-\frac{RSS}{TSS} 
\end{eqnarray}

In [5]:
#インスタントな回帰/ランダムフォレスト分析
X = data_instant[data_instant.columns[data_instant.columns!="rent"]]
y = data_instant.rent

# 交差検証に向けてインデックスをランダムに置換
print("Before permutation:\n", X.head(4).iloc[:,:5] ,"\n")
X = X.reindex(np.random.permutation(X.index))
print("After permutation:\n", X.head(4).iloc[:,:5])

Before permutation:
      area  deposit  key_money  flooring  locality
0   19.87    77000      77000         0       2.0
5   25.19    90000      90000         1       4.0
9   27.54    94000      94000         1       4.0
10  27.54    94000      94000         1       4.0 

After permutation:
         area  deposit  key_money  flooring  locality
5167   12.49    55000      55000         1       2.0
10388  25.82    90000          0         1       1.0
15749  41.40   156000     156000         1       3.0
1339   26.03    98000      98000         1       5.0


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

In [7]:
#交差検証に向けて、model の List を作る
models = [
    LinearRegression(),
    RandomForestRegressor() 
]
#スコアの計算
for model in models:
    scores = cross_val_score(model, X, y,cv=4)
    print(model.__class__.__name__, "   \tMean_score:\t", scores )

LinearRegression    	Mean_score:	 [-0.09320851 -0.0104994  -0.01531917 -2.81896051]
RandomForestRegressor    	Mean_score:	 [-0.27223325 -0.45587681 -0.09179329 -3.01992539]


In [8]:
model = models[1]
model.fit(X.iloc[:20000,],y.iloc[:20000,])
model.score(X.iloc[20000:,],y.iloc[20000:,])

-3.1064117361318933

In [9]:
model = models[1]
model.fit(X,y)
model.score(X,y)

0.71957790090182572

In [10]:
X.iloc[:20000,].head()

Unnamed: 0,area,deposit,key_money,flooring,locality,buildings_height,parking,status,structure,transaction_type,...,status_4,transaction_type_1,transaction_type_2,transaction_type_3,transaction_type_4,transaction_type_5,transaction_type_6,bath_style_0,bath_style_1,bath_style_2
5167,12.49,55000,55000,1,2.0,2.0,0,2.0,10,6,...,0,0,0,0,0,0,1,0,0,0
10388,25.82,90000,0,1,1.0,3.0,0,2.0,5,6,...,0,0,0,0,0,0,1,0,0,1
15749,41.4,156000,156000,1,3.0,4.0,0,2.0,4,1,...,0,1,0,0,0,0,0,0,0,1
1339,26.03,98000,98000,1,5.0,12.0,0,1.0,4,6,...,0,0,0,0,0,0,1,0,0,1
28311,18.62,57000,57000,0,1.0,3.0,0,2.0,1,1,...,0,1,0,0,0,0,0,0,0,1
