In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv("../../gci_project_storage/mynavi_data.csv", encoding="SHIFT-JIS")

In [4]:
data.shape

(46469, 110)

In [5]:
#文字列以外の変数でインスタントに分析を抽出
non_str_cols = data.columns[data.dtypes!="object"]
data_instant = data[non_str_cols]

#欠損値の回避
data_instant = data_instant[data_instant.columns[data_instant.isnull().sum()<10000]]
data_instant = data_instant.dropna()

<h3>回帰とランダムフォレストで交差検証でスコア(決定係数)を計算してみる<br>

\begin{eqnarray}
R^2 = 
1-\frac{RSS}{TSS} 
\end{eqnarray}

In [6]:
#インスタントな回帰/ランダムフォレスト分析
X = data_instant[data_instant.columns[data_instant.columns!="rent"]]
y = data_instant.rent

# 交差検証に向けてインデックスをランダムに置換
print("Before permutation:\n", X.head(4).iloc[:,:5] ,"\n")
X = X.reindex(np.random.permutation(X.index))
print("After permutation:\n", X.head(4).iloc[:,:5])

Before permutation:
      area  deposit  key_money  flooring  locality
0   19.87    77000      77000         0       2.0
5   25.19    90000      90000         1       4.0
9   27.54    94000      94000         1       4.0
10  27.54    94000      94000         1       4.0 

After permutation:
         area  deposit  key_money  flooring  locality
11037  33.69   130000     130000         1       7.0
34528  20.14    57000          0         1       2.0
22224  41.83   118000          0         1       1.0
45089  39.60    77000          0         0       2.0


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

In [8]:
#交差検証に向けて、model の List を作る
models = [
    LinearRegression(),
    RandomForestRegressor() 
]
#スコアの計算
for model in models:
    scores = cross_val_score(model, X, y,cv=4)
    print(model.__class__.__name__, "   \tMean_score:\t", scores )

LinearRegression    	Mean_score:	 [ -9.33392611e-02  -3.08138206e-03  -2.13861701e+02  -1.21018356e+00]
RandomForestRegressor    	Mean_score:	 [-0.29556252 -0.3329533  -0.08977756 -3.09308819]


In [9]:
model = models[1]
model.fit(X.iloc[:20000,],y.iloc[:20000,])
model.score(X.iloc[20000:,],y.iloc[20000:,])

-3.2890495651357652

In [10]:
model = models[1]
model.fit(X,y)
model.score(X,y)

0.70843505059009404

In [11]:
X.iloc[:20000,].head()

Unnamed: 0,area,deposit,key_money,flooring,locality,buildings_height,parking,status,structure,transaction_type,...,status_4,transaction_type_1,transaction_type_2,transaction_type_3,transaction_type_4,transaction_type_5,transaction_type_6,bath_style_0,bath_style_1,bath_style_2
11037,33.69,130000,130000,1,7.0,7.0,0,2.0,4,5,...,0,0,0,0,0,1,0,0,0,0
34528,20.14,57000,0,1,2.0,5.0,0,2.0,4,6,...,0,0,0,0,0,0,1,0,0,1
22224,41.83,118000,0,1,1.0,4.0,0,2.0,4,1,...,0,1,0,0,0,0,0,0,0,1
45089,39.6,77000,0,0,2.0,2.0,1,2.0,10,1,...,0,1,0,0,0,0,0,0,0,1
32087,19.0,118000,118000,1,2.0,2.0,0,2.0,7,6,...,0,0,0,0,0,0,1,0,0,1


In [12]:
#価格と面積のプロット
plt.plot(data.area,data.rent,'o')
plt.xlim(0,350)
plt.show()