In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 81)

In [41]:
# trainデータを取得、NaN値などの状況を確認
dataset = pd.read_csv('data/train.csv', sep=',')
test_dataset = pd.read_csv('data/test.csv', sep=',')

In [42]:
# SalePriceとの相関を確認し、上位x位を採用する
# 一旦築年数は時系列データのため外す
correlation = dataset.corr()
larger_corr_cols = correlation.nlargest(15, 'SalePrice')['SalePrice'].index.drop(['YearRemodAdd', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1'])

dataset['YearBuilt'] = dataset['YrSold'] - dataset['YearBuilt']
dataset_train = dataset.loc[:,larger_corr_cols]
print(f'NaN Check: {dataset_train.isnull().sum()}')


# Electricalは1つのみ欠損なので行自体を削除
# GrLivAreaにおけるOutlierを削除 (Id = 1299 and 524)
dataset = dataset.drop(dataset[dataset['Id'] == 1299].index)
dataset = dataset.drop(dataset[dataset['Id'] == 524].index)
dataset.dropna(subset=['Electrical'])

df_train = dataset.loc[:,larger_corr_cols]

# Logを取って正規分布へ変換
log_cols = ['GrLivArea', '1stFlrSF', 'SalePrice']
df_train.loc[:,log_cols] = np.log(df_train.loc[:,log_cols])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_target = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'GarageArea','YearBuilt']
df_train.loc[:,scale_target] = scaler.fit_transform(df_train.loc[:,scale_target])

df_train

NaN Check: SalePrice       0
OverallQual     0
GrLivArea       0
GarageCars      0
GarageArea      0
TotalBsmtSF     0
1stFlrSF        0
FullBath        0
TotRmsAbvGrd    0
YearBuilt       0
Fireplaces      0
dtype: int64


Unnamed: 0,SalePrice,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,Fireplaces
0,12.247694,7,0.539624,2,0.357973,-0.473766,-0.806414,2,8,-1.045249,0
1,12.109011,6,-0.380198,2,-0.056795,0.504925,0.428328,2,6,-0.185182,1
2,12.317167,7,0.671287,2,0.640770,-0.319490,-0.577066,2,6,-0.979090,1
3,11.849398,7,0.551993,3,0.801022,-0.714823,-0.438379,1,7,1.799589,1
4,12.429216,8,1.299759,3,1.715398,0.222888,0.118856,2,9,-0.946011,1
...,...,...,...,...,...,...,...,...,...,...,...
1455,12.072541,6,0.425968,2,-0.056795,-0.239941,-0.464969,2,7,-0.946011,1
1456,12.254863,6,1.122480,2,0.131736,1.179884,2.006969,2,7,-0.152102,2
1457,12.493130,7,1.489306,1,-1.037158,0.239762,0.236122,2,9,1.071840,2
1458,11.864462,5,-0.857345,1,-1.093717,0.061380,-0.072939,1,5,0.774124,0


In [43]:
df_train = pd.get_dummies(df_train, columns=['OverallQual', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces'], drop_first=True)
df_train

Unnamed: 0,SalePrice,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,YearBuilt,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,FullBath_1,FullBath_2,FullBath_3,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,TotRmsAbvGrd_12,TotRmsAbvGrd_14,Fireplaces_1,Fireplaces_2,Fireplaces_3
0,12.247694,0.539624,2,0.357973,-0.473766,-0.806414,-1.045249,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,12.109011,-0.380198,2,-0.056795,0.504925,0.428328,-0.185182,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,12.317167,0.671287,2,0.640770,-0.319490,-0.577066,-0.979090,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,11.849398,0.551993,3,0.801022,-0.714823,-0.438379,1.799589,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
4,12.429216,1.299759,3,1.715398,0.222888,0.118856,-0.946011,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,12.072541,0.425968,2,-0.056795,-0.239941,-0.464969,-0.946011,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1456,12.254863,1.122480,2,0.131736,1.179884,2.006969,-0.152102,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1457,12.493130,1.489306,1,-1.037158,0.239762,0.236122,1.071840,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1458,11.864462,-0.857345,1,-1.093717,0.061380,-0.072939,0.774124,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [44]:
from sklearn.model_selection import train_test_split
X = df_train.iloc[:,1:]
y = df_train.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [45]:
# (Multiple) Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)

# trainデータとtestデータそれぞれのR2スコアを表示する関数
def print_r2_score(y_train, y_test, y_train_pred, y_pred):
    from sklearn.metrics import r2_score
    print(f'R2 Score(train) is {r2_score(y_train, y_train_pred):.5f}')
    print(f'R2 Score(test) is {r2_score(y_test, y_pred):.5f}')

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.86121
R2 Score(test) is 0.85014


In [28]:
# Polymial Regression
from sklearn.preprocessing import PolynomialFeatures
polynomial_instance = PolynomialFeatures(degree = 2)
X_poly_train = polynomial_instance.fit_transform(X_train)
X_poly_test = polynomial_instance.fit_transform(X_test)

regressor = LinearRegression()
regressor.fit(X_poly_train, y_train)
y_train_pred = regressor.predict(X_poly_train)
y_pred = regressor.predict(X_poly_test)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.90340
R2 Score(test) is -205433801893031378944.00000


In [8]:
# SVR
from sklearn.svm import SVR
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.85375
R2 Score(test) is 0.82466


In [9]:
# SVR (rbf)
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.90163
R2 Score(test) is 0.80745


In [60]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rndm_regressor = RandomForestRegressor(n_estimators = 100, criterion='mae', min_samples_split=5,random_state=0, bootstrap=True)
rndm_regressor.fit(X_train, y_train)
y_pred = rndm_regressor.predict(X_test)
y_train_pred = rndm_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.96312
R2 Score(test) is 0.81642


In [26]:
# grid search
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100], 
    #'criterion': ['mse', 'mae'],
    #'min_samples_split': [1, 2, 5, 10, 20],
    #'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    #'max_features': ['auto', 'sqrt', 'log2']
    }
]
grid_search = GridSearchCV(RandomForestRegressor(), grid_parameters, cv=5, scoring='r2', n_jobs = -1, verbose=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:    9.9s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:   11.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   12.0s finished


{'bootstrap': True, 'n_estimators': 100}

In [12]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
#test_dataset = NaN_check(test_dataset)
test_dataset = NaN_processing(test_dataset)
X_sub, y_sub, test_dataset = data_preprocessing(test_dataset, mode='predict')

y_pred = rndm_regressor.predict(X_sub)

output = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

NameError: name 'NaN_processing' is not defined