In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 81)

In [39]:
# trainデータを取得、NaN値などの状況を確認
dataset = pd.read_csv('data/train.csv', sep=',')
test_dataset = pd.read_csv('data/test.csv', sep=',')

In [41]:
# SalePriceとの相関を確認し、上位x位を採用する
# 一旦築年数は時系列データのため外す
correlation = dataset.corr()
larger_corr_cols = correlation.nlargest(15, 'SalePrice')['SalePrice'].index.drop(['YearRemodAdd', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1'])

dataset['YearBuilt'] = dataset['YrSold'] - dataset['YearBuilt']
dataset_train = dataset.loc[:,larger_corr_cols]
print(f'NaN Check: {dataset_train.isnull().sum()}')

print(larger_corr_cols)

# log targetsにおけるOutlierを削除 
# EDA検証により、SalePriceの外れ値に対応したIdを削除
outliers = [ 524, 1299]
for x in outliers:
    dataset = dataset.drop(dataset[dataset['Id'] == x].index)
    
# Electricalは1つのみ欠損なので行自体を削除
dataset.dropna(subset=['Electrical'])

df_train = dataset.loc[:,larger_corr_cols]

# Logを取って正規分布へ変換
log_cols = ['GrLivArea', '1stFlrSF', 'SalePrice']
df_train.loc[:,log_cols] = np.log(df_train.loc[:,log_cols])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_target = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'GarageArea','YearBuilt']
df_train.loc[:,scale_target] = scaler.fit_transform(df_train.loc[:,scale_target])

df_train

NaN Check: SalePrice       0
OverallQual     0
GrLivArea       0
GarageCars      0
GarageArea      0
TotalBsmtSF     0
1stFlrSF        0
FullBath        0
TotRmsAbvGrd    0
YearBuilt       0
Fireplaces      0
dtype: int64
Index(['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
       'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
       'Fireplaces'],
      dtype='object')


Unnamed: 0,SalePrice,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,Fireplaces
0,12.247694,7,0.548398,2,0.359050,-0.476217,-0.815867,2,8,-1.044786,0
1,12.109011,6,-0.387473,2,-0.056722,0.511373,0.431829,2,6,-0.185182,1
2,12.317167,7,0.682358,2,0.642531,-0.320538,-0.584113,2,6,-0.978663,1
3,11.849398,7,0.560983,3,0.803170,-0.719466,-0.443971,1,7,1.798520,1
4,12.429216,8,1.321795,3,1.719758,0.226772,0.119110,2,9,-0.945601,1
...,...,...,...,...,...,...,...,...,...,...,...
1455,12.072541,6,0.432759,2,-0.056722,-0.240266,-0.470840,2,7,-0.945601,1
1456,12.254863,6,1.141423,2,0.132265,1.192469,2.027033,2,7,-0.152120,2
1457,12.493130,7,1.514650,1,-1.039456,0.243799,0.237606,2,9,1.071163,2
1458,11.864462,5,-0.872945,1,-1.096152,0.063795,-0.074697,1,5,0.773608,0


In [42]:
df_train = pd.get_dummies(df_train, columns=['OverallQual', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces'], drop_first=True)
df_train

Unnamed: 0,SalePrice,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,YearBuilt,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,FullBath_1,FullBath_2,FullBath_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,TotRmsAbvGrd_12,TotRmsAbvGrd_14,Fireplaces_1,Fireplaces_2,Fireplaces_3
0,12.247694,0.548398,2,0.359050,-0.476217,-0.815867,-1.044786,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,12.109011,-0.387473,2,-0.056722,0.511373,0.431829,-0.185182,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,12.317167,0.682358,2,0.642531,-0.320538,-0.584113,-0.978663,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,11.849398,0.560983,3,0.803170,-0.719466,-0.443971,1.798520,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
4,12.429216,1.321795,3,1.719758,0.226772,0.119110,-0.945601,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,12.072541,0.432759,2,-0.056722,-0.240266,-0.470840,-0.945601,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1456,12.254863,1.141423,2,0.132265,1.192469,2.027033,-0.152120,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1457,12.493130,1.514650,1,-1.039456,0.243799,0.237606,1.071163,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1458,11.864462,-0.872945,1,-1.096152,0.063795,-0.074697,0.773608,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [43]:
from sklearn.model_selection import train_test_split
X = df_train.iloc[:,1:].values
y = df_train.iloc[:,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [44]:
# (Multiple) Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)

# trainデータとtestデータそれぞれのR2スコアを表示する関数
def print_r2_score(y_train, y_test, y_train_pred, y_pred):
    from sklearn.metrics import r2_score
    print(f'R2 Score(train) is {r2_score(y_train, y_train_pred):.5f}')
    print(f'R2 Score(test) is {r2_score(y_test, y_pred):.5f}')

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.85441
R2 Score(test) is 0.85298


In [28]:
# Polymial Regression
from sklearn.preprocessing import PolynomialFeatures
polynomial_instance = PolynomialFeatures(degree = 2)
X_poly_train = polynomial_instance.fit_transform(X_train)
X_poly_test = polynomial_instance.fit_transform(X_test)

regressor = LinearRegression()
regressor.fit(X_poly_train, y_train)
y_train_pred = regressor.predict(X_poly_train)
y_pred = regressor.predict(X_poly_test)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.90340
R2 Score(test) is -205433801893031378944.00000


In [8]:
# SVR
from sklearn.svm import SVR
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.85375
R2 Score(test) is 0.82466


In [9]:
# SVR (rbf)
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.90163
R2 Score(test) is 0.80745


In [49]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rndm_regressor = RandomForestRegressor(n_estimators = 100, criterion='mae', random_state=0, bootstrap=True)
rndm_regressor.fit(X_train, y_train)
y_pred = rndm_regressor.predict(X_test)
y_train_pred = rndm_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.97374
R2 Score(test) is 0.83842


In [48]:
# grid search
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100], 
    'criterion': ['mse', 'mae'],
    #'min_samples_split': [1, 2, 5, 10, 20],
    #'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    #'max_features': ['auto', 'sqrt', 'log2']
    }
]
grid_search = GridSearchCV(RandomForestRegressor(), grid_parameters, cv=5, scoring='r2', n_jobs = -1, verbose=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0620s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0833s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   10.4s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   32.6s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   34.7s finished


{'bootstrap': True, 'criterion': 'mae', 'n_estimators': 100}

In [12]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
#test_dataset = NaN_check(test_dataset)
test_dataset = NaN_processing(test_dataset)
X_sub, y_sub, test_dataset = data_preprocessing(test_dataset, mode='predict')

y_pred = rndm_regressor.predict(X_sub)

output = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

NameError: name 'NaN_processing' is not defined