In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 81)

In [2]:
# trainデータを取得、NaN値などの状況を確認
dataset = pd.read_csv('data/train.csv', sep=',')
test_dataset = pd.read_csv('data/test.csv', sep=',')
dataset_header = dataset.columns
print(f'Raw dataset shape: {dataset.shape[0]} * {dataset.shape[1]}')
# NaNチェック
# datasetから一列ずつ取り出して、nullが含まれている場合はそのカラム名を返す
def print_NaN_information(dataset):
    for data in dataset:
        numof_null = dataset[data].isnull().sum()
        if numof_null > 0:
            ratio = (numof_null / dataset.shape[0]) * 100
            print(f'NaN in {data}:{numof_null} \tNaN, ratio is\t{ratio:.2f}%')

Raw dataset shape: 1460 * 81


In [9]:
def nan_imputer(dataset):
    # NaNデータの処理
    # 以下のカラムリストについては、NAのものは"None"に置き換えることで欠損値を補完する
    convert_nan_to_other_list = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                                'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                                'PoolQC', 'Fence', 'MiscFeature']
    convert_nan_to_other_list = dict.fromkeys(convert_nan_to_other_list, 'None')
    dataset = dataset.fillna(convert_nan_to_other_list)

    # ******************************************************************************#
    # Basementの処理。BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1のいずれかが'None'なら、
    # BsmtFinType2はNoneに、BsmtFinSF1とBsmtFinSF2, BsmtUnfSF, TotalBsmtSFは0となる
    # どれかから地下室がない情報が得られていて、もし他のデータに齟齬や欠損があれば、その情報で補完

    # BsmtFinType1がUnfなら、BsmtFinSF1は0となる
    # BsmtFinType2がUnfなら、BsmtFinSF2は0となる
    # ベースメントの建設がUnfなら、完成エリアの面積は0で補完
    basement_condition = (dataset['BsmtQual'] == 'None')| (dataset['BsmtCond'] == 'None') | (dataset['BsmtExposure'] == 'None') | (dataset['BsmtFinType1'] == 'None')
    basement_target = ['BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF']
    dataset.loc[basement_condition, basement_target] = 0
    dataset.loc[basement_condition, ['BsmtFinType2']] = 'None'

    dataset.loc[(dataset['BsmtFinType1'] == 'Unf'), ['BsmtFinSF1']] = 0
    dataset.loc[(dataset['BsmtFinType2'] == 'Unf'), ['BsmtFinSF2']] = 0
    # ******************************************************************************#

    # LotFrontageのNAは、接続無しとして0で置換
    dataset[['LotFrontage']] = dataset['LotFrontage'].fillna(0)

    # ******************************************************************************#
    # Garageの処理。GarageTypeがNAの場合は製造年は最古 - 100年とする
    # nanとの比較を行うときに、 dataset['aaa'] == np.nan としても、正しくnanを検出できないので避けること
    # dataset.loc[(dataset['GarageType'].isnull() | (dataset['GarageType'] == 'None')), 'GarageYrBlt'] = dataset['GarageYrBlt'].dropna().value_counts().idxmin() - 100
    # ******************************************************************************#

    # ******************************************************************************#
    # その他の欠損値処理
    # いずれも名義尺度のため、NaNについては最頻値で補完してみる
    dataset.loc[dataset['MasVnrType'].isnull(), 'MasVnrType'] = dataset['MasVnrType'].dropna().value_counts().idxmax()
    dataset.loc[dataset['MasVnrArea'].isnull(), 'MasVnrArea'] = dataset['MasVnrArea'].dropna().value_counts().idxmax()
    dataset.loc[dataset['Electrical'].isnull(), 'Electrical'] = dataset['Electrical'].dropna().value_counts().idxmax()
    # ******************************************************************************#

    print_NaN_information(dataset)

    return dataset

dataset = nan_imputer(dataset)

In [44]:
# Data Scalling
# 数値尺度のカラム
num_cols = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 
                'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch','ScreenPorch',
                'PoolArea', 'MiscVal']
# 名義尺度のカラム。Idはいずれでもないのでpreprocessingの対象外。SalePriceはターゲットのため対象外。
nominal_cols = [i for i in dataset.columns if i not in np.append(num_cols, ['Id', 'SalePrice'])]

def data_preprocessing(dataset):

    # ******************************************************************************#
    # Yearの処理
    this_year = 2021
    #dataset['GarageYrBlt'] = this_year - dataset['GarageYrBlt']
    #dataset['YrSold'] = this_year - dataset['YrSold']
    # ******************************************************************************#

    # ******************************************************************************#
    # 正規分布に従っていないものはlogを取って
    log_convert_required = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
    for x in log_convert_required:
        # 0はlogを取ると'-Inf'となるのでclipで回避
        dataset[x] = np.log(np.clip(dataset[x], a_min=1e-323, a_max=1e+10))
    # ******************************************************************************#
    # 数値尺度たちは外れ値も考慮してZ-Scoreでスケーリング
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    dataset[num_cols] = scaler.fit_transform(dataset.loc[:,num_cols].values)
    # 名義尺度たちはOne-Hotエンコーディング。他の値で残りの1つはわかるので、drop_first=True
    #dataset = pd.get_dummies(dataset, columns=nominal_cols, drop_first=True)
    return dataset

dataset = data_preprocessing(dataset)
dataset.to_csv('preprocessed.csv')
print_NaN_information(dataset)

In [47]:
# Model Selection
def get_train_test_data(dataset):
    features = ['LotFrontage', 'LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea']
    from sklearn.model_selection import train_test_split
    #X = dataset.loc[:,features].values
    X = dataset.drop(['SalePrice', 'Id'], axis=1)
    Id = dataset['Id']
    if 'SalePrice' in dataset:
        y = np.log(dataset['SalePrice'].values)
    else:
        y = np.nan

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    return X_train, X_test, y_train, y_test, Id

X_train, X_test, y_train, y_test, Id = get_train_test_data(dataset)

In [42]:
import statsmodels.api as sm
model = sm.OLS(y_train,X_train)
fit = model.fit()
print(fit.summary2())
#print(fit.summary2().tables[0])
#p_values = fit.summary2().tables[1]['P>|t|']
#print(p_values)

                        Results: Ordinary least squares
Model:                  OLS              Adj. R-squared (uncentered): -0.005   
Dependent Variable:     y                AIC:                         9137.4720
Date:                   2021-01-26 16:11 BIC:                         9172.9133
No. Observations:       1168             Log-Likelihood:              -4561.7  
Df Model:               7                F-statistic:                 0.2496   
Df Residuals:           1161             Prob (F-statistic):          0.972    
R-squared (uncentered): 0.002            Scale:                       145.38   
-------------------------------------------------------------------------------------
          Coef.        Std.Err.          t          P>|t|         [0.025       0.975]
-------------------------------------------------------------------------------------
x1       -0.1326         0.3534       -0.3752       0.7076       -0.8259       0.5608
x2        0.0298         0.4118        0

In [48]:
# (Multiple) Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)

# trainデータとtestデータそれぞれのR2スコアを表示する関数
def print_r2_score(y_train, y_test, y_train_pred, y_pred):
    from sklearn.metrics import r2_score
    print(f'R2 Score(train) is {r2_score(y_train, y_train_pred):.5f}')
    print(f'R2 Score(test) is {r2_score(y_test, y_pred):.5f}')

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.96176
R2 Score(test) is -1020489913696940544.00000


In [13]:
# Polymial Regression
from sklearn.preprocessing import PolynomialFeatures
polynomial_instance = PolynomialFeatures(degree = 2)
X_poly_train = polynomial_instance.fit_transform(X_train)
X_poly_test = polynomial_instance.fit_transform(X_test)

regressor = LinearRegression()
regressor.fit(X_poly_train, y_train)
y_train_pred = regressor.predict(X_poly_train)
y_pred = regressor.predict(X_poly_test)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.76559
R2 Score(test) is 0.73769


In [14]:
# SVR
from sklearn.svm import SVR
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.73139
R2 Score(test) is 0.52200


In [15]:
# SVR (rbf)
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.81207
R2 Score(test) is 0.74288


In [49]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rndm_regressor = RandomForestRegressor(n_estimators = 100, random_state=0)
rndm_regressor.fit(X_train, y_train)
y_pred = rndm_regressor.predict(X_test)
y_train_pred = rndm_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.98011
R2 Score(test) is 0.86764


In [17]:
# grid search
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100, 1000], 
    'criterion': ['mse', 'mae'],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    'max_features': ['auto', 'sqrt', 'log2']
    }
]
grid_search = GridSearchCV(RandomForestRegressor(), grid_parameters, cv=5, scoring='r2', n_jobs = -1)
grid_search.fit(X_train, y_train)
grid_search.best_params_

KeyboardInterrupt: 

In [281]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
#test_dataset = NaN_check(test_dataset)
test_dataset = NaN_processing(test_dataset)
X_sub, y_sub, test_dataset = data_preprocessing(test_dataset, mode='predict')

y_pred = rndm_regressor.predict(X_sub)

output = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

TypeError: fillna() got an unexpected keyword argument 'subset'