In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 81)

In [63]:
# trainデータを取得、NaN値などの状況を確認
dataset = pd.read_csv('data/train.csv', sep=',')

In [4]:
## NaNデータの処理
# 以下のカラムリストについては、NAのものは"None"に置き換えることで欠損値を補完する
dataset = pd.read_csv('data/train.csv', sep=',')

def feature_engineering(dataset):
    
    # 明らかな外れ値についてはデータから削除
    outliers = [ 524, 1299]
    for x in outliers:
        dataset = dataset.drop(dataset[dataset['Id'] == x].index)
    
    Id = dataset['Id']
    dataset = dataset.drop(['Id'], axis=1)
    if 'SalePrice' in dataset.columns:
        y = dataset['SalePrice'].values
        dataset = dataset.drop(['SalePrice'], axis=1)
    else:
        y = np.NaN

    # ************************************************************* NaN処理 *************************************************************
    convert_nan_to_other_list = ['MSSubClass', 'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                                'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                                'PoolQC', 'Fence', 'MiscFeature']
    convert_nan_to_other_list = dict.fromkeys(convert_nan_to_other_list, 'None')
    dataset = dataset.fillna(convert_nan_to_other_list)

    # LotFrontageのNAは、各近接地における中央値で置換
    dataset['LotFrontage'] = dataset['LotFrontage'].fillna(dataset.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median())))
    # Functionalについてはdescriptionに"Assume typical unless deductions are warranted"とある
    dataset['Functional'] = dataset['Functional'].fillna('Typ')

    # ************************* 0でfillna *************************
    for x in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
        dataset[x] = dataset[x].fillna(0)

    # ************************* 最頻値でfillna *************************
    for x in ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'):
        dataset[x] = dataset[x].fillna(dataset[x].mode()[0])

    dataset = dataset.drop(['Utilities'],axis=1)

    if dataset.isnull().sum().sum() == 0:
        print('Dataset doesnt have NaN')
    # ************************************************************* NaN完了 *************************************************************


    # *********************************************************** Feature Engineering *************************************************************
    categorical = ['MSSubClass', 'OverallQual','OverallCond', 'YrSold', 'MoSold']
    for x in categorical:
        dataset[x] = dataset[x].astype(str)

    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    # データセットが情報を表す順序で並んでいる"だろう"ものについて、label encoding
    label_encoding = ['MSSubClass', 'Street', 'Alley', 'LotShape', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 
                    'BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual',
                    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'MoSold', 'YrSold']
    for x in label_encoding:
        dataset[x] = label_encoder.fit_transform(dataset[x].values)

    dataset['TotalSF'] = dataset['TotalBsmtSF'] + dataset['1stFlrSF'] + dataset['2ndFlrSF']
    
    # 数値尺度のカラムを取得
    numeric_cols = dataset.dtypes[dataset.dtypes != 'object'].index
    skew = dataset.loc[:,numeric_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
    # skewが中央にないものは、log変換
    skewness = skew[abs(skew) > 0.75].index
    dataset[skewness] = np.log1p(dataset[skewness])

    onehot_encoding = dataset.dtypes[dataset.dtypes == 'object'].index
    dataset = pd.get_dummies(dataset, columns=onehot_encoding, drop_first=True)
    # ********************************************************* End of Feature Engineering **********************************************************

    return Id, dataset.values, y

Id, X, y = feature_engineering(dataset)

Dataset doesnt have NaN


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score

stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cross_val(model):
    score = cross_val_score(model, X, y, scoring='r2', cv=stratified)
    print(f'score = : {score.mean():.8f}')

lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=0))
elastic = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=0)) 
kernel = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

for x in (lasso, elastic, kernel):
    cross_val(x)


score = : 0.85841015
score = : 0.86091266
score = : 0.82656832


In [81]:
# (Multiple) Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)

# trainデータとtestデータそれぞれのR2スコアを表示する関数
def print_r2_score(y_train, y_test, y_train_pred, y_pred):
    from sklearn.metrics import r2_score
    print(f'R2 Score(train) is {r2_score(y_train, y_train_pred):.5f}')
    print(f'R2 Score(test) is {r2_score(y_test, y_pred):.5f}')

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.91997
R2 Score(test) is 0.84336


In [25]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rndm_regressor = RandomForestRegressor(n_estimators = 100, criterion='mae', random_state=0, bootstrap=True)
rndm_regressor.fit(X_train, y_train)
y_pred = rndm_regressor.predict(X_test)
y_train_pred = rndm_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.98457
R2 Score(test) is 0.88754


In [48]:
# grid search
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100], 
    'criterion': ['mse', 'mae'],
    #'min_samples_split': [1, 2, 5, 10, 20],
    #'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    #'max_features': ['auto', 'sqrt', 'log2']
    }
]
grid_search = GridSearchCV(RandomForestRegressor(), grid_parameters, cv=5, scoring='r2', n_jobs = -1, verbose=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0620s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0833s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   10.4s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   32.6s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   34.7s finished


{'bootstrap': True, 'criterion': 'mae', 'n_estimators': 100}

In [12]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
#test_dataset = NaN_check(test_dataset)
test_dataset = NaN_processing(test_dataset)
X_sub, y_sub, test_dataset = data_preprocessing(test_dataset, mode='predict')

y_pred = rndm_regressor.predict(X_sub)

output = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

NameError: name 'NaN_processing' is not defined