In [27]:
%matplotlib inline

## House price prediction, exercise #1

In [28]:
import pandas as pd
import numpy as np
import xgboost
import sklearn
import seaborn as sb
import math
import matplotlib.pyplot as plot

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from scipy.stats import skew
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV


# Read the training and test tables from the local ./data directory.
train, test = (
    pd.read_csv('./data/train.csv'),
    pd.read_csv('./data/test.csv'),
)



## Exploratory Data Analysis

## Preprocessing
* (1) 1460개 행 중에서 1000개 이상의 결측값을 가진 열 삭제 
* (2) 수치형 변수와 범주형 변수로 나누어서 결측치 대체
* (3) 범주형 데이터는 카디널리티 10을 기준으로 나누어 10 이상은 Ordinal, 10 미만은 OneHot으로 인코딩

In [29]:
# Disabled first version of the preprocessing, kept for reference only.
# The whole pipeline below is wrapped in a triple-quoted string, so none of it
# is executed; because the string is the last expression in the cell, Jupyter
# simply echoes it back as the cell output.
# The numbered steps inside correspond to the plan listed in the markdown above:
#   (1) drop columns with more than 1000 missing values,
#   (2) impute numeric columns (SimpleImputer default strategy) and categorical
#       columns (most frequent value),
#   (3) encode categorical columns: fewer than 10 unique values -> dummies,
#       10 or more unique values -> OrdinalEncoder,
# followed by a concatenation and a sanity check on the resulting column count.
'''

# (1)
loss_cols  = [col for col in train if train[col].isnull().sum() > 1000]
train_f = train.drop(loss_cols, axis=1)


# (2)
train_num_cols = [col for col in train_f if train_f[col].dtypes !='object' ]
sim = SimpleImputer()
train_num = pd.DataFrame(sim.fit_transform(train_f[train_num_cols]), columns=train_num_cols)

train_cat = train_f.select_dtypes(include='object')
cat_sim = SimpleImputer(strategy='most_frequent')
train_cat = pd.DataFrame(cat_sim.fit_transform(train_cat), columns=train_cat.columns)

# (3)
high_cardinal_cols = [col for col in train_cat.columns if train_cat[col].nunique() >= 10]
low_cardinal_cols = [col for col in train_cat.columns if train_cat[col].nunique() < 10]

ore = OrdinalEncoder()
train_ohe = pd.get_dummies(train_cat[low_cardinal_cols],  prefix=low_cardinal_cols, prefix_sep='_') #pd.DataFrame(ohe.fit_transform(train_cat[low_cardinal_cols]))
train_ore = pd.DataFrame(ore.fit_transform(train_cat[high_cardinal_cols]), columns = high_cardinal_cols)

# concatenation
train_f.drop(train_cat.columns, axis=1, inplace=True)
train_f.drop(train_num.columns, axis=1, inplace=True)

train_f = pd.concat([train_num, train_ohe, train_ore], axis=1)

# # of joined dataframe's col is 223
print('====null values====')
print(train_f.isnull().sum().sum())
print('====Validation====')
print(len(train_cat.columns), len(train_num.columns))
print(len(train_ohe.columns), len(train_ore.columns), len(train_num.columns))
print("Valid : " ,((len(train_ohe.columns)+len(train_ore.columns)+len(train_num.columns)) == 223))
'''


'\n\n# (1)\nloss_cols  = [col for col in train if train[col].isnull().sum() > 1000]\ntrain_f = train.drop(loss_cols, axis=1)\n\n\n# (2)\ntrain_num_cols = [col for col in train_f if train_f[col].dtypes !=\'object\' ]\nsim = SimpleImputer()\ntrain_num = pd.DataFrame(sim.fit_transform(train_f[train_num_cols]), columns=train_num_cols)\n\ntrain_cat = train_f.select_dtypes(include=\'object\')\ncat_sim = SimpleImputer(strategy=\'most_frequent\')\ntrain_cat = pd.DataFrame(cat_sim.fit_transform(train_cat), columns=train_cat.columns)\n\n# (3)\nhigh_cardinal_cols = [col for col in train_cat.columns if train_cat[col].nunique() >= 10]\nlow_cardinal_cols = [col for col in train_cat.columns if train_cat[col].nunique() < 10]\n\nore = OrdinalEncoder()\ntrain_ohe = pd.get_dummies(train_cat[low_cardinal_cols],  prefix=low_cardinal_cols, prefix_sep=\'_\') #pd.DataFrame(ohe.fit_transform(train_cat[low_cardinal_cols]))\ntrain_ore = pd.DataFrame(ore.fit_transform(train_cat[high_cardinal_cols]), columns = hig

## Preprocessing. V2

* Log transformation (of skewed features)
* Encoding categorical values
* Null value imputation


In [30]:
# Preprocessing V2: builds `train_f`, `test_f` (feature matrices) and `train_y`
# (log-transformed target) from the raw `train` / `test` frames.

# Numeric features whose skewness (measured on the training rows) exceeds this
# value are log1p-transformed.
SKEW_THRESHOLD = 0.75

# 1. Log transformation
# The target is log1p-transformed into its own variable instead of overwriting
# train["SalePrice"]; overwriting made the cell non-idempotent (running it twice
# applied the log twice). Predictions must be inverted with np.expm1.
train_y = np.log1p(train["SalePrice"])

# Stack train and test features so that encoding/imputation is applied to both
# in exactly the same way.
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                      test.loc[:, 'MSSubClass':'SaleCondition']))

numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index

# Compute the skewness per numeric column first, THEN filter by the threshold.
# (Taking `.index` of the boolean comparison result selected every numeric
# column, regardless of its skewness.)
skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = skewness[skewness > SKEW_THRESHOLD].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# 2. Encode categorical values as dummy/indicator columns
all_data = pd.get_dummies(all_data)

# 3. Impute remaining missing values with the column mean
all_data = all_data.fillna(all_data.mean())

# Split the stacked frame back into its train and test parts by position.
n_train = train.shape[0]
train_f = all_data.iloc[:n_train]
test_f = all_data.iloc[n_train:]

# Sanity checks: the split must line up with the original frames.
assert len(train_f) == len(train_y)
assert len(test_f) == test.shape[0]

print(all_data.shape, train_f.shape, test_f.shape)

(2919, 288) (1460, 288) (1459, 288)


## Modeling
*  #1. train_test_split
*  #2. cross_val_score
*  #3. KFold
*  #4. KFold + hyperparameter tuning (GridSearchCV)
*  #5. Ridge 

In [31]:
# Modeling: `flag` selects which validation strategy from the list above runs.
#   1 = hold-out split, 2 = 5-fold cross_val_score, 3 = not implemented,
#   4 = KFold + GridSearchCV, 5 = Ridge alpha sweep.
# Every implemented branch leaves a model fitted in `best_model` so that the
# submission cell has an estimator to predict with.
# Scores are RMSE on the log1p-transformed target.
SEED = 30  # shared seed for the random splits (same value the KFold below uses)

flag = 2
if flag == 1:
    # Split into separately named variables so the full `train_y` is not
    # overwritten (overwriting it broke any re-run of this cell).
    fit_x, valid_x, fit_y, valid_y = train_test_split(
        train_f, train_y, train_size=0.8, test_size=0.2, random_state=SEED)

    model = XGBRegressor(eta=0.1, colsample_bytree=0.75, max_depth=3, min_child_weight=3, eval_metric="rmse")
    model.fit(fit_x, fit_y)

    pred = model.predict(valid_x)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and then remove.
    score = math.sqrt(mean_squared_error(valid_y, pred))
    print("root_mean_squared_error: ", score)
    best_model = model

elif flag == 2:
    model = XGBRegressor(eta=0.1, colsample_bytree=0.75, max_depth=3, min_child_weight=3, eval_metric="rmse")
    # cross_val_score fits clones of `model`, so it returns per-fold MSE
    # (negated by the scorer) without touching `model` itself.
    scores = -1 * cross_val_score(model, train_f, train_y, cv=5, scoring='neg_mean_squared_error')
    print(scores)
    print("root_mean_squared_error: ", math.sqrt(scores.mean()))

    # Fit on the full training set for use at prediction time.
    model.fit(train_f, train_y)
    best_model = model

elif flag == 3:
    # TODO: manual KFold loop is not implemented; `best_model` is not set here.
    pass

elif flag == 4:
    fit_x, valid_x, fit_y, valid_y = train_test_split(
        train_f, train_y, train_size=0.8, test_size=0.2, random_state=SEED)

    model = XGBRegressor()
    kf = KFold(random_state=SEED, shuffle=True, n_splits=5)
    params = {'eta': [0.05, 0.1], 'max_depth': [5, 7], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75]}

    gcv = GridSearchCV(estimator=model, cv=kf, n_jobs=10, scoring='neg_mean_squared_error', verbose=True, param_grid=params)

    gcv.fit(fit_x, fit_y)
    print(gcv.best_params_)

    model = gcv.best_estimator_
    pred = model.predict(valid_x)

    # Take the square root exactly once (previously the RMSE was square-rooted
    # a second time and printed under a "mean_squared_error" label).
    score = math.sqrt(mean_squared_error(valid_y, pred))
    print("root_mean_squared_error: ", score)
    best_model = model
    # NOTE(review): output recorded from an earlier run, kept for reference.
    # It does not match the current setup (5 splits, 16 parameter combinations,
    # RMSE on the log target):
    #     Fitting 10 folds for each of 8 candidates, totalling 80 fits
    #     {'colsample_bytree': 0.75, 'max_depth': 5, 'min_child_weight': 3}
    #     Mean_absolute_error:  17648.85913420377

elif flag == 5:
    def rmse_cv(model):
        """Return the per-fold RMSE of `model` under 5-fold cross-validation on the training data."""
        return np.sqrt(-cross_val_score(model, train_f, train_y, scoring="neg_mean_squared_error", cv=5))

    alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
    # Mean CV RMSE for each regularisation strength, indexed by alpha.
    cv_ridge = pd.Series([rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas], index=alphas)

    ax = cv_ridge.plot(title="Ridge: mean CV RMSE by alpha")
    ax.set_xlabel("alpha")
    ax.set_ylabel("RMSE")

    score = cv_ridge.min()
    print('score', score)

    # Refit on the full training set with the alpha that gave the lowest CV RMSE.
    best_model = Ridge(alpha=cv_ridge.idxmin()).fit(train_f, train_y)

[0.01319954 0.01929687 0.01713127 0.01394279 0.01934873]
root_mean_squared_error:  0.12877825831898793


## Submission

In [32]:
# Predict on the test features and write the submission file.
# NOTE(review): `best_model` must hold a fitted estimator from the Modeling
# section before this cell runs — confirm it is assigned there.
pred_log = best_model.predict(test_f)

# The target was transformed with np.log1p during preprocessing, so the exact
# inverse is np.expm1 (np.exp would leave every predicted price too high by 1).
pred_res = np.expm1(pred_log)

sub_df = pd.DataFrame({"Id": test.Id, "SalePrice": pred_res})
sub_df.to_csv('./data/my_kernel_submission.csv', index=False)