In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import matplotlib.pyplot as plt

In [None]:
## Load the training and test tables; the 'Id' column becomes the row index of each.
train_full, test_full = (
    pd.read_csv(csv_name, index_col='Id') for csv_name in ('train.csv', 'test.csv')
)

## Report the (rows, columns) shape of both frames, then preview the first training rows.
print('train_full size', train_full.shape)
print('test_full size', test_full.shape)
train_full.head()

In [None]:
## Display pandas' descriptive summary of the training frame.
train_full.describe()

In [None]:
## Rows with a missing target (SalePrice) cannot be used for training, so drop them first.
train_full = train_full.dropna(axis=0, subset=['SalePrice'])

## Separate the target from the feature columns.
y_train = train_full['SalePrice']
X_train_full = train_full.drop(columns=['SalePrice'])

## The test frame is used unchanged as the feature table for prediction.
## NOTE(review): nothing is dropped from the test frame here - presumably it has no SalePrice column; confirm.
X_test = test_full

In [None]:
## Partition the feature columns by dtype: object columns are treated as categorical,
## int64/float64 columns as numeric. Both lists keep the column order of X_train_full.
feature_dtypes = X_train_full.dtypes

col_category = [name for name, dtype in feature_dtypes.items() if dtype == 'O']
print('categorical columns : \n', len(col_category))

col_numeric = [name for name, dtype in feature_dtypes.items()
               if dtype in ['int64', 'float64']]
print('numeric columns: \n', len(col_numeric))

## All columns retained for the final training set (categorical first, then numeric).
col_to_keep = col_category + col_numeric

In [None]:
## Split the labelled data into training and validation sets (80/20, fixed seed).
## The target is read from train_full instead of y_train: this cell rebinds y_train to
## the 80% split, so feeding y_train back in would make a second run of the cell fail
## with mismatched X/y lengths. On a first run the two are the same series.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, train_full['SalePrice'],
    train_size=0.8, test_size=0.2, random_state=0,
)

In [None]:
## Numeric feature block of the training, validation and test sets (same columns, same order).
X_train_numeric, X_valid_numeric, X_test_numeric = (
    frame[col_numeric] for frame in (X_train, X_valid, X_test)
)

In [None]:
## Categorical variables: separate usable ("good") columns from unusable ("bad") ones.
## First take the categorical columns of the training, validation and test sets.
X_train_category = X_train[col_category]
X_valid_category = X_valid[col_category]
X_test_category = X_test[col_category]

## A column is "good" when every value found in the validation set also occurs in the
## training set, i.e. the validation set introduces no value the training set lacks.
good_col_category = [col for col in col_category if
                     set(X_valid_category[col]).issubset(set(X_train_category[col]))]

## Everything else is "bad". The list is built by filtering col_category rather than by
## a set difference, so its order is the same on every kernel restart.
bad_col_category = [col for col in col_category if col not in good_col_category]

In [None]:
## Number of kept and rejected categorical columns (one per line), then the rejected names.
print(len(good_col_category), len(bad_col_category), sep='\n')
bad_col_category

In [None]:
## Remove the rejected categorical columns from all three categorical blocks.
X_train_category = X_train_category.drop(columns=bad_col_category)
X_valid_category = X_valid_category.drop(columns=bad_col_category)
X_test_category = X_test_category.drop(columns=bad_col_category)

In [None]:
## Second filter, this time against the test set: keep a column only when every value
## found in the test set also occurs in the training set.
good2_col_category = [col for col in good_col_category if
                      set(X_test_category[col]).issubset(set(X_train_category[col]))]

## Rejected columns, filtered in list order (not a set difference) so the order is
## the same on every kernel restart.
bad2_col_category = [col for col in good_col_category if col not in good2_col_category]

In [None]:
## Number of kept and rejected columns after the test-set filter, then the rejected names.
print(len(good2_col_category), len(bad2_col_category), sep='\n')
bad2_col_category

In [None]:
## Remove the columns rejected by the test-set filter from all three categorical blocks.
X_train_category = X_train_category.drop(columns=bad2_col_category)
X_valid_category = X_valid_category.drop(columns=bad2_col_category)
X_test_category = X_test_category.drop(columns=bad2_col_category)

In [None]:
## Shapes of the three categorical blocks (one per line), followed by the total number
## of model features: kept categorical columns plus numeric columns.
print(X_train_category.shape, X_valid_category.shape, X_test_category.shape, sep='\n')
print(len(good2_col_category) + len(col_numeric))

In [None]:
## Combine the numeric and categorical feature blocks side by side for each data set.
X_train_model = pd.concat([X_train_numeric, X_train_category], axis=1)
X_valid_model = pd.concat([X_valid_numeric, X_valid_category], axis=1)
X_test_model = pd.concat([X_test_numeric, X_test_category], axis=1)

## Stack the training and validation rows back into one labelled set.
X_full_model = pd.concat([X_train_model, X_valid_model], axis=0)
## Series.append was removed in pandas 2.0; pd.concat works on every pandas version and
## stacks the targets in the same train-then-valid order as X_full_model.
y_full_model = pd.concat([y_train, y_valid], axis=0)

# Pipeline for random forest

In [None]:
## Pre-processing framework:
##   numeric columns     -> fill missing values with the column mean
##   categorical columns -> fill missing values with the most frequent category,
##                          then ordinal-encode
numeric_transformer = SimpleImputer(strategy='mean')
category_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ## A category that was not present when the encoder was fitted is encoded as -1
    ## instead of raising. This matters when the pipeline is re-fitted on subsets of
    ## the rows (e.g. inside cross-validation folds), where a held-out row can carry a
    ## category the fitting rows lack. Requires scikit-learn >= 0.24.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])
pre_processor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, col_numeric),
    ('cat', category_transformer, good2_col_category),
])

## Baseline model: 100-tree random forest behind the pre-processor, fitted on the
## training split and scored on the validation split.
model = RandomForestRegressor(n_estimators=100, random_state=0)
clf = Pipeline(steps=[('preprocessor', pre_processor), ('model', model)])
clf.fit(X_train_model, y_train)
preds_valid = clf.predict(X_valid_model)
## mean_absolute_error expects (y_true, y_pred); MAE is symmetric, so the value is unchanged.
print('MAE: \n', mean_absolute_error(y_valid, preds_valid))

# Output: baseline model

In [None]:
## Predict the test set with the fitted pipeline and write an Id/SalePrice CSV.
preds_test = clf.predict(X_test_model)
output = pd.DataFrame(
    data={'Id': X_test.index, 'SalePrice': preds_test},
)
output.to_csv('submission.csv', index=False)

# Manually imputing and encoding

In [None]:
## Numeric columns: fill NaN with the column mean (SimpleImputer's default strategy).
## The imputer is fitted on the labelled rows; the same fitted imputer fills the test rows.
numeric_imputer = SimpleImputer()

## fit_transform/transform return plain arrays, so column names and the row index are
## restored when wrapping the result back into a DataFrame.
X_full_model_numeric_impute = pd.DataFrame(
    numeric_imputer.fit_transform(X_full_model[col_numeric]),
    columns=col_numeric,
    index=X_full_model.index,
)

X_test_model_numeric_impute = pd.DataFrame(
    numeric_imputer.transform(X_test_model[col_numeric]),
    columns=col_numeric,
    index=X_test_model.index,
)

In [None]:
## Categorical columns: fill missing values with the most frequent value of each column.
## The imputer is fitted on the labelled rows; the same fitted imputer fills the test rows.
category_imputer = SimpleImputer(strategy='most_frequent')

## Column names and the row index are restored when wrapping the arrays into DataFrames.
X_full_model_category_impute = pd.DataFrame(
    category_imputer.fit_transform(X_full_model[good2_col_category]),
    columns=good2_col_category,
    index=X_full_model.index,
)

X_test_model_category_impute = pd.DataFrame(
    category_imputer.transform(X_test_model[good2_col_category]),
    columns=good2_col_category,
    index=X_test_model.index,
)

In [None]:
# ## one-hot encoding here
# oh_encoder = OneHotEncoder(handle_unknown = 'ignore',sparse = False)
# X_full_model_category_impute_oh = pd.DataFrame(oh_encoder.fit_transform(X_full_model_category_impute[good2_col_category]))
# X_full_model_category_impute_oh.index = X_full_model_category_impute.index
# X_full_model_category_impute_oh.head()

# X_test_model_category_impute_oh = pd.DataFrame(oh_encoder.transform(X_test_model_category_impute[good2_col_category]))
# X_test_model_category_impute_oh.index = X_test_model_category_impute.index

## ordinal encoding is better for Random Forest
## The encoder is fitted once, on the labelled data. The encoded frame keeps the original
## column names so that joining it with the (string-named) numeric block does not produce
## a mix of integer and string column labels.
or_encoder = OrdinalEncoder()
X_full_model_category_impute_or = pd.DataFrame(
    or_encoder.fit_transform(X_full_model_category_impute[good2_col_category]),
    columns=good2_col_category,
    index=X_full_model_category_impute.index,
)

## Test rows go through the already-fitted encoder (transform, not fit_transform), so a
## given category receives the same integer code in the test set as in the training data.
X_test_model_category_impute_or = pd.DataFrame(
    or_encoder.transform(X_test_model_category_impute[good2_col_category]),
    columns=good2_col_category,
    index=X_test_model_category_impute.index,
)

In [None]:
## Join the imputed numeric block with the imputed + encoded categorical block.
## One-hot variant (kept for reference):
# X_full_model_ready = pd.concat([X_full_model_numeric_impute,X_full_model_category_impute_oh],axis = 1)
# X_test_model_ready = pd.concat([X_test_model_numeric_impute,X_test_model_category_impute_oh],axis=1)

## Ordinal-encoded variant (the one in use):
X_full_model_ready = pd.concat(
    [X_full_model_numeric_impute, X_full_model_category_impute_or],
    axis='columns',
)
X_test_model_ready = pd.concat(
    [X_test_model_numeric_impute, X_test_model_category_impute_or],
    axis='columns',
)

In [None]:
## Fit a 2000-tree random forest on all labelled rows, then show its mean absolute
## error averaged over 5 cross-validation folds.
rf_model1 = RandomForestRegressor(n_estimators=2000, random_state=0)
rf_model1.fit(X_full_model_ready, y_full_model)
## The scorer returns negated MAE per fold, hence the sign flip before averaging.
(-cross_val_score(
    rf_model1, X_full_model_ready, y_full_model,
    cv=5, scoring='neg_mean_absolute_error',
)).mean()

In [None]:
## Find the optimal number of trees: for each candidate value record the mean absolute
## error averaged over 10 cross-validation folds.
## cross_val_score clones the estimator and fits the clone inside every fold, so fitting
## rf_model1 on the full data beforehand had no effect on the scores and only added run
## time; that extra fit is omitted here.
MAE_sen = []
num_estimators_sen = np.arange(200, 3000, 200)
for n_estimators in num_estimators_sen:
    rf_model1 = RandomForestRegressor(n_estimators=n_estimators, random_state=0, n_jobs=6)
    fold_scores = cross_val_score(rf_model1, X_full_model_ready, y_full_model,
                                  cv=10, scoring='neg_mean_absolute_error')
    ## The scorer returns negated MAE, hence the sign flip.
    MAE_sen.append(np.mean(-1 * fold_scores))

In [None]:
## Plot the cross-validated MAE against the number of trees.
plt.plot(num_estimators_sen, MAE_sen)
plt.xlabel(r'n_estimators', fontsize=16)
plt.ylabel(r"MAE", fontsize=16)

print(MAE_sen)
## Show the best setting together with its score. A notebook cell displays only its last
## expression, so both values are returned as one (n_estimators, MAE) tuple; as two
## separate bare expressions the first one was silently discarded.
best_idx = np.argmin(MAE_sen)
(num_estimators_sen[best_idx], MAE_sen[best_idx])
## NOTE(review): an earlier run suggested n_estimators = 800 was best - confirm against the tuple above.

# cross validation to optimize number of trees

In [None]:
##cross validation
def get_score(n_estimators):
    """Return the mean 5-fold cross-validated MAE of a random forest pipeline.

    The pipeline chains the notebook-level ``pre_processor`` with a
    ``RandomForestRegressor`` (``random_state=0``) and is evaluated on
    ``X_full_model`` / ``y_full_model``.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    clf_pipe = Pipeline(steps=[('preprocessor', pre_processor), ('model', forest)])
    # The scorer yields negated MAE per fold; flip the sign before averaging.
    neg_mae_folds = cross_val_score(clf_pipe, X_full_model, y_full_model,
                                    cv=5, scoring='neg_mean_absolute_error')
    return (-1 * neg_mae_folds).mean()

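# To do: cross_val_score does not yet work with the random forest regressor pipeline (possibly because the ordinal encoder rejects categories that are missing from a training fold; to be confirmed)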