In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import collections
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
#import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor

# Function Definitions

In [65]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred)/y_true))*100

In [66]:
def score(y_test,y_pred):
    #### Score using RMSE (root mean square error)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    print("RMSE score: %f" % rmse)
    
    #### Score using MAPE (mean absolute porcentage error)
    ###### (MAPE is how far the model’s predictions are off from their corresponding outputs on average)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print("MAPE score: %f" % mape)

# Read File

In [67]:
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv') #Data Test for kaggle

### Replace values where NaN has meaning

In [68]:
def replace_NaN_meaning(data):
# columns where NaN values have meaning e.g. no pool etc.
    cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']
    # replace 'NaN' with 'None' in these columns
    for col in cols_fillna:
        data[col].fillna('None',inplace=True)
    
    return data

In [69]:
data = replace_NaN_meaning(data)
data_test = replace_NaN_meaning(data_test)

### Split features and target

In [70]:
lis_drop_num_columns = ['SalePrice','Id']
X = data.drop(lis_drop_num_columns,axis=1)
Y = data[['SalePrice']]

id_test_kaggle = data_test[['Id']]
X_kaggle = data_test.drop('Id',axis=1)

# Handle Missing Data

In [71]:
def find_numerical_categorical_columns(X):
    num_columns = X.select_dtypes(exclude=['object'])
    categ_columns = X.select_dtypes(['object'])
    return num_columns, categ_columns

In [72]:
def replace_NaN_numerical(X, num_columns):
    #Replace NAN in numerical column data by the mean of the column
    X[num_columns.columns] = X[num_columns.columns].groupby(num_columns.columns, axis = 1).transform(lambda x: x.fillna(x.mean()))
    return X[num_columns.columns]

In [73]:
def most_frequent_word(col):
    col = [x for x in col if str(x) != 'nan']
    counter = collections.Counter(col)
    return counter.most_common()[0][0]

In [74]:
def replace_NaN_categ(X, categ_columns):
    for col in categ_columns:
        X[col].fillna(most_frequent_word(col),inplace=True)
    return X[categ_columns]

In [53]:
#Find Numerical and Categorical Columns
num_columns, categ_columns = find_numerical_categorical_columns(X)
num_columns_kaggle, categ_colums_kaggle = find_numerical_categorical_columns(X_kaggle)

#Handle NaN values in numerical data
X[num_columns.columns] = replace_NaN_numerical(X, num_columns)
X_kaggle[num_columns.columns] = replace_NaN_numerical(X_kaggle, num_columns_kaggle)

#Handle NaN values in categorical data
X[categ_columns.columns] = replace_NaN_categ(X, categ_columns.columns)
X_kaggle[categ_colums_kaggle.columns] = replace_NaN_categ(X_kaggle, categ_columns.columns)

In [81]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Normalize numerical data

In [83]:
X[num_columns.columns] = preprocessing.scale(num_columns)
X_kaggle[num_columns.columns] = preprocessing.scale(num_columns_kaggle)

## Using One-Hot encoding

In [84]:
#use one-hot encoding in categorical data
print(categ_columns.shape)
print(categ_colums_kaggle.shape)
conc_categ_df = pd.concat([categ_columns,categ_colums_kaggle])

one_hot_encoding_all = pd.get_dummies(conc_categ_df)

(1460, 43)
(1459, 43)


In [85]:
# Split again between the training kaggle data and test kaggle data
one_hot_encoding = one_hot_encoding_all[:categ_columns.shape[0]]
one_hot_encoding_kaggle = one_hot_encoding_all[categ_columns.shape[0]:]

In [86]:
one_hot_encoding.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_None,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,...,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0


### Join categorical and numerical columns again

In [89]:
#print(X_kaggle[num_columns.columns].shape)
#print(one_hot_encoding_kaggle.shape)

In [59]:
X_final = pd.concat([ X[num_columns.columns], one_hot_encoding], axis=1)
X_final_kaggle = pd.concat([ X_kaggle[num_columns.columns], one_hot_encoding_kaggle], axis=1)
X_final.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_New', 'SaleType_None', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=310)

In [60]:
data_final = pd.concat([X_final, Y], axis=1)
data_final = data_final.dropna()
data_final.isnull().values.any()

False

In [61]:
X_fin = data_final.drop('SalePrice',axis=1)
Y_fin = data_final[['SalePrice']]

## PCA

In [21]:
#pca = PCA(n_components=0.95)
#X_fin3 = pca.fit_transform(X_fin2)

In [22]:
# Shape before
#print(X_fin2.shape)
#Shapre after
#print(X_fin3.shape)

## Split train and test

In [23]:
#Split train x test
X_train, X_test, y_train, y_test = train_test_split(X_fin, Y_fin, shuffle=True, random_state=42)

# Train and prediction

#### Linear Regression

In [None]:
reg1 = LinearRegression().fit(X_train, y_train)
y_pred1 = reg1.predict(X_test)

#### Decision Tree

In [None]:
reg2 = DecisionTreeRegressor().fit(X_train, y_train)
y_pred2 = reg2.predict(X_test)

#### Random Florest

In [24]:
reg3 = RandomForestRegressor().fit(X_train, y_train)
y_pred3 = reg3.predict(X_test)

  """Entry point for launching an IPython kernel.


In [None]:
param_grid = {'min_samples_split' : [3,4,6,10], 'n_estimators' : [70,100] }
grid_rf = GridSearchCV(RandomForestRegressor(), param_grid, cv=10, verbose=1)
reg = grid_rf.fit(X_train, y_train)

#### Gradient Boosting Regressor

In [25]:
reg4 = GradientBoostingRegressor(max_features='sqrt',loss='huber').fit(X_train, y_train)
y_pred4 = reg4.predict(X_test)

#### MLP 

In [None]:
reg5 = MLPRegressor().fit(X_train, y_train)
y_pred5 = reg5.predict(X_test)

In [None]:
param_grid = {'hidden_layer_sizes' : [100,(100,50),(100,50,20)], 'solver' : ['lbfgs', 'sgd', 'adam'],  }
grid_rf = GridSearchCV(MLPRegressor(learning_rate = 'adaptive', activation = 'logistic'), param_grid, cv=10)
reg = grid_rf.fit(X_train, y_train)

#### SVR

In [None]:
reg6 = SVR().fit(X_train, y_train)
y_pred6 = reg6.predict(X_test)

In [None]:
param_grid = {'kernel' : ['rbf','sigmoid'], 'C' : [0.01,0.1,1,10,100,1000], 'gamma': [0.01,0.1,1,10,100]  }
grid_rf = GridSearchCV(SVR(), param_grid, cv=10)
reg = grid_rf.fit(X_train, y_train)

#### Recursive feature elimination with cross-validation#### 

In [None]:
# Create the RFE object and compute a cross-validated score.
rf = RandomForestRegressor()
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=rf, step=1, cv=StratifiedKFold(2))
reg = rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)

## Final Predictor to Combine the predictors

In [26]:
# Train
X_comb = pd.DataFrame({'rf': reg3.predict(X_train), 'gb': reg4.predict(X_train)})
reg_comb = LinearRegression().fit(X_comb, y_train)

#Test
X_comb_test = pd.DataFrame({'rf': y_pred3, 'gb': y_pred4})
y_pred_ens = reg_comb.predict(X_comb_test)

## Score

In [27]:
score(y_test,y_pred_ens)

RMSE score: 32758.972556
MAPE score: 10.589531


# Test Kaggle

In [31]:
X_final_kaggle[num_columns.columns] = X_final_kaggle[num_columns.columns].groupby(num_columns.columns, axis = 1).transform(lambda x: x.fillna(x.mean()))

In [33]:
Y_kaggle = reg3.predict(X_final_kaggle)
Y_kaggle2 = reg4.predict(X_final_kaggle)

### Ensemble Predictor 

In [34]:
X_comb_kaggle = pd.DataFrame({'rf': Y_kaggle, 'gb': Y_kaggle2})
Y_kaggle_ens = reg_comb.predict(X_comb_kaggle)

In [35]:
df_submission_kaggle = pd.DataFrame(Y_kaggle_ens, columns =['SalePrice']) 
df_submission_kaggle['Id'] = id_test_kaggle

In [36]:
df_submission_kaggle.to_csv('submission.csv', index=False)

In [37]:
df_submission_kaggle.shape

(1459, 2)