In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import collections
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
#import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor

# Function Definitions

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred)/y_true))*100

# Data Analysis

In [3]:
data = pd.read_csv('train.csv')

In [4]:
data_test = pd.read_csv('test.csv') #Data Test for kaggle

### Replace values where NaN has meaning

In [5]:
def replace_NaN_meaning(data):
# columns where NaN values have meaning e.g. no pool etc.
    cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']
    # replace 'NaN' with 'None' in these columns
    for col in cols_fillna:
        data[col].fillna('None',inplace=True)
    
    return data

In [6]:
data = replace_NaN_meaning(data)
data_test = replace_NaN_meaning(data_test)

In [7]:
#data.info()

### check skewness and kutosis

In [8]:
#sns.distplot(data['SalePrice'])

#print('Skewness: %f' % data['SalePrice'].skew())
#print('Kurtosis: %f' % data['SalePrice'].kurt())

##### The data has high skewness -> Many houses were being sold for less than the average value
##### The data has high kurtosis -> Is a indicator that data has many outliers
#### The data SalePrice is not normally distributed, and many ML assume normal distribution, it can be a problem

In [9]:
lis_drop_num_columns = ['SalePrice','Id']
X = data.drop(lis_drop_num_columns,axis=1)
Y = data[['SalePrice']]

id_test_kaggle = data_test[['Id']]
X_kaggle = data_test.drop('Id',axis=1)

# Handle Missing Data

In [10]:
def find_numerical_categorical_columns(X):
    num_columns = X.select_dtypes(exclude=['object'])
    categ_columns = X.select_dtypes(['object'])
    return num_columns, categ_columns

In [11]:
def replace_NaN_numerical(X, num_columns):
    #Replace NAN in numerical column data by the mean of the column
    X[num_columns.columns] = X[num_columns.columns].groupby(num_columns.columns, axis = 1).transform(lambda x: x.fillna(x.mean()))
    return X[num_columns.columns]

In [12]:
def most_frequent_word(col):
    col = [x for x in col if str(x) != 'nan']
    counter = collections.Counter(col)
    return counter.most_common()[0][0]

In [13]:
def replace_NaN_categ(X, categ_columns):
    for col in categ_columns:
        X[col].fillna(most_frequent_word(col),inplace=True)
    return X[categ_columns]

In [14]:
num_columns, categ_columns = find_numerical_categorical_columns(X)
num_columns_kaggle, categ_colums_kaggle = find_numerical_categorical_columns(X_kaggle)

In [15]:
X[num_columns.columns] = replace_NaN_numerical(X, num_columns)
X_kaggle[num_columns.columns] = replace_NaN_numerical(X_kaggle, num_columns_kaggle)

In [16]:
X[categ_columns.columns] = replace_NaN_categ(X, categ_columns.columns)
X_kaggle[categ_colums_kaggle.columns] = replace_NaN_categ(X_kaggle, categ_columns.columns)

In [17]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Normalize numerical data

In [18]:
X[num_columns.columns] = preprocessing.scale(num_columns)
X_kaggle[num_columns.columns] = preprocessing.scale(num_columns_kaggle)

## Using One-Hot encoding

In [19]:
#use one-hot encoding
print(categ_columns.shape)
print(categ_colums_kaggle.shape)
conc_categ_df = pd.concat([categ_columns,categ_colums_kaggle])
#one_hot_encoding = pd.get_dummies(categ_columns)
#one_hot_encoding_kaggle = pd.get_dummies(categ_columns_X_kaggle)

(1460, 43)
(1459, 43)


In [20]:
one_hot_encoding_all = pd.get_dummies(conc_categ_df)

In [21]:
categ_columns.shape[0]

1460

In [22]:
one_hot_encoding = one_hot_encoding_all[:categ_columns.shape[0]]
one_hot_encoding_kaggle = one_hot_encoding_all[categ_columns.shape[0]:]

In [23]:
one_hot_encoding.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_None,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,...,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0


### Join categorical and numerical columns again

In [24]:
X_final = pd.concat([ X[num_columns.columns], one_hot_encoding], axis=1)
X_final_kaggle = pd.concat([ X_kaggle[num_columns.columns], one_hot_encoding_kaggle], axis=1)

In [25]:
X_final.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_New', 'SaleType_None', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=310)

In [26]:
data_final = pd.concat([X_final, Y], axis=1)

In [27]:
data_final = data_final.dropna()

In [28]:
data_final.isnull().values.any()

False

In [29]:
X_fin2 = data_final.drop('SalePrice',axis=1)
Y_fin = data_final[['SalePrice']]

## PCA

In [30]:
#pca = PCA(n_components=0.95)
#X_fin3 = pca.fit_transform(X_fin2)

In [31]:
# Shape before
#print(X_fin2.shape)
#Shapre after
#print(X_fin3.shape)

## Split train and test

In [32]:
#Split train x test
X_train, X_test, y_train, y_test = train_test_split(X_fin2, Y_fin, shuffle=True, random_state=42)

In [33]:
#X_train.head()

In [34]:
#X_train.isnull().values.any()

# Train

#### Linear Regression

In [35]:
reg = LinearRegression().fit(X_train, y_train)

#### Decision Tree

In [36]:
reg = DecisionTreeRegressor().fit(X_train, y_train)

#### Random Florest

In [39]:
reg = RandomForestRegressor().fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


In [38]:
param_grid = {'min_samples_split' : [3,4,6,10], 'n_estimators' : [70,100] }
grid_rf = GridSearchCV(RandomForestRegressor(), param_grid, cv=10, verbose=1)
reg = grid_rf.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estim

KeyboardInterrupt: 

#### Gradient Boosting Regressor

In [40]:
reg2 = GradientBoostingRegressor(max_features='sqrt',loss='huber').fit(X_train, y_train)

#### MLP 

In [45]:
reg = MLPRegressor().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [88]:
X_train.isnull().values.any()

False

In [104]:
param_grid = {'hidden_layer_sizes' : [100,(100,50),(100,50,20)], 'solver' : ['lbfgs', 'sgd', 'adam'],  }
grid_rf = GridSearchCV(MLPRegressor(learning_rate = 'adaptive', activation = 'logistic'), param_grid, cv=10)
reg = grid_rf.fit(X_train, y_train)

#### SVR

In [49]:
reg = SVR().fit(X_train, y_train)

In [None]:
param_grid = {'kernel' : ['rbf','sigmoid'], 'C' : [0.01,0.1,1,10,100,1000], 'gamma': [0.01,0.1,1,10,100]  }
grid_rf = GridSearchCV(SVR(), param_grid, cv=10)
reg = grid_rf.fit(X_train, y_train)

#### Recursive feature elimination with cross-validation#### 

In [41]:
# Create the RFE object and compute a cross-validated score.
rf = RandomForestRegressor()
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=rf, step=1, cv=StratifiedKFold(2))
reg = rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)

Optimal number of features : 219


In [75]:
wx = pd.DataFrame({'rf': reg.predict(X_train), 'gb': reg2.predict(X_train)})
reg3 = LinearRegression().fit(wx, y_train)
weights = reg3.coef_

In [89]:
#rf_weight = weights[0][0] / (weights[0][0] + weights[0][1])
#gb_weight = weights[0][1] / (weights[0][0] + weights[0][1])
rf_weight = weights[0][0]
gb_weight = weights[0][1]

## Test

In [90]:
#test
y_pred = reg.predict(X_test)

### Ensemble regressors

In [91]:
y_pred2 = reg2.predict(X_test)

In [92]:
#y_pred_ens = (y_pred+y_pred2)/2
y_pred_ens = rf_weight * y_pred + gb_weight * y_pred2
y_pred = y_pred_ens

#### Score using RMSE (root mean square error)

In [93]:
mse = mean_squared_error(y_test, y_pred)
score = reg.score(X_test, y_test)
rmse = sqrt(mse)
print(rmse)

30524.76890694931


#### Score using MAPE (mean absolute porcentage error)
###### (MAPE is how far the model’s predictions are off from their corresponding outputs on average)

In [94]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(mape)

49.90638152426219


# Test Kaggle

In [95]:
X_final_kaggle[num_columns.columns] = X_final_kaggle[num_columns.columns].groupby(num_columns.columns, axis = 1).transform(lambda x: x.fillna(x.mean()))

### PCA Test 

In [96]:
#X_final_kaggle_pca = pca.fit_transform(X_final_kaggle)

In [97]:
Y_kaggle = reg.predict(X_final_kaggle)

### ensemble regressors

In [98]:
Y_kaggle2 = reg2.predict(X_final_kaggle)
#Y_kaggle_ens = (Y_kaggle+Y_kaggle2)/2
Y_kaggle_ens = rf_weight * Y_kaggle + gb_weight * Y_kaggle2
Y_kaggle = Y_kaggle_ens

In [99]:
df_submission_kaggle = pd.DataFrame(Y_kaggle, columns =['SalePrice']) 
df_submission_kaggle['Id'] = id_test_kaggle

In [100]:
df_submission_kaggle.to_csv('submission.csv', index=False)

In [101]:
df_submission_kaggle.shape

(1459, 2)