House Prices - Advanced Regression Techniques
https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
import category_encoders as ce

# list of variables whose levels are not present partially in test (so to be removed, just for a rough solution)
notInTestList = ['RoofMatl','HouseStyle','Exterior2nd','BsmtFullBath','BsmtHalfBath', 'MiscFeature','Condition2','Heating','Electrical','GarageCars',
                 'Exterior1st','PoolQC','TotRmsAbvGrd']

# change to categorical/oridnal (from numerical)
listOfCategorical = ['MSSubClass','OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
                     'TotRmsAbvGrd','Fireplaces','GarageCars']
listOfCategorical = [x for x in listOfCategorical if x not in notInTestList]

listOfOrdinal = ['LotShape','LandSlope','OverallQual','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
                 'HeatingQC','KitchenQual','Functional','FireplaceQu','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence']
listOfOrdinal = [x for x in listOfOrdinal if x not in notInTestList]


def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75]) #Get 1st and 3rd quartiles (25% -> 75% of data will be kept)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5) #Get lower bound
    upper_bound = quartile_3 + (iqr * 1.5) #Get upper bound
    return np.where((ys > upper_bound) | (ys < lower_bound)) #Get outlier values
    
def cleanDf(df): 

    for col in (listOfCategorical+listOfOrdinal): 
        frequencies = df[col].value_counts(normalize=True)
        mapping = df[col].map(frequencies)
        df[col].mask(mapping < (0.3/len(frequencies)), 'Other', inplace=True)
    
    for col in listOfCategorical:
        df[col] = df[col].astype('category')
    
    # correlation analysis for numerical variables
#     corrMat = df.corr()
#     listOfFeatures = [i for i in corrMat]
    # which numerical data is weakly correlated with Price?
#     setOfLeastCorrelated = set() 
#     for f in listOfFeatures :
#         if abs(corrMat[f]['SalePrice']) < 0.1: 
#                 setOfLeastCorrelated.add(f)
    # setOfLeastCorrelated = {'3SsnPorch','BsmtFinSF2','Id', 'LowQualFinSF', 'MiscVal', 'MoSold', 'PoolArea', 'YrSold'}   
    setOfLeastCorrelated = {'3SsnPorch','BsmtFinSF2','LowQualFinSF', 'MiscVal', 'PoolArea'}   
    # But for now we keep YrSold and MoSold to use later for computing the Age of the home
    df.drop(setOfLeastCorrelated, axis=1, inplace=True)

    # which numerical data is strongly correlated with Price?
#     setOfMostCorrelated = set() 
#     for f in listOfFeatures :
#         if abs(corrMat[f]['SalePrice']) > 0.6: 
#                 setOfMostCorrelated.add(f)
    # setOfMostCorrelated  = {'1stFlrSF', 'GarageArea', 'GrLivArea', 'SalePrice', 'TotalBsmtSF'}
    
#     listOfFeatures = [i for i in corrMat]
#     setOfDroppedFeatures = set() 
#     for i in range(len(listOfFeatures)) :
#         for j in range(i+1,len(listOfFeatures)): #Avoid repetitions 
#             feature1=listOfFeatures[i]
#             feature2=listOfFeatures[j]
#             if abs(corrMat[feature1][feature2]) > 0.8: #If the correlation between the features is > 0.8
#                 setOfDroppedFeatures.add(feature1) #Add one of them to the set
    # Someone tried different values of threshold and 0.8 was the one that gave the best results

    # data = data.drop(setOfDroppedFeatures, axis=1)
    # setOfDroppedFeatures = {'TotalBsmtSF', 'YearBuilt'} I do not touch YearBuilt for now
    # TotalBsmtSF is highly correlated with 1stFlrSF and both of them are in the list of MostCorrelated
    # For now, I don't delete any of them

    # computing Age and remocing unnecessary columns
    df['Age'] = df['YrSold'] - df['YearBuilt']
    df['AgeOfRemodel'] = df['YrSold'] - df['YearRemodAdd']
    df['YrSold'] = df['YrSold'].astype('category') # as there are only 5 yeras
    df.drop(['YearRemodAdd','YearBuilt','MoSold'], axis=1, inplace=True)    
    
    # total surface
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df.drop(['TotalBsmtSF','1stFlrSF','2ndFlrSF'], axis=1, inplace=True)    
    
    df.drop(notInTestList, axis=1, inplace=True)    
    
    # drop columns using t-test and wilcoxon test
    setOfWeakestTtest = ['Street','Utilities']
    df.drop(setOfWeakestTtest, axis=1, inplace=True)

    # NA is meaningfull somwehre
    df['Alley'].fillna("No alley access") 
    df['BsmtQual'].fillna("No basement") 
#     df = pd.get_dummies(df, columns=['BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2'])
    ordinalEncoder = ce.PolynomialEncoder(cols=listOfOrdinal, drop_invariant=True)
    df = ordinalEncoder.fit_transform(df)
    
    df = pd.get_dummies(df, dummy_na=True)
    # house co located are similar in size 
#     df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    df.fillna(0, inplace=True)
           
    return df

In [32]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

train.drop('Id', axis=1, inplace=True)
train = cleanDf(train)
newdf = train
for column in train:
    outlierValuesList = np.ndarray.tolist(outliers_iqr(newdf[column])[0]) #outliers_iqr() returns an array
    train = newdf.drop(outlierValuesList)

ids = test[['Id']]
test.drop('Id', axis=1, inplace=True)
test = cleanDf(test)

diff1List = list(set(train.keys()) - set(test.keys()))
diff1List.remove('SalePrice')
diff2List = list(set(test.keys()) - set(train.keys()))
test[diff1List] = 0
test.drop(diff2List, axis=1, inplace=True)

numericCols = list(train.select_dtypes(include = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).keys())
numericCols.remove('SalePrice')

# random splitting
from sklearn.model_selection import train_test_split
train, dev = train_test_split(train, test_size=0.2, random_state=42)

scalerX =StandardScaler()
scalerX.fit(train[numericCols])
train[numericCols] = scalerX.transform(train[numericCols])
dev[numericCols] = scalerX.transform(dev[numericCols])
test[numericCols] = scalerX.transform(test[numericCols])

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [33]:
# scalery = StandardScaler()
# scalery.fit(train[['SalePrice']])
train['SalePrice'] = np.log1p(train[['SalePrice']])
dev['SalePrice'] = np.log1p(dev[['SalePrice']])
X = train.drop('SalePrice', axis=1)
y = train['SalePrice']
Xp = dev.drop('SalePrice', axis=1)
yp = dev['SalePrice']

In [67]:
# load packages
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor


In [35]:
# linear regression
regLin = LinearRegression()
regLin.fit(X, y)
y_pred = regLin.predict(X)
yp_pred = regLin.predict(Xp)
print("training accuracy = %.3f" %mean_squared_error(y.T, y_pred), "\nvalidation accuracy = %.3f" %mean_squared_error(yp.T, yp_pred))
# something is wrong but idk!

training accuracy = 0.014 
validation accuracy = 6786807782557477888.000


In [51]:
# support vector machine
param_grid = [ {'C': [1,10, 100, 1000]}, {'kernel': ['linear','poly','rbf']} ] 
regSVM = SVR()
regSVM_grdSearch = GridSearchCV(regSVM, param_grid)
regSVM_grdSearch.fit(X, y)
y_pred = regSVM_grdSearch.predict(X)
yp_pred = regSVM_grdSearch.predict(Xp)
print("training accuracy = %.3f" %mean_squared_error(y.T, y_pred), "\nvalidation accuracy = %.3f" %mean_squared_error(yp.T, yp_pred))

training accuracy = 0.018 
validation accuracy = 0.019


In [64]:
# gradient boosting
param_grid = [ {'learning_rate': [0.1,0.05,0.01]}, {'n_estimators': [100,200,400]}, {'max_depth': range(3,10)} ] 
regBoost = GradientBoostingRegressor()
regBoost_grdSearch = GridSearchCV(regBoost, param_grid)
regBoost_grdSearch.fit(X, y)
y_pred = regBoost_grdSearch.predict(X)
yp_pred = regBoost_grdSearch.predict(Xp)
print("training accuracy = %.3f" %mean_squared_error(y.T, y_pred), "\nvalidation accuracy = %.3f" %mean_squared_error(yp.T, yp_pred))

training accuracy = 0.004 
validation accuracy = 0.019


In [66]:
# random forest
param_grid = [ {'n_estimators': [100,200,400]}, {'max_depth': range(3,10)} ] 
regFoerst = RandomForestRegressor()
regForest_grdSearch = GridSearchCV(regForest, param_grid)
regForest_grdSearch.fit(X, y)
y_pred = regForest_grdSearch.predict(X)
yp_pred = regForest_grdSearch.predict(Xp)
print("training accuracy = %.3f" %mean_squared_error(y.T, y_pred), "\nvalidation accuracy = %.3f" %mean_squared_error(yp.T, yp_pred))

training accuracy = 0.003 
validation accuracy = 0.023


In [81]:
# voting
regVoting = VotingRegressor([('svm',regSVM_grdSearch), ('boost',regBoost_grdSearch), ('rf',regForest_grdSearch)])
regVoting.fit(X, y)
y_pred = regVoting.predict(X)
yp_pred = regVoting.predict(Xp)
print("training accuracy = %.3f" %mean_squared_error(y.T, y_pred), "\nvalidation accuracy = %.3f" %mean_squared_error(yp.T, yp_pred))

training accuracy = 0.005 
validation accuracy = 0.018


In [84]:
predictions = np.expm1(regSVM_grdSearch.predict(test))
results = ids.assign(SalePrice = predictions) # assign predictions to ids
results.to_csv("results.csv", index=False) # write the final dataset to a csv file.