House Prices - Advanced Regression Techniques

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=data_description.txt
    

In [126]:
# Import all the libraries required
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor 
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
from pandas_profiling import ProfileReport


In [127]:
# Load the training data from train.csv (path is relative to the working directory).
traindf = pd.read_csv("train.csv")

In [128]:
# Preview the first five rows.
traindf.head(5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [129]:
# Number of rows and columns in the training data
traindf.shape

(1460, 81)

In [130]:
# Column names, non-null counts and data types
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [131]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns
traindf.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [132]:
# Pandas profiling eases the descriptive analysis by summarising the dataset in an HTML report with charts.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so the report may never have been written.
profile = ProfileReport(traindf)
profile.to_file('Pandas_profiling.html')

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=94.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Drop the Id column: it is a unique identifier for each record and has no
# dependence on the target variable.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run after Id has already been removed.
traindf = traindf.drop(columns='Id', errors='ignore')

## Find the variables that have missing values, and the % of values missing in each

## These steps calculate the % of missing values per column and build a new dataframe listing each column that has missing values together with that percentage

In [None]:
# Compute the percentage of missing values per column and build a dataframe
# with one row per column that has missing values:
#   missingcolumn - the column name
#   Missingpct    - the percentage of rows where that column is null
cols_with_null = (traindf.isnull().sum() / traindf.shape[0]) * 100
cols_with_null = cols_with_null.sort_values(ascending=False)
# Name the index and the values explicitly. The previous form passed a set as
# `columns`, which is unordered and is rejected by recent pandas versions.
cols_with_nulldf = cols_with_null.rename_axis('missingcolumn').reset_index(name='Missingpct')
cols_with_nulldf = cols_with_nulldf[cols_with_nulldf['Missingpct'] > 0]
cols_with_nulldf

In [None]:
# Names of the columns in which more than 90 % of the values are missing
mostly_missing = cols_with_nulldf['Missingpct'] > 90
null_cols_to_drop = cols_with_nulldf.loc[mostly_missing, 'missingcolumn'].to_list()
null_cols_to_drop

In [None]:
# For every column with missing values, compare the mean SalePrice of the rows
# where the value is missing (1) with the rows where it is present (0).
for variable in cols_with_nulldf['missingcolumn'].values:
    # Build only the 0/1 indicator rather than copying the whole frame each time.
    is_missing = traindf[variable].isnull().astype(int).rename(variable)
    # Print a one-line summary instead of dumping the full indicator Series.
    print(variable, 'missing rows:', int(is_missing.sum()))

    traindf['SalePrice'].groupby(is_missing).mean().plot.bar()
    plt.title(variable)
    plt.ylabel('Mean SalePrice')
    plt.show()

In [None]:
# Identify the columns where a single value accounts for more than 85% of the
# non-null rows; these near-constant columns are collected for dropping.
not_nul_col_to_drop = []
for features in traindf.columns:
    # normalize=True gives each value's share directly (nulls are excluded),
    # avoiding a re-summation of all counts for every element.
    value_pct = traindf[features].value_counts(normalize=True) * 100
    print(value_pct.head(5))
    print('\n')
    # value_counts is sorted descending, so the first entry is the dominant value.
    # An all-null column has no entries at all and is skipped.
    if not value_pct.empty and value_pct.iloc[0] > 85:
        print('column to be dropped ', features)
        print('\n')
        not_nul_col_to_drop.append(features)

In [None]:
# Show the near-constant columns and how many were found.
print(not_nul_col_to_drop)
print(len(not_nul_col_to_drop))

In [None]:
# Union of the two drop lists; using a set removes any column flagged by both rules.
print(type(null_cols_to_drop))
print(type(not_nul_col_to_drop))
cols_to_drop = set(null_cols_to_drop) | set(not_nul_col_to_drop)
print(len(cols_to_drop))

In [None]:
# Build traindf_drop: traindf without the mostly-missing and near-constant columns.
traindf_drop = traindf.drop(columns=list(cols_to_drop))

In [None]:

# Check which columns remain after the drop, with their non-null counts and dtypes.
traindf_drop.info()

In [None]:
# Histogram (30 bins) of each numeric column on one large figure.
traindf_drop.hist(bins=30, figsize=(50,40 ))


In [None]:
# Split the column names into two groups, numerical and categorical (object
# dtype), so that missing values can be filled separately for each group.
object_mask = (traindf_drop.dtypes == 'object').to_numpy()
numeric_features = traindf_drop.columns[~object_mask].tolist()
Object_features = traindf_drop.columns[object_mask].tolist()
print(' Numeric features ', len(numeric_features),' columns \n', numeric_features,'\n\n')
print(' Object features ', len(Object_features),' columns \n', Object_features,'\n\n')

In [None]:
# Plain list of every column name that had missing values in traindf.
cols_list_with_null = cols_with_nulldf['missingcolumn'].tolist() 


In [None]:
# Of the remaining columns, pick out those that contain missing values,
# separately for the numeric and the categorical group.
null_lookup = set(cols_list_with_null)

numeric_null = [name for name in numeric_features if name in null_lookup]
print('numeric columns that has null ', numeric_null)

catagory_null = [name for name in Object_features if name in null_lookup]
print('catagory columns that has null', catagory_null)

In [None]:
# Fill the missing data in the numeric columns with the mean value of each column.
for feature in numeric_null:
    print('Before replacing Null')
    print(traindf_drop[feature].isnull().sum())
    # Assign the result back to the frame. An inplace fillna on a selected
    # column is chained assignment and may leave traindf_drop unchanged in
    # recent pandas versions.
    traindf_drop[feature] = traindf_drop[feature].fillna(traindf_drop[feature].mean())

for feature in numeric_null:
    print('After replacing Null')
    print(traindf_drop[feature].isnull().sum())
    

In [None]:
# Re-check non-null counts after filling the numeric columns.
traindf_drop.info()

In [None]:
# Fill the missing data in the categorical columns with the most frequent
# value (mode) of each column.
# TODO(rework): filling with an explicit 'missing' category was the previously
# tried alternative to the mode.
for feature in catagory_null:
    print('NUmber of records with null Before replacing Null', feature, traindf_drop[feature].isnull().sum())
    # mode() returns a Series (there can be ties); take its first entry as the
    # scalar fill value, and assign back instead of using a chained inplace fillna.
    traindf_drop[feature] = traindf_drop[feature].fillna(traindf_drop[feature].mode()[0])

for feature in catagory_null:
    print('NUmber of records with null after replacing Null ', feature, traindf_drop[feature].isnull().sum())
    

In [None]:
# Derive "age at sale" features from the year columns (the original year
# columns are removed in the following cell).
sold_year = traindf_drop['YrSold']
traindf_drop['YearBuilt_Age'] = sold_year - traindf_drop['YearBuilt']
traindf_drop['YearRemodAdd_Age'] = sold_year - traindf_drop['YearRemodAdd']

# GarageYrBlt is cast to int before the subtraction, which truncates any
# fractional values.
garage_year = traindf_drop['GarageYrBlt'].astype('int')
traindf_drop['GarageYrBlt'] = garage_year
traindf_drop['GarageYrBlt_Age'] = traindf_drop['YrSold'] - traindf_drop['GarageYrBlt']
 

In [None]:
# Remove the original year columns now that the age features replace them.
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
traindf_drop.drop(columns=year_columns, inplace=True)

In [None]:
# Confirm the age columns were added and the year columns removed.
traindf_drop.info()

In [None]:
# Frequency of each MasVnrType value.
traindf_drop.MasVnrType.value_counts()

In [None]:
# Print the remaining null count for every column.
for feature, null_count in traindf_drop.isnull().sum().items():
    print(feature ,'     => ', null_count)

In [None]:
# Rows whose MasVnrType is the literal string 'None'.
traindf_drop[traindf_drop['MasVnrType'] == 'None']

In [None]:
# Relabel the literal 'None' category as 'NNN'.
# NOTE(review): presumably so the category cannot be confused with a missing value — confirm.
traindf_drop['MasVnrType'] = traindf_drop['MasVnrType'].replace('None','NNN')

In [None]:
# Recompute the numeric / object column lists, since the age columns were
# added and the year columns dropped after the lists were first built.
is_object_col = (traindf_drop.dtypes == 'object').to_numpy()
numeric_features = traindf_drop.columns[~is_object_col].tolist()
Object_features = traindf_drop.columns[is_object_col].tolist()
print(' Numeric features ', len(numeric_features),' columns \n', numeric_features,'\n\n')
print(' Object features ', len(Object_features),' columns \n', Object_features,'\n\n')

In [None]:
# Take independent copies of the numerical and the categorical columns (both
# now free of nulls). Real copies are needed because later cells modify these
# frames column by column; without .copy() those writes hit a slice of
# traindf_drop. The two parts are merged back together further down.
train_drop_num1 = traindf_drop[numeric_features].copy()
train_drop_cat1 = traindf_drop[Object_features].copy()


In [None]:
# Show the Python types of the two sub-frames.
print(type(train_drop_cat1)) 
print(type(train_drop_num1)) 

In [None]:
# Work on a real copy so that encoding leaves train_drop_cat1 (the raw
# categories) untouched; plain assignment would only create a second name for
# the same frame.
train_drop_cat1_copy = train_drop_cat1.copy()

In [None]:
# Preview the first six rows of the categorical frame.
train_drop_cat1_copy.head(6)

In [None]:
# Frequency of each Fence category.
train_drop_cat1.Fence.value_counts()

In [None]:
## LABEL ENCODING
# Replace every categorical column with integer codes.

from sklearn.preprocessing import LabelEncoder

# One encoder object is refitted for each column, so after the loop `le` only
# holds the mapping of the last column processed.
le = LabelEncoder()
for feature in Object_features:
    print('== feature ==' , feature)
    raw_values = train_drop_cat1[feature].to_numpy()
    train_drop_cat1_copy[feature] = le.fit_transform(raw_values)
    print('== feature ==' , train_drop_cat1_copy[feature].unique())
 


In [None]:
# Show the Python type of the encoded categorical frame.
type(train_drop_cat1_copy)

In [None]:
# Preview the label-encoded categorical columns.
train_drop_cat1_copy.head()

In [None]:
## GET DUMMIES
# One-hot encode the (already label-encoded) categorical columns; the first
# level of each column is dropped.
train_drop_cat_dum = pd.get_dummies(
    train_drop_cat1_copy,
    columns=Object_features,
    drop_first=True,
)

In [None]:
# Display the one-hot encoded frame.
train_drop_cat_dum

In [None]:
# Log transformation: summary statistics of the numeric columns before transforming.
train_drop_num1.describe()

In [None]:
# Collect the numeric columns whose minimum is strictly positive; only these
# are log-transformed in the next cell.
# NOTE(review): SalePrice (minimum 34900 in the describe output) qualifies, so
# the target itself appears to end up on a log scale — confirm this is intended.
has_positive_min = (train_drop_num1.min() > 0).to_numpy()
log_non_zero_cols = []
for feature in train_drop_num1.columns[has_positive_min]:
    print(feature, 'log transformed')
    log_non_zero_cols.append(feature)
   
    

In [None]:
# Apply the natural log to the strictly positive numeric columns.
# train_drop_num2 keeps an untransformed snapshot; plain assignment would only
# alias the frame being transformed. Both frames are rebuilt from traindf_drop,
# so re-running this cell does not apply the log twice.
train_drop_num2 = traindf_drop[numeric_features].copy()
train_drop_num1 = train_drop_num2.copy()
for feature in log_non_zero_cols:
    train_drop_num1[feature] = np.log(train_drop_num2[feature])


In [None]:
# Summary statistics of the numeric columns after the log transformation.
train_drop_num1.describe()

In [None]:
# Join the numerical columns and the one-hot encoded categorical columns side
# by side into a single dataframe.
frames_to_join = [train_drop_num1, train_drop_cat_dum]
traindf_EDA = pd.concat(frames_to_join, axis='columns')
traindf_EDA.info()


In [None]:
# Separate the target (SalePrice) from the predictors.
# drop() returns a new frame, so traindf_EDA keeps its SalePrice column and the
# cell can be re-run. The previous in-place drop on an alias removed the column
# from traindf_EDA itself.
trainy = traindf_EDA['SalePrice']
trainx = traindf_EDA.drop(columns='SalePrice')

In [None]:
## Min-max scaler: fit on the predictors and transform them in one step.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that happens further down, so the held-out rows influence the scaling.
from sklearn.preprocessing import MinMaxScaler
mn = MinMaxScaler()
dfmn = mn.fit_transform(trainx)

In [None]:
## Convert the scaled array back to a dataframe, restoring trainx's column names and index

trainx_scaled = pd.DataFrame(dfmn, columns=trainx.columns, index=trainx.index)

In [None]:
#PCA
from sklearn.decomposition import PCA

# Keep up to 100 principal components. PCA rejects n_components larger than
# min(n_samples, n_features), so the request is capped by the data's shape.
n_pca_components = min(100, *trainx_scaled.shape)
pca_object = PCA(n_components=n_pca_components)
# Fit the PCA on the scaled predictors and project them onto the components.
principalComponents = pca_object.fit_transform(trainx_scaled)
principalComponents


In [None]:
# Wrap the principal components in a pandas dataframe (one column per component).
principalDf = pd.DataFrame(data = principalComponents)#creating pandas dataframe from Principal componenets
principalDf


In [None]:

import numpy as np
# Fraction of the total variance explained by each principal component
var= pca_object.explained_variance_ratio_
var


In [None]:

# Cumulative explained variance in percent (each ratio is rounded to 4 decimals before summing)
var1=np.cumsum(np.round(pca_object.explained_variance_ratio_, decimals=4)*100)
var1

In [None]:
# Preview the scaled predictors.
trainx_scaled.head()

In [None]:
## train test split

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the predictors are the PCA components, and random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(principalDf, trainy, test_size=0.3, random_state=1)

In [None]:
# Build the set of candidate models to compare, as (label, estimator) pairs.
# All labels have the same width so the printed results line up.
# The estimators that use randomness get a fixed random_state so the
# comparison gives the same scores on every run.
MODEL_SEED = 10

models = [
    ('LINREG      ', LinearRegression()),
    ('Lasso       ', Lasso()),
    ('XGB         ', XGBRegressor(random_state=MODEL_SEED)),
    ('DECISIONTREE', DecisionTreeRegressor(random_state=MODEL_SEED)),
    ('RANDF       ', RandomForestRegressor(random_state=MODEL_SEED)),
    ('GRADIENT    ', GradientBoostingRegressor(random_state=MODEL_SEED)),
    ('ADABOOST    ', AdaBoostRegressor(random_state=MODEL_SEED)),
    ('EXTRATRE    ', ExtraTreesRegressor(random_state=MODEL_SEED)),
]


In [None]:
# In this set of code, we are iterating the list models and cross validation score is checked for each model.
# Run 10-fold cross-validation for every candidate model and print its mean
# score (cross_val_score uses each estimator's default scorer).
kfold = KFold(n_splits=10)
results = []
model_names = []
for modelname, model in models:
    crossval_result = cross_val_score(model, X_train, y_train, cv=kfold)
    results.append(crossval_result)
    model_names.append(modelname)
    print_data = f"{modelname} : {crossval_result.mean():f} "
    print(print_data)
    

In [None]:

# Grid-search the number of estimators for gradient boosting with 10-fold CV.
seed = 10
model = GradientBoostingRegressor(random_state=seed)
parameter_grid = dict(n_estimators=[50, 100, 150, 200, 300, 400])
k_fold = KFold(n_splits=10)
# Pass the splitter defined in this cell, not `kfold` left over from an earlier cell.
gridsearch_gradient = GridSearchCV(estimator=model, param_grid=parameter_grid, cv=k_fold)
grid_result_gradient = gridsearch_gradient.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the estimator that achieved it.
print('Best score gradient ',grid_result_gradient.best_score_, '\n' ,'Best estimator',grid_result_gradient.best_estimator_)

In [None]:
# Grid-search the number of estimators for extra trees with 10-fold CV.
# The search is fitted on X_train; the standard-scaled copy that used to be
# computed here was never used and has been removed.
seed = 10
model = ExtraTreesRegressor(random_state=seed)
parameter_grid = dict(n_estimators=[50, 100, 150, 200, 300, 400])
k_fold = KFold(n_splits=10)
# Pass the splitter defined in this cell, not `kfold` left over from an earlier cell.
gridsearch_xtra = GridSearchCV(estimator=model, param_grid=parameter_grid, cv=k_fold)
grid_result_xtra = gridsearch_xtra.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the estimator that achieved it.
print('Best score Xtra ',grid_result_xtra.best_score_, '\n' ,'Best estimator',grid_result_xtra.best_estimator_)

In [None]:
# Finalized model: gradient boosting fitted on the training split.
# NOTE(review): n_estimators=400 is hard-coded — presumably the best value found by the grid search; confirm against its output.
seed = 10 
final_model = GradientBoostingRegressor(n_estimators=400,random_state=seed)
final_model.fit(X_train,y_train)

In [None]:
# Predict on the held-out split and report the performance measures.
# NOTE(review): SalePrice appears to have been log-transformed upstream, so the
# error values would be in log units — confirm.
Y_pred = final_model.predict(X_test)
test_mse = mean_squared_error(y_test, Y_pred)
print('R2_score                 - ' ,r2_score(y_test,Y_pred))
print('mean squared error       - ' , test_mse)
print('Root mean squared errorr - ' ,np.sqrt(test_mse))

In [None]:
# Plot the predicted result against the actual target variable to see how
# close the predictions stay to the actual values.
plt.scatter(Y_pred, y_test)
# Label the figure so it can be read without the surrounding code.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Predicted vs actual target')
plt.show()

In [None]:
# Store the trained model as a pickle file, so that it can be re-used directly.
# Save `final_model`, the fitted estimator. `model` only holds the unfitted
# ExtraTreesRegressor that was handed to the grid search.
# The with-block closes the file once the dump has finished.
with open('House_prediction_final_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
