In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('house-prices/train.csv',usecols=['MSZoning', 'LotFrontage', 'LotArea', 'BldgType', 'OverallQual', 'OverallCond', 'YearBuilt', 'Foundation', 'BsmtCond', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' , 'GrLivArea' , 'KitchenQual' ,'TotRmsAbvGrd' , 'GarageType' , 'GarageCars' , 'GarageArea' , 'GarageQual' , 'GarageCond' , 'PoolArea' ,  'MiscVal' , 'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' , 'ExterQual', 'ExterCond' , 'BsmtQual', 'CentralAir', 'SalePrice'])
df

Unnamed: 0,MSZoning,LotFrontage,LotArea,BldgType,OverallQual,OverallCond,YearBuilt,RoofStyle,RoofMatl,Exterior1st,...,GarageType,GarageCars,GarageArea,GarageQual,GarageCond,PoolArea,MiscVal,SaleType,SaleCondition,SalePrice
0,RL,65.0,8450,1Fam,7,5,2003,Gable,CompShg,VinylSd,...,Attchd,2,548,TA,TA,0,0,WD,Normal,208500
1,RL,80.0,9600,1Fam,6,8,1976,Gable,CompShg,MetalSd,...,Attchd,2,460,TA,TA,0,0,WD,Normal,181500
2,RL,68.0,11250,1Fam,7,5,2001,Gable,CompShg,VinylSd,...,Attchd,2,608,TA,TA,0,0,WD,Normal,223500
3,RL,60.0,9550,1Fam,7,5,1915,Gable,CompShg,Wd Sdng,...,Detchd,3,642,TA,TA,0,0,WD,Abnorml,140000
4,RL,84.0,14260,1Fam,8,5,2000,Gable,CompShg,VinylSd,...,Attchd,3,836,TA,TA,0,0,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,1Fam,6,5,1999,Gable,CompShg,VinylSd,...,Attchd,2,460,TA,TA,0,0,WD,Normal,175000
1456,RL,85.0,13175,1Fam,6,6,1978,Gable,CompShg,Plywood,...,Attchd,2,500,TA,TA,0,0,WD,Normal,210000
1457,RL,66.0,9042,1Fam,7,9,1941,Gable,CompShg,CemntBd,...,Attchd,1,252,TA,TA,0,2500,WD,Normal,266500
1458,RL,68.0,9717,1Fam,5,6,1950,Hip,CompShg,MetalSd,...,Attchd,1,240,TA,TA,0,0,WD,Normal,142125


In [3]:
df.columns

Index(['MSZoning', 'LotFrontage', 'LotArea', 'BldgType', 'OverallQual',
       'OverallCond', 'YearBuilt', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'TotalBsmtSF', 'CentralAir', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'KitchenQual', 'TotRmsAbvGrd', 'GarageType', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PoolArea', 'MiscVal',
       'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')

In [4]:
df.iloc[1,:-1]

MSZoning              RL
LotFrontage         80.0
LotArea             9600
BldgType            1Fam
OverallQual            6
OverallCond            8
YearBuilt           1976
RoofStyle          Gable
RoofMatl         CompShg
Exterior1st      MetalSd
Exterior2nd      MetalSd
ExterQual             TA
ExterCond             TA
Foundation        CBlock
BsmtQual              Gd
BsmtCond              TA
TotalBsmtSF         1262
CentralAir             Y
1stFlrSF            1262
2ndFlrSF               0
GrLivArea           1262
KitchenQual           TA
TotRmsAbvGrd           6
GarageType        Attchd
GarageCars             2
GarageArea           460
GarageQual            TA
GarageCond            TA
PoolArea               0
MiscVal                0
SaleType              WD
SaleCondition     Normal
Name: 1, dtype: object

In [5]:
df['YearBuilt'] = pd.to_datetime(df['YearBuilt']).dt.year

In [6]:
removable_col = [x for x in df.columns if (df[x].isnull().sum() > 0)]
removable_col

['LotFrontage',
 'BsmtQual',
 'BsmtCond',
 'GarageType',
 'GarageQual',
 'GarageCond']

In [7]:
# df = df[removable_col].dropna()

In [8]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=46)

In [10]:
X_train.sample(30)

Unnamed: 0,MSZoning,LotFrontage,LotArea,BldgType,OverallQual,OverallCond,YearBuilt,RoofStyle,RoofMatl,Exterior1st,...,TotRmsAbvGrd,GarageType,GarageCars,GarageArea,GarageQual,GarageCond,PoolArea,MiscVal,SaleType,SaleCondition
628,RL,70.0,11606,1Fam,5,5,1970,Gable,CompShg,Plywood,...,9,Attchd,2,504,TA,TA,0,0,WD,Family
5,RL,85.0,14115,1Fam,5,5,1970,Gable,CompShg,VinylSd,...,5,Attchd,2,480,TA,TA,0,700,WD,Normal
1293,RL,78.0,10140,1Fam,7,5,1970,Gable,CompShg,HdBoard,...,8,Attchd,2,528,TA,TA,0,0,WD,Normal
1254,RL,60.0,6931,1Fam,7,5,1970,Gable,CompShg,VinylSd,...,7,BuiltIn,2,397,TA,TA,0,0,WD,Normal
955,RH,82.0,7136,Duplex,6,6,1970,Gable,CompShg,MetalSd,...,8,Attchd,2,492,TA,TA,0,0,WD,Normal
968,RM,50.0,5925,1Fam,3,6,1970,Gable,CompShg,VinylSd,...,6,,0,0,,,0,0,WD,Abnorml
889,RL,128.0,12160,1Fam,6,4,1970,Hip,CompShg,Wd Sdng,...,6,Attchd,2,505,TA,TA,0,0,WD,Normal
852,RL,53.0,7128,1Fam,7,5,1970,Gable,CompShg,MetalSd,...,7,Detchd,1,240,TA,TA,0,0,WD,Normal
534,RL,74.0,9056,1Fam,8,5,1970,Gable,CompShg,VinylSd,...,6,Attchd,2,403,TA,TA,0,0,WD,Normal
1417,RL,,16545,1Fam,8,5,1970,Gable,CompShg,VinylSd,...,7,Attchd,3,1069,TA,TA,0,0,WD,Normal


In [11]:
X_train['LotArea'].describe()

count      1022.000000
mean      10704.016634
std       10526.169757
min        1300.000000
25%        7545.000000
50%        9565.500000
75%       11712.750000
max      215245.000000
Name: LotArea, dtype: float64

In [12]:
categoricalColumns = ['MSZoning', 'BldgType',  'Foundation', 'BsmtCond',   'KitchenQual' , 'GarageType' ,  'GarageQual' , 'GarageCond' ,  'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' , 'ExterQual', 'ExterCond' , 'BsmtQual', 'CentralAir']

cat_impute = ['BsmtCond','GarageType','GarageQual' , 'GarageCond' , 'BsmtQual']

ordinalColumns = ['BsmtCond', 'KitchenQual' ,'GarageQual' , 'GarageCond' ,'ExterQual', 'ExterCond' , 'BsmtQual']
oneHotColumns = ['MSZoning', 'BldgType',  'Foundation',    'GarageType' ,    'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' ,  'CentralAir']

In [13]:
numaricalColumns = ['LotFrontage','OverallQual', 'OverallCond', 'LotArea','YearBuilt', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' ,'GrLivArea' ,'TotRmsAbvGrd' , 'GarageCars' , 'GarageArea' ,'PoolArea' , 'MiscVal']

num_std= ['OverallQual', 'OverallCond', 'LotArea','YearBuilt', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' ,'GrLivArea' ,'TotRmsAbvGrd' , 'GarageCars' , 'GarageArea' ,'PoolArea' , 'MiscVal']

num_im_st = ['LotFrontage']

In [14]:
X_train.columns

Index(['MSZoning', 'LotFrontage', 'LotArea', 'BldgType', 'OverallQual',
       'OverallCond', 'YearBuilt', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'TotalBsmtSF', 'CentralAir', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'KitchenQual', 'TotRmsAbvGrd', 'GarageType', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PoolArea', 'MiscVal',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [15]:
numaricalColumns


['LotFrontage',
 'OverallQual',
 'OverallCond',
 'LotArea',
 'YearBuilt',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea',
 'PoolArea',
 'MiscVal']

# Impute and Scalling

In [16]:
numerical_transfromer = Pipeline(
    steps=[
        ("num_imputer",SimpleImputer(strategy='mean')),
        ("numScaler",StandardScaler())
    ]
)


categorical_ordi_transformer = Pipeline(
    steps=[
        ("cat_imputer1",SimpleImputer(strategy='most_frequent')),
        ("ord_encod",OrdinalEncoder(categories=[
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex']
        ]))
    ]
)

categorical_oneHot_transformer = Pipeline(
    steps=[
        ("cat_imputer1",SimpleImputer(strategy='most_frequent')),
        ("ohe_encod",OneHotEncoder(handle_unknown='ignore'))
    ]
)


# Using transformer

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('imp_stand',numerical_transfromer,numaricalColumns),
        ('cat_ordi',categorical_ordi_transformer,ordinalColumns),
        ('cat_oneHot',categorical_oneHot_transformer,oneHotColumns)
    ],remainder='passthrough'    
)

In [18]:
# clf = Pipeline(
#     steps=[
#         ('preprocessor',preprocessor),
#         ('classifier',LinearRegression())
#     ]
# )

In [19]:
# clf.fit(X_train,X_test)
X_train_trf = preprocessor.fit_transform(X_train)
X_test_trf = preprocessor.transform(X_test)


In [20]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)

X_train_trf = pca.fit_transform(X_train_trf)
X_test_trf = pca.transform(X_test_trf)

In [21]:
rfr = RandomForestRegressor()

rfr.fit(X_train_trf,y_train)

In [22]:

y_pred = rfr.predict(X_test_trf)

In [23]:

r2_score(y_test,y_pred)

0.8886160750950424

In [24]:
# import plotly.express as px
# 
# y_train_trf = y_train.astype(str)
# fig = px.scatter_3d(df, x=X_train_trf[:, 0], y=X_train_trf[:, 1], z=X_train_trf[:, 2],
#                     color=y_train_trf)
# fig.update_layout(
#     margin=dict(l=20, r=20, t=20, b=20),
#     paper_bgcolor="LightSteelBlue",
# )
# fig.show()

In [25]:
import joblib
joblib.dump(preprocessor,'preprocessor')
joblib.dump(rfr,'price_predictions')
joblib.dump(pca,"PCA")

['PCA']