In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('house-prices/train.csv',usecols=['MSZoning', 'LotFrontage', 'LotArea', 'BldgType', 'OverallQual', 'OverallCond', 'YearBuilt', 'Foundation', 'BsmtCond', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' , 'GrLivArea' , 'KitchenQual' ,'TotRmsAbvGrd' , 'GarageType' , 'GarageCars' , 'GarageArea' , 'GarageQual' , 'GarageCond' , 'PoolArea' ,  'MiscVal' , 'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' , 'ExterQual', 'ExterCond' , 'BsmtQual', 'CentralAir', 'YrSold', 'SalePrice'])
df

Unnamed: 0,MSZoning,LotFrontage,LotArea,BldgType,OverallQual,OverallCond,YearBuilt,RoofStyle,RoofMatl,Exterior1st,...,GarageCars,GarageArea,GarageQual,GarageCond,PoolArea,MiscVal,YrSold,SaleType,SaleCondition,SalePrice
0,RL,65.0,8450,1Fam,7,5,2003,Gable,CompShg,VinylSd,...,2,548,TA,TA,0,0,2008,WD,Normal,208500
1,RL,80.0,9600,1Fam,6,8,1976,Gable,CompShg,MetalSd,...,2,460,TA,TA,0,0,2007,WD,Normal,181500
2,RL,68.0,11250,1Fam,7,5,2001,Gable,CompShg,VinylSd,...,2,608,TA,TA,0,0,2008,WD,Normal,223500
3,RL,60.0,9550,1Fam,7,5,1915,Gable,CompShg,Wd Sdng,...,3,642,TA,TA,0,0,2006,WD,Abnorml,140000
4,RL,84.0,14260,1Fam,8,5,2000,Gable,CompShg,VinylSd,...,3,836,TA,TA,0,0,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,1Fam,6,5,1999,Gable,CompShg,VinylSd,...,2,460,TA,TA,0,0,2007,WD,Normal,175000
1456,RL,85.0,13175,1Fam,6,6,1978,Gable,CompShg,Plywood,...,2,500,TA,TA,0,0,2010,WD,Normal,210000
1457,RL,66.0,9042,1Fam,7,9,1941,Gable,CompShg,CemntBd,...,1,252,TA,TA,0,2500,2010,WD,Normal,266500
1458,RL,68.0,9717,1Fam,5,6,1950,Hip,CompShg,MetalSd,...,1,240,TA,TA,0,0,2010,WD,Normal,142125


In [3]:
df['YearBuilt'] = pd.to_datetime(df['YearBuilt']).dt.year
df['YrSold'] = pd.to_datetime(df['YrSold']).dt.year

In [4]:
removable_col = [x for x in df.columns if (df[x].isnull().sum() > 0)]
removable_col

['LotFrontage',
 'BsmtQual',
 'BsmtCond',
 'GarageType',
 'GarageQual',
 'GarageCond']

In [5]:
# df = df[removable_col].dropna()

In [6]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=46)

In [8]:
X_train

Unnamed: 0,MSZoning,LotFrontage,LotArea,BldgType,OverallQual,OverallCond,YearBuilt,RoofStyle,RoofMatl,Exterior1st,...,GarageType,GarageCars,GarageArea,GarageQual,GarageCond,PoolArea,MiscVal,YrSold,SaleType,SaleCondition
358,RL,92.0,6930,1Fam,5,4,1970,Hip,CompShg,Wd Sdng,...,BuiltIn,1,288,TA,TA,0,0,1970,WD,Abnorml
1423,RL,,19690,1Fam,6,7,1970,Flat,Tar&Grv,Plywood,...,Attchd,2,432,Gd,Gd,738,0,1970,WD,Alloca
1073,RL,75.0,7950,1Fam,6,6,1970,Hip,CompShg,HdBoard,...,Attchd,2,440,TA,TA,0,0,1970,WD,Normal
332,RL,85.0,10655,1Fam,8,5,1970,Gable,CompShg,VinylSd,...,Attchd,3,880,TA,TA,0,0,1970,WD,Normal
225,RM,21.0,1680,Twnhs,5,5,1970,Gable,CompShg,HdBoard,...,Detchd,1,280,TA,TA,0,0,1970,COD,Abnorml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,RL,93.0,12090,1Fam,8,5,1970,Hip,CompShg,VinylSd,...,BuiltIn,2,420,TA,TA,0,0,1970,WD,Abnorml
442,RM,52.0,6240,1Fam,5,7,1970,Gable,CompShg,MetalSd,...,Detchd,2,360,TA,TA,0,0,1970,WD,Normal
1396,RL,,57200,1Fam,5,5,1970,Gable,CompShg,Wd Sdng,...,Detchd,2,572,TA,TA,0,0,1970,WD,Normal
837,RM,21.0,1680,Twnhs,6,5,1970,Gable,CompShg,HdBoard,...,Detchd,1,264,TA,TA,0,0,1970,WD,Normal


In [9]:
categoricalColumns = ['MSZoning', 'BldgType',  'Foundation', 'BsmtCond',   'KitchenQual' , 'GarageType' ,  'GarageQual' , 'GarageCond' ,  'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' , 'ExterQual', 'ExterCond' , 'BsmtQual', 'CentralAir']

cat_impute = ['BsmtCond','GarageType','GarageQual' , 'GarageCond' , 'BsmtQual']

ordinalColumns = ['BsmtCond', 'KitchenQual' ,'GarageQual' , 'GarageCond' ,'ExterQual', 'ExterCond' , 'BsmtQual']
oneHotColumns = ['MSZoning', 'BldgType',  'Foundation',    'GarageType' ,    'SaleType' ,  'SaleCondition' ,'RoofStyle', 'RoofMatl' , 'Exterior1st', 'Exterior2nd' ,  'CentralAir']

In [10]:
numaricalColumns = ['LotFrontage','OverallQual', 'OverallCond', 'LotArea','YearBuilt','YrSold', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' ,'GrLivArea' ,'TotRmsAbvGrd' , 'GarageCars' , 'GarageArea' ,'PoolArea' , 'MiscVal']

num_std= ['OverallQual', 'OverallCond', 'LotArea','YearBuilt','YrSold', 'TotalBsmtSF', '1stFlrSF' ,'2ndFlrSF' ,'GrLivArea' ,'TotRmsAbvGrd' , 'GarageCars' , 'GarageArea' ,'PoolArea' , 'MiscVal']

num_im_st = ['LotFrontage']

In [11]:
X_train.columns

Index(['MSZoning', 'LotFrontage', 'LotArea', 'BldgType', 'OverallQual',
       'OverallCond', 'YearBuilt', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'TotalBsmtSF', 'CentralAir', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'KitchenQual', 'TotRmsAbvGrd', 'GarageType', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PoolArea', 'MiscVal',
       'YrSold', 'SaleType', 'SaleCondition'],
      dtype='object')

In [12]:
numaricalColumns


['LotFrontage',
 'OverallQual',
 'OverallCond',
 'LotArea',
 'YearBuilt',
 'YrSold',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea',
 'PoolArea',
 'MiscVal']

# Impute and Scalling

In [13]:
numerical_transfromer = Pipeline(
    steps=[
        ("num_imputer",SimpleImputer(strategy='mean')),
        ("numScaler",StandardScaler())
    ]
)


categorical_ordi_transformer = Pipeline(
    steps=[
        ("cat_imputer1",SimpleImputer(strategy='most_frequent')),
        ("ord_encod",OrdinalEncoder(categories=[
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['Po' ,'Fa' ,'TA' ,'Gd','Ex'],
            ['NA',	'Po' ,'Fa' ,'TA' ,'Gd','Ex']
        ]))
    ]
)

categorical_oneHot_transformer = Pipeline(
    steps=[
        ("cat_imputer1",SimpleImputer(strategy='most_frequent')),
        ("ohe_encod",OneHotEncoder(handle_unknown='ignore'))
    ]
)


# Using transformer

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('imp_stand',numerical_transfromer,numaricalColumns),
        ('cat_ordi',categorical_ordi_transformer,ordinalColumns),
        ('cat_oneHot',categorical_oneHot_transformer,oneHotColumns)
    ],remainder='passthrough'    
)

In [15]:
# clf = Pipeline(
#     steps=[
#         ('preprocessor',preprocessor),
#         ('classifier',LinearRegression())
#     ]
# )

In [16]:
# clf.fit(X_train,X_test)
X_train_trf = preprocessor.fit_transform(X_train)
X_test_trf = preprocessor.transform(X_test)


In [17]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)

X_train_trf = pca.fit_transform(X_train_trf)
X_test_trf = pca.transform(X_test_trf)

In [18]:
lr = LinearRegression()

lr.fit(X_train_trf,y_train)

In [19]:

y_pred = lr.predict(X_test_trf)

In [20]:

r2_score(y_test,y_pred)

0.8164074294266068

In [21]:
# import plotly.express as px
# 
# y_train_trf = y_train.astype(str)
# fig = px.scatter_3d(df, x=X_train_trf[:, 0], y=X_train_trf[:, 1], z=X_train_trf[:, 2],
#                     color=y_train_trf)
# fig.update_layout(
#     margin=dict(l=20, r=20, t=20, b=20),
#     paper_bgcolor="LightSteelBlue",
# )
# fig.show()