In [519]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,LabelEncoder,OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [520]:
trn_data=pd.read_csv(r".\train.csv")
tst_data=pd.read_csv(r".\test.csv")

In [521]:
trn_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [522]:
#Allocating 1 and 0 to train and test data to later segregate them
trn_data['is_train_data']=1
tst_data['is_train_data']=0

#Concatanating the training and testing data sets
overall_data=pd.concat([trn_data,tst_data],sort=False).reset_index(drop=True)

indicator=pd.DataFrame()
salesprice=pd.DataFrame()
indicator=overall_data['is_train_data']
salesprice=overall_data['SalePrice']

overall_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice        float64
is_train_data      int64
Length: 82, dtype: object

We'll check for the fill rate of various features so that we can cut down those factors which are less affecting the salePrice

In [523]:
null_count=overall_data.isnull().sum()
empty_percent=(overall_data.isnull().sum()*100)/(overall_data.isnull().count())

Fill_rate=pd.concat([null_count.sort_values(ascending=False),100-empty_percent],axis=1,keys=['Empty','Fill Rate'])
Fill_rate.head(20)

Unnamed: 0,Empty,Fill Rate
PoolQC,2909,0.342583
MiscFeature,2814,3.597122
Alley,2721,6.783145
Fence,2348,19.561494
MasVnrType,1766,39.499829
SalePrice,1459,50.017129
FireplaceQu,1420,51.353203
LotFrontage,486,83.350462
GarageCond,159,94.552929
GarageFinish,159,94.552929


Now We'll exclude all those features whose fill rate < 80%

In [524]:
threshold_limit=0.8
overall_data.dropna(thresh=threshold_limit*len(overall_data),inplace=True,axis=1)

Next we'll divide the data into 2 parts - one with numerical values & other with non-numerical values

In [525]:
data_num=overall_data.select_dtypes(exclude=['object'])
data_obj=overall_data.select_dtypes(include=['object'])


Beginning with non-numerical values

In [526]:
# Initialize df_stats with specified columns which indicate the metrics for analysis
df_stats = pd.DataFrame(columns=['column', 'Distinct_value_incl_na', 'Distinct_value_without_na','missing_val', '%_missing_val'])

# List to hold the data for each column
stats = []

for c in data_obj.columns:
    column_stats = {
        'column': c,
        'Distinct_value_incl_na': len(list(data_obj[c].unique())),
        'Distinct_value_without_na': int(data_obj[c].nunique()),
        'missing_val': data_obj[c].isnull().sum(),
        '%_missing_val': (data_obj[c].isnull().sum() / len(data_obj)).round(4) * 100
    }
    stats.append(column_stats)

# Convert the list of dictionaries to a DataFrame
df_stats = pd.DataFrame(stats)
df_stats.head()

Unnamed: 0,column,Distinct_value_incl_na,Distinct_value_without_na,missing_val,%_missing_val
0,MSZoning,6,5,4,0.14
1,Street,2,2,0,0.0
2,LotShape,4,4,0,0.0
3,LandContour,4,4,0,0.0
4,Utilities,3,2,2,0.07


We notice some missing values,So we'll fill up those cells

In [527]:
for col in data_obj.columns:
    mode=data_obj[col].mode().iloc[0]
    data_obj[col].fillna(mode,inplace=True)

#Rechecking the Fill rate for non-numerical Values
empt = data_obj.isnull().sum()
empt_percent = (data_obj.isnull().sum()*100/data_obj.isnull().count())
missing_data = pd.concat([empt.sort_values(ascending=False),100- empt_percent], axis=1, keys=['Total', 'Fill rate'])
missing_data.head(20)

Unnamed: 0,Total,Fill rate
MSZoning,0,100.0
BsmtQual,0,100.0
BsmtExposure,0,100.0
BsmtFinType1,0,100.0
BsmtFinType2,0,100.0
Heating,0,100.0
HeatingQC,0,100.0
CentralAir,0,100.0
Electrical,0,100.0
KitchenQual,0,100.0


We noticed that the non-numerical values can be divided into Nominal and Ordinal types

In [528]:
nominal_cols = ['MSZoning', 'Street','LandContour','Neighborhood','Condition1','Condition2','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','Heating','GarageType','SaleType','SaleCondition']
ordinal_cols = ['ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
                'BsmtFinType2','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','GarageFinish','GarageQual','GarageCond','PavedDrive','LotShape',
                'Utilities','LandSlope','BldgType','HouseStyle','LotConfig']

ordinal=data_obj[ordinal_cols]
nominal=data_obj[nominal_cols]

Encoding the Ordinal Data with Label Encoder

In [529]:
ordinal_encoded=ordinal.copy()
label_encoded={}
for c in ordinal_encoded:
    label_encoded[c]=LabelEncoder()
    ordinal_encoded[c]=label_encoded[c].fit_transform(ordinal[c])

ordinal_encoded.head()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,Electrical,...,GarageFinish,GarageQual,GarageCond,PavedDrive,LotShape,Utilities,LandSlope,BldgType,HouseStyle,LotConfig
0,2,4,2,3,3,2,5,0,1,4,...,1,4,4,2,3,0,0,0,5,4
1,3,4,2,3,1,0,5,0,1,4,...,1,4,4,2,3,0,0,0,2,2
2,2,4,2,3,2,2,5,0,1,4,...,1,4,4,2,0,0,0,0,5,4
3,3,4,3,1,3,0,5,2,1,4,...,2,4,4,2,0,0,0,0,5,0
4,2,4,2,3,0,2,5,0,1,4,...,1,4,4,2,0,0,0,0,5,2


For Nominal Data, We'll use One-Hot Encoding

In [530]:
nominal_encoded=pd.get_dummies(nominal[nominal.columns[:-1]])
# If the encoded DataFrame contains True/False, convert them to 0/1
nominal_encoded=nominal_encoded.astype(int)

nominal_encoded.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LandContour_Bnk,LandContour_HLS,LandContour_Low,...,GarageType_Detchd,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Concatenating the Ordinal and Nominal data and further Applying Stardardization

In [531]:
data_obj=pd.concat([ordinal_encoded,nominal_encoded],axis=1)

scaler=StandardScaler()
data_obj=pd.DataFrame(scaler.fit_transform(data_obj),columns=data_obj.columns)
data_obj.head()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,Electrical,...,GarageType_Detchd,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,-0.755793,0.376461,-0.294203,0.28567,0.624607,-0.449736,0.315909,-0.880324,0.26829,0.300615,...,-0.60334,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.394439
1,0.668455,0.376461,-0.294203,0.28567,-1.1585,-1.546918,0.315909,-0.880324,0.26829,0.300615,...,-0.60334,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.394439
2,-0.755793,0.376461,-0.294203,0.28567,-0.266947,-0.449736,0.315909,-0.880324,0.26829,0.300615,...,-0.60334,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.394439
3,0.668455,0.376461,0.835768,-2.68713,0.624607,-1.546918,0.315909,0.267617,0.26829,0.300615,...,1.657441,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.394439
4,-0.755793,0.376461,-0.294203,0.28567,-2.050054,-0.449736,0.315909,-0.880324,0.26829,0.300615,...,-0.60334,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.394439


Now Working with Numerical data


In [532]:
numerical_features = ['OverallCond', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath','TotRmsAbvGrd', 'GarageCars', 'GarageArea']
#data_num should contain only the specified features
data_num=data_num[numerical_features]

for col in data_num.columns:
    median=data_num[col].median()
    data_num[col].fillna(median,inplace=True)

In [533]:
VIF_data=pd.DataFrame()
VIF_data['features']=numerical_features
VIF_data['VIF']=[variance_inflation_factor(data_num.values,i) for i in range(len(data_num.columns))]
print(VIF_data)

       features           VIF
0   OverallCond     35.973078
1     YearBuilt  10312.548843
2  YearRemodAdd  10854.046685
3   TotalBsmtSF     19.943884
4      1stFlrSF     32.248921
5     GrLivArea     43.085497
6      FullBath     17.117667
7  TotRmsAbvGrd     53.317068
8    GarageCars     34.000905
9    GarageArea     30.664975


In [534]:
data_num['latest_contruction'] = data_num[['YearBuilt', 'YearRemodAdd']].max(axis=1)
data_num.drop(['YearBuilt', 'YearRemodAdd'], axis=1, inplace=True)

trn_data['latest_contruction'] = trn_data[['YearBuilt', 'YearRemodAdd']].max(axis=1)
trn_data.drop(['YearBuilt', 'YearRemodAdd'], axis=1, inplace=True)

In [535]:
spl_features = ['OverallCond', 'latest_contruction', 'TotalBsmtSF', 'GrLivArea', 'GarageArea']
data_num=data_num[spl_features]

data_num=pd.DataFrame(scaler.fit_transform(data_num),columns=data_num.columns)
data_num.head()

Unnamed: 0,OverallCond,latest_contruction,TotalBsmtSF,GrLivArea,GarageArea
0,-0.507284,0.896804,-0.444278,0.413547,0.348888
1,2.188279,-0.395615,0.477158,-0.471891,-0.059804
2,-0.507284,0.848937,-0.299027,0.563755,0.627542
3,-0.507284,-0.68282,-0.671232,0.427382,0.785446
4,-0.507284,0.753202,0.211621,1.378042,1.686426


In [536]:
data_1=pd.concat([data_num,data_obj],axis=1)
data_1['indicator']=indicator
data_1['salePrice']=salesprice
data_1.columns

Index(['OverallCond', 'latest_contruction', 'TotalBsmtSF', 'GrLivArea',
       'GarageArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure',
       ...
       'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI',
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'indicator', 'salePrice'],
      dtype='object', length=148)

In [537]:
train_data=data_1[data_1['indicator']==1].drop(columns=['indicator'])
test_data=data_1[data_1['indicator']==0].drop(columns=['indicator'])

train_data['salePrice']=np.log(train_data['salePrice'])
X=train_data.drop('salePrice',axis=1)
y=train_data['salePrice']

In [538]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.3)
X_train_sld=scaler.fit_transform(X_train)
X_test_sld=scaler.fit_transform(X_test)


In [539]:
kNN_9=KNeighborsClassifier(n_neighbors=5)
kNN_9.fit(X_train_sld,y_train)

y_pred=kNN_9.predict(X_test_sld)

mse=mean_squared_error(y_pred,y_test)

print(f'Mean Squared Error: {mse}')

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)


# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.4785570498685275e+25
