In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [111]:
pd.set_option('display.max_rows' , None)
pd.set_option('display.max_columns' , None)
pd.set_option('display.max_colWidth' , None)

In [112]:
df = pd.read_csv('gurgaon_properties_post_feature_selection.csv')

In [113]:
df.columns

Index(['Property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category', 'price'],
      dtype='object')

In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3550 entries, 0 to 3549
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Property_type    3550 non-null   float64
 1   sector           3550 non-null   float64
 2   bedRoom          3550 non-null   int64  
 3   bathroom         3550 non-null   int64  
 4   balcony          3550 non-null   float64
 5   agePossession    3550 non-null   float64
 6   built_up_area    3550 non-null   float64
 7   servant room     3550 non-null   int64  
 8   store room       3550 non-null   int64  
 9   furnishing_type  3550 non-null   int64  
 10  luxury_category  3550 non-null   float64
 11  floor_category   3550 non-null   float64
 12  price            3550 non-null   float64
dtypes: float64(8), int64(5)
memory usage: 360.7 KB


In [115]:
df.head(2)

Unnamed: 0,Property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,1.0,18.0,7,7,3.0,2.0,2700.0,0,0,1,1.0,1.0,2.9
1,0.0,87.0,2,2,2.0,1.0,1214.0,0,0,2,2.0,0.0,0.86


In [116]:
df.shape

(3550, 13)

In [117]:
df.isnull().sum()

Unnamed: 0,0
Property_type,0
sector,0
bedRoom,0
bathroom,0
balcony,0
agePossession,0
built_up_area,0
servant room,0
store room,0
furnishing_type,0


In [118]:
# here we apply some things

# 1) one hot encode -> sector, balcony, agePossession, furnishing type, luxury category, floor category
# 2) standarization
# 3) log transform

In [119]:
X = df.drop(columns=['price'])
y = df['price']

In [120]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [121]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [122]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [123]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(drop='first' , handle_unknown='ignore'), columns_to_encode)
    ],
    remainder='passthrough'
)

In [124]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [125]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [126]:
scores.mean()

np.float64(0.8807435756079173)

In [127]:
scores.std()

np.float64(0.01700439065420432)

In [128]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [129]:
pipeline.fit(X_train,y_train)

In [130]:
y_pred = pipeline.predict(X_test)

In [131]:
y_pred = np.expm1(y_pred)

In [132]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.5382658807278019