In [67]:
!pip install xgboost


Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/e2/7b/8c1b410cd0604cee9a167a19f7e1746f5b92ae7d02ad574ab560b73c5a48/xgboost-2.1.1-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 393.8 kB/s eta 0:05:18
   ---------------------------------------- 0.0/124.9 MB 393.8 kB/s eta 0:05:18
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:51
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:51
   ---------------------------------------- 0.5/124.9 MB 1.7 MB/s eta 0:01:12
   -------------------------------------

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

  from pandas.core import (


In [3]:
df = pd.read_csv(r'C:\Users\HARSHIT JAIN\Desktop\data science projects\Real_Estate_Data_science_Project\DATASETS\gurgaon_properties_post_feature_selection_v4\gurgaon_properties_post_feature_selection_v4.csv')

In [4]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,falt,sector 54,5.75,3,4,3,Relatively New,2329.0,1,0,furnished,high,high floor
1,falt,sector 2,3.35,4,4,3+,New Property,5000.0,0,0,semifurnished,medium,low floor
2,falt,sector 69,1.4,3,3,2,Relatively New,1428.0,0,0,semifurnished,medium,medium floor
3,falt,sector 79,1.7,3,2,3,Relatively New,1570.0,0,0,semifurnished,low,medium floor
4,falt,sector 33,1.35,3,2,3+,New Property,1409.0,0,0,semifurnished,low,high floor


In [5]:
X = df.drop(columns=['price'])
y = df['price']

In [6]:
y_transformed = np.log1p(y)

# Ordinal Encoding

In [7]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [9]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [10]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [11]:
scores.mean(),scores.std()

(0.7332383881120567, 0.022535541539423876)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [13]:
pipeline.fit(X_train,y_train)

In [14]:
y_pred = pipeline.predict(X_test)

In [15]:
y_pred = np.expm1(y_pred)

In [16]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9122773716432088

In [17]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [18]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [19]:
model_output = []

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))


In [20]:
model_output

[['linear_reg', 0.7332383881120567, 0.9122773716432088],
 ['svr', 0.7560935354238038, 0.8768539501562563],
 ['ridge', 0.7332429205669009, 0.9121534632345769],
 ['LASSO', 0.05367902025579411, 1.55932834710039],
 ['decision tree', 0.7894304830799158, 0.6632246512641797],
 ['random forest', 0.885016153573658, 0.5313562674006017],
 ['extra trees', 0.8707335386066626, 0.592958451332523],
 ['gradient boosting', 0.8749014783544456, 0.5729641576530117],
 ['adaboost', 0.7535134018588947, 0.8647399527063042],
 ['mlp', 0.8066839402281183, 0.7677425112487076],
 ['xgboost', 0.8896482187967653, 0.527922855973579]]

In [21]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [22]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889648,0.527923
5,random forest,0.885016,0.531356
7,gradient boosting,0.874901,0.572964
6,extra trees,0.870734,0.592958
4,decision tree,0.78943,0.663225
9,mlp,0.806684,0.767743
8,adaboost,0.753513,0.86474
1,svr,0.756094,0.876854
2,ridge,0.733243,0.912153
0,linear_reg,0.733238,0.912277


# OneHotEncoding

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [24]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [25]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [26]:
scores.mean()

0.8535935377844238

In [27]:
scores.std()

0.020401525151815512

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [29]:
pipeline.fit(X_train,y_train)

In [30]:
y_pred = pipeline.predict(X_test)



In [31]:
y_pred = np.expm1(y_pred)

In [32]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6534862177974399

In [33]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [34]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [35]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [36]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [37]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.892159,0.502523
10,xgboost,0.894815,0.505857
6,extra trees,0.893348,0.515183
7,gradient boosting,0.875435,0.567581
9,mlp,0.870921,0.582186
0,linear_reg,0.853594,0.653486
2,ridge,0.85408,0.653498
4,decision tree,0.809108,0.676805
8,adaboost,0.757885,0.850589
1,svr,0.759638,0.871865


# OneHotEncoding With PCA

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown ='ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [39]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [40]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [41]:
scores.mean()

0.056226625771625086

In [42]:
scores.std()

0.01801771294464952

In [43]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [44]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [45]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [46]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [47]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.751404,0.735901
6,extra trees,0.72901,0.768528
4,decision tree,0.683173,0.827882
10,xgboost,0.608233,0.93989
7,gradient boosting,0.609627,1.007941
1,svr,0.227092,1.406477
9,mlp,0.211854,1.444726
8,adaboost,0.300211,1.490105
3,LASSO,0.053866,1.559215
2,ridge,0.056227,1.561283


# Target Encoder

In [48]:
!pip install category_encoders



In [49]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [50]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [51]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [52]:
scores.mean(),scores.std()

(0.8234446963037995, 0.02080199415208476)

In [53]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [54]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [55]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [56]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [57]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.901167,0.475183
6,extra trees,0.900781,0.50035
10,xgboost,0.901047,0.504396
7,gradient boosting,0.887648,0.533169
9,mlp,0.846192,0.645142
4,decision tree,0.81472,0.668178
0,linear_reg,0.823445,0.732825
2,ridge,0.823472,0.733023
8,adaboost,0.819423,0.767905
1,svr,0.772015,0.852708


# Hyperparameter Tuning

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [60]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [61]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [62]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [63]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [64]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


  warn(


In [65]:
final_pipe = search.best_estimator_

In [66]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'auto',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [67]:
search.best_score_

0.9013565894746115

In [68]:
final_pipe.fit(X,y_transformed)

  warn(


# Exporting the model

In [69]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown = "ignore"),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [70]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [71]:
pipeline.fit(X,y_transformed)

In [72]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [73]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [74]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,falt,sector 54,3,4,3,Relatively New,2329.0,1,0,furnished,high,high floor
1,falt,sector 2,4,4,3+,New Property,5000.0,0,0,semifurnished,medium,low floor
2,falt,sector 69,3,3,2,Relatively New,1428.0,0,0,semifurnished,medium,medium floor
3,falt,sector 79,3,2,3,Relatively New,1570.0,0,0,semifurnished,low,medium floor
4,falt,sector 33,3,2,3+,New Property,1409.0,0,0,semifurnished,low,high floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3550,falt,sector 83,3,3,2,Relatively New,1972.0,0,0,unfurnished,medium,low floor
3551,falt,sector 56,1,1,1,Moderately Old,839.0,0,0,semifurnished,medium,medium floor
3552,house,sector 6,4,2,2,Moderately Old,1600.0,0,0,semifurnished,low,low floor
3553,falt,sector 89,3,3,2,New Property,1611.0,0,0,semifurnished,medium,medium floor


# Trying out the predictions

In [75]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [76]:
X.iloc[0].values

array(['falt', 'sector 54', 3, 4, '3', 'Relatively New', 2329.0, 1, 0,
       'furnished', 'high', 'high floor'], dtype=object)

In [96]:
data = [['house', 'sector 87', 4, 4, '3+', 'New Property', 4750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']


one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 87,4,4,3+,New Property,4750,0,0,unfurnished,Low,Low Floor


In [97]:
np.expm1(pipeline.predict(one_df))



array([4.14945932])

In [98]:
X.dtypes

property_type       object
sector              object
bedRoom              int64
bathroom             int64
balcony             object
agePossession       object
built_up_area      float64
servant room         int64
store room           int64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [99]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'new',
 'new sector 2',
 'sector 1',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 10a',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 17a',
 'sector 17b',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 3 phase 2',
 'sector 3 phase 3 extension',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 36a',
 'sector 37',
 'sector 37c',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sect