In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
#loading data
def load_data(path):
    return pd.read_csv(path,thousands=',')

data=load_data("data of gurugram real Estate.csv")


In [3]:
#Cleaning Data
def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

data = remove_outliers_iqr(data, ['Area', 'Rate per sqft', 'BHK_Count'])
data=data[data['Flat Type'] != 'Penthouse']

In [4]:
#adding features
data['mult']=data['Area']*data['Rate per sqft']
data=data.drop(['Area','Rate per sqft','Property Type','Socity','Builder Name','Company Name','Locality'],axis=1)


In [5]:
#data split
binss=[0,10000000,20000000,30000000,40000000,50000000,np.inf]
labelss=[1,2,3,4,5,6]
data['mult_cat']=pd.cut(data["mult"],bins=binss,labels=labelss)#spliting column

#Stratified Split
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_idx,test_idx in split.split(data,data["mult_cat"]):
    strat_train_set=data.iloc[train_idx]
    strat_test_set=data.iloc[test_idx]

#Spliting features and labels
strat_test_set=strat_test_set.drop('mult_cat',axis=1)
strat_train_set=strat_train_set.drop('mult_cat',axis=1)
train_labels=strat_train_set['Price']
test_labels=strat_test_set['Price']
trainer=strat_train_set.drop('Price',axis=1)
tester=strat_test_set.drop('Price',axis=1)

In [6]:
#data preprocessing
num_attrs=['BHK_Count','mult']
cat_attr=['Status','RERA Approval','Flat Type']
full_pipeline=ColumnTransformer([
    ("num",StandardScaler(),num_attrs),
    ("cat",OneHotEncoder(handle_unknown="ignore"),cat_attr)
])

trainer_prepared=full_pipeline.fit_transform(trainer)
tester_prepared=full_pipeline.fit_transform(tester)#same pipeline for testing also

In [7]:
#models selection

linReg=LinearRegression()
linReg.fit(trainer_prepared,train_labels)

TreeReg=DecisionTreeRegressor()
TreeReg.fit(trainer_prepared,train_labels)

RanReg=RandomForestRegressor()
RanReg.fit(trainer_prepared,train_labels)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
#standard Errors(RMSE)
lin_preds = linReg.predict(trainer_prepared)
tree_preds = TreeReg.predict(trainer_prepared)
forest_preds = RanReg.predict(trainer_prepared)

# Calculate RMSE
lin_rmse = root_mean_squared_error(train_labels, lin_preds)
tree_rmse = root_mean_squared_error(train_labels, tree_preds)
forest_rmse = root_mean_squared_error(train_labels, forest_preds)
print("Linear Regression RMSE:", lin_rmse)
print("Decision Tree RMSE:", tree_rmse)
print("Random Forest RMSE:", forest_rmse)

Linear Regression RMSE: 459613.5101075227
Decision Tree RMSE: 29067.889409654155
Random Forest RMSE: 174582.02495323188


In [9]:
#cross validation
scores=-cross_val_score(TreeReg,trainer_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
tree_rmse_scores=np.sqrt(scores)
print("--------------------Decision Tree Regresser--------------------")
print("scores:",tree_rmse_scores)
print("Mean:",tree_rmse_scores.mean())

scores=-cross_val_score(linReg,trainer_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
lin_rmse_scores=np.sqrt(scores)
print("--------------------Linear Regressor--------------------")
print("scores:",lin_rmse_scores)
print("Mean:",lin_rmse_scores.mean())

scores=-cross_val_score(RanReg,trainer_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
forest_rmse_scores=np.sqrt(scores)
print("--------------------Random Forest Regresser--------------------")
print("scores:",forest_rmse_scores)
print("Mean:",forest_rmse_scores.mean())

--------------------Decision Tree Regresser--------------------
scores: [733912.47716233 471044.91643594 457938.92204994 639674.84811663
 161240.86080783 788821.9492149  456326.22813979 733537.71507954
 407847.53536051 494249.90000354]
Mean: 534459.5352370947
--------------------Linear Regressor--------------------
scores: [652256.6898924  379593.43327374 477178.58082179 433617.03054845
 136957.05983057 354572.82668184 414138.97956659 704281.93071942
 306153.12261857 472056.32273622]
Mean: 433080.5976689607
--------------------Random Forest Regresser--------------------
scores: [657543.68242557 395967.91950002 437941.42795619 550567.02839231
 176639.65629543 484994.35973617 420878.4798496  676320.84218246
 334491.0611512  483615.58706081]
Mean: 461896.0044549772


In [10]:
#gridsearchCV(Random Forest Regressor)
params=[
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},
]

grid_src=GridSearchCV(RanReg,params,cv=5,scoring='neg_mean_squared_error',return_train_score=True)
grid_src.fit(trainer_prepared,train_labels)

#RMSE caluculations
c=grid_src.cv_results_
for mean_score,prms in zip(c['mean_test_score'],c['params']):
    print(np.sqrt(-mean_score),prms)

878594.5566303822 {'max_features': 2, 'n_estimators': 3}
941205.7024518505 {'max_features': 2, 'n_estimators': 10}
905915.5997122544 {'max_features': 2, 'n_estimators': 30}
982246.8412162084 {'max_features': 4, 'n_estimators': 3}
833905.9395268069 {'max_features': 4, 'n_estimators': 10}
814374.9457184505 {'max_features': 4, 'n_estimators': 30}
774358.9580804247 {'max_features': 6, 'n_estimators': 3}
703618.878464519 {'max_features': 6, 'n_estimators': 10}
648293.8408697395 {'max_features': 6, 'n_estimators': 30}
659516.2449230668 {'max_features': 8, 'n_estimators': 3}
572316.7314001196 {'max_features': 8, 'n_estimators': 10}
503057.2490490484 {'max_features': 8, 'n_estimators': 30}
864707.5584813795 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
833749.5390536491 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
749936.7983297105 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
864600.0563384185 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [11]:
#Feature Importance
feature_tally=grid_src.best_estimator_.feature_importances_
feature_tally
sorted(feature_tally,reverse=True)#tells which feature takes model towards correct outputs

[np.float64(0.9360037135563237),
 np.float64(0.046846081379410424),
 np.float64(0.00536405841068903),
 np.float64(0.0049369149111805535),
 np.float64(0.0019481502601063645),
 np.float64(0.0019455711521312654),
 np.float64(0.000681353228724848),
 np.float64(0.0005847238017292004),
 np.float64(0.000578381929947087),
 np.float64(0.0005557185965933308),
 np.float64(0.0003306009416534919),
 np.float64(0.00020956609762956007),
 np.float64(1.5165733881264313e-05)]

In [12]:
#Final testing


#Linear Rgeressor(Selected Model)
final_predictions=linReg.predict(tester_prepared)
final_rmse=root_mean_squared_error(test_labels,final_predictions)
print("Final Error using Linear Regressor:",final_rmse)

#Random Forest Regressor(Overfits)
final_model=grid_src.best_estimator_
final_predictions2=final_model.predict(tester_prepared)
final_rmse2=root_mean_squared_error(test_labels,final_predictions2)
print("Final Error using Random Forest Regressor:",final_rmse2)

Final Error using Linear Regressor: 350505.7900560198
Final Error using Random Forest Regressor: 605021.8385051964


In [13]:
#saving Predicted Data
tester['expected']=test_labels
tester['predicted(LR)']=final_predictions
tester['predicted(FR)']=final_predictions2
tester['diference(LR)']=tester['expected']-tester['predicted(LR)']
tester['diference(FR)']=tester['expected']-tester['predicted(FR)']

tester.to_csv("predictions.csv")

In [14]:
#Saving Model
joblib.dump(linReg,"ggn_predicter.pkl")

['ggn_predicter.pkl']