# <center> Machine Hack Challenge <center>
# <center> Predicting House Prices <center>

## Load libraries

In [1]:
import re
import pickle
import numpy as np
import pandas as pd

import sklearn
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import xgboost
import lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Reading Data

In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
test_data = pd.read_csv('test.csv')

In [4]:
train_data.shape

(29451, 12)

In [5]:
train_data.head(10)

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5
5,Owner,0,0,3,BHK,1250.0,1,1,"South Chittoor,Kochi",10.03328,76.282571,42.0
6,Dealer,0,0,3,BHK,1495.053957,1,1,"Sodala,Jaipur",26.916347,75.7956,66.5
7,Owner,0,1,3,BHK,1181.012946,1,1,"Kharar,Mohali",30.74,76.65,52.0
8,Dealer,0,1,2,BHK,1040.0,1,1,"Bileshivale,Bangalore",13.054202,77.674002,41.6
9,Owner,0,1,2,BHK,879.120879,1,1,"Chromepet,Chennai",12.95161,80.14097,36.0





## Pre-Processing




In [6]:
train_data.BHK_OR_RK.unique()

array(['BHK', 'RK'], dtype=object)

In [7]:
df2 = train_data["ADDRESS"].apply(lambda x: x.rsplit(",",1))
train_data["City"] = df2.apply(lambda x: x[1])

In [8]:
train_data.City.unique()

array(['Bangalore', 'Mysore', 'Ghaziabad', 'Kolkata', 'Kochi', 'Jaipur',
       'Mohali', 'Chennai', 'Siliguri', 'Noida', 'Raigad', 'Bhubaneswar',
       'Wardha', 'Pune', 'Mumbai', 'Nagpur', 'Deoghar', 'Bhiwadi',
       'Faridabad', 'Lalitpur', 'Maharashtra', 'Vadodara',
       'Visakhapatnam', 'Vapi', 'Mangalore', 'Aurangabad', 'Ottapalam',
       'Vijayawada', 'Belgaum', 'Bhopal', 'Lucknow', 'Kanpur',
       'Gandhinagar', 'Pondicherry', 'Agra', 'Ranchi', 'Gurgaon', 'Udupi',
       'Indore', 'Jodhpur', 'Coimbatore', 'Valsad', 'Palghar', 'Surat',
       'Varanasi', 'Guwahati', 'Amravati', 'Anand', 'Tirupati',
       'Secunderabad', 'Raipur', 'Vizianagaram', 'Thrissur', 'Satna',
       'Madurai', 'Chandigarh', 'Shimla', 'Gwalior', 'Rajkot', 'Sonipat',
       'Allahabad', 'Berhampur', 'Roorkee', 'Dharuhera', 'Latur',
       'Durgapur', 'Panchkula', 'Solapur', 'Durg', 'Goa', 'Jamshedpur',
       'Hazaribagh', 'Jabalpur', 'Hosur', 'Morbi', 'Hubli', 'Karnal',
       'Patna', 'Bilaspur', '

In [9]:
train_data["ADDRESS"][train_data['City'] == 'Maharashtra']

34            Mulund (West),Maharashtra
54            Kalyan (West),Maharashtra
80                 DN Nagar,Maharashtra
93                   Deonar,Maharashtra
101          Bhandup (West),Maharashtra
                      ...              
29330                 Powai,Maharashtra
29356                Balkum,Maharashtra
29372    Kanjur Marg (East),Maharashtra
29400             Anjurdive,Maharashtra
29443                 Marol,Maharashtra
Name: ADDRESS, Length: 1579, dtype: object

In [10]:
train_data.replace(to_replace ="Maharashtra", value ="Mumbai")

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),City
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0,Bangalore
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0,Mysore
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0,Bangalore
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5,Ghaziabad
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5,Kolkata
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0,Agra
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0,Vapi
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1,Jaipur
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0,Chennai


In [11]:
df2 = test_data["ADDRESS"].apply(lambda x: x.rsplit(",",1))
test_data["City"] = df2.apply(lambda x: x[1])
test_data.replace(to_replace ="Maharashtra", value ="Mumbai")

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,City
0,Owner,0,0,1,BHK,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700,Surat
1,Dealer,1,1,2,BHK,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278,Lalitpur
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911,Kolkata
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600,Jaipur
4,Owner,0,0,1,BHK,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000,Nadiad
...,...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,BHK,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330,Mumbai
68716,Dealer,0,1,3,BHK,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082,Mohali
68717,Dealer,1,1,1,BHK,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231,Mumbai
68718,Dealer,0,0,2,BHK,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670,Pune


In [12]:
train_data.drop(['ADDRESS'], axis = 1, inplace = True) 




### Convert the categorical columns by Encoding.




In [13]:
def replace_posted_by(my_string):
    if my_string == 'Owner':
        return 1
    else:
        return 0

In [14]:
def replace_BHK_OR_RK(my_string):
    if my_string == 'BHK':
        return 0
    else:
        return 1

In [15]:
train_data['POSTED_BY'] = train_data.POSTED_BY.apply(replace_posted_by)
train_data['BHK_OR_RK'] = train_data.BHK_OR_RK.apply(replace_BHK_OR_RK)

In [16]:
city_encoder = LabelEncoder()
city_encoder.fit(train_data['City'].astype('str').append(test_data['City'].astype('str')))
train_data['City'] = city_encoder.transform(train_data['City'])

In [17]:
train_data.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),City
0,1,0,0,2,0,1300.236407,1,1,12.96991,77.59796,55.0,24
1,0,0,0,2,0,1275.0,1,1,12.274538,76.644605,51.0,196
2,1,0,0,2,0,933.159722,1,1,12.778033,77.632191,43.0,24
3,1,0,1,2,0,929.921143,1,1,28.6423,77.3445,62.5,97
4,0,1,0,2,0,999.009247,0,1,22.5922,88.484911,60.5,166


In [18]:
test_data.drop(['ADDRESS'], axis = 1, inplace = True) 
test_data['POSTED_BY'] = test_data.POSTED_BY.apply(replace_posted_by)
test_data['BHK_OR_RK'] = test_data.BHK_OR_RK.apply(replace_BHK_OR_RK)
test_data.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,City
0,1,0,0,1,0,545.17134,1,1,21.262,73.0477,Surat
1,0,1,1,2,0,800.0,0,0,18.966114,73.148278,Lalitpur
2,0,0,0,2,0,1257.096513,1,1,22.5922,88.484911,Kolkata
3,0,0,0,3,0,1400.329489,1,1,26.9883,75.5846,Jaipur
4,1,0,0,1,0,430.47783,1,1,22.7,72.87,Nadiad


In [19]:
test_data['City'] = city_encoder.transform(test_data['City'])
test_data.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,City
0,1,0,0,1,0,545.17134,1,1,21.262,73.0477,283
1,0,1,1,2,0,800.0,0,0,18.966114,73.148278,174
2,0,0,0,2,0,1257.096513,1,1,22.5922,88.484911,166
3,0,0,0,3,0,1400.329489,1,1,26.9883,75.5846,126
4,1,0,0,1,0,430.47783,1,1,22.7,72.87,197


In [20]:
train_data.rename(columns = {'TARGET(PRICE_IN_LACS)':'TARGET'}, inplace = True) 
train_data.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,TARGET,City
0,1,0,0,2,0,1300.236407,1,1,12.96991,77.59796,55.0,24
1,0,0,0,2,0,1275.0,1,1,12.274538,76.644605,51.0,196
2,1,0,0,2,0,933.159722,1,1,12.778033,77.632191,43.0,24
3,1,0,1,2,0,929.921143,1,1,28.6423,77.3445,62.5,97
4,0,1,0,2,0,999.009247,0,1,22.5922,88.484911,60.5,166


### Preprocessed train data
- train data
- X_train
- y_train

In [21]:
X_train = train_data.drop(['TARGET'], axis = 1) 
y_train = train_data[['TARGET']]

In [22]:
y_train

Unnamed: 0,TARGET
0,55.0
1,51.0
2,43.0
3,62.5
4,60.5
...,...
29446,45.0
29447,16.0
29448,27.1
29449,67.0


In [23]:
X_train

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,City
0,1,0,0,2,0,1300.236407,1,1,12.969910,77.597960,24
1,0,0,0,2,0,1275.000000,1,1,12.274538,76.644605,196
2,1,0,0,2,0,933.159722,1,1,12.778033,77.632191,24
3,1,0,1,2,0,929.921143,1,1,28.642300,77.344500,97
4,0,1,0,2,0,999.009247,0,1,22.592200,88.484911,166
...,...,...,...,...,...,...,...,...,...,...,...
29446,1,0,0,3,0,2500.000000,1,1,27.140626,78.043277,1
29447,1,0,0,2,0,769.230769,1,1,39.945409,-86.150721,301
29448,0,0,0,2,0,1022.641509,1,1,26.928785,75.828002,126
29449,1,0,0,2,0,927.079009,1,1,12.900150,80.227910,60


In [24]:
X_test = test_data
X_test.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,City
0,1,0,0,1,0,545.17134,1,1,21.262,73.0477,283
1,0,1,1,2,0,800.0,0,0,18.966114,73.148278,174
2,0,0,0,2,0,1257.096513,1,1,22.5922,88.484911,166
3,0,0,0,3,0,1400.329489,1,1,26.9883,75.5846,126
4,1,0,0,1,0,430.47783,1,1,22.7,72.87,197


## Modelling

In [25]:
# rfRegressor = RandomForestRegressor()
# model = rfRegressor.fit(X_train, y_train)

In [26]:
# lreg = LinearRegression(normalize=True)
# model = lreg.fit(X_train, y_train)

In [27]:
# dtReg = DecisionTreeRegressor()
# model = dtReg.fit(X_train, y_train)

In [28]:
# from catboost import CatBoostRegressor
# model=CatBoostRegressor(iterations=10000, depth=5, learning_rate=0.03, loss_function='RMSE')
# model.fit(X_train, y_train)

- n_estimators: 500, max_depth: 6 - Score: 86189
- n_estimators: 1000, max_depth: 8 - Score: 86694247
- n_estimators: 1300, max_depth: 8 - Score: 86775
- Highest - 867978
- 86309

In [29]:
# xgb = xgboost.XGBRegressor(n_estimators=2000, learning_rate=0.1, gamma=0, subsample=0.60,
#                           colsample_bytree=1, max_depth=8)
# model = xgb.fit(X_train, y_train)

In [30]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {'min_child_weight':[4,5,6], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4], 'n_estimators':[1000, 1500, 2000], 
          'learning_rate':[0.01, 0.05, 0.1]}

xgb = xgboost.XGBRegressor(nthread=-1) 

grid = GridSearchCV(xgb, params)

In [31]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [20, 100],
    'colsample_bytree': [0.2, 1],
    'max_depth': [2, 10, None],
    'reg_alpha': [0, 1],
    'reg_lambda': [1, 2],
    'subsample': [0.5, 0.9],
    'learning_rate':[0.1, 0.9],
    'gamma':[0,1,10,100],
    'min_child_weight':[0,1,10,100],
    'sampling_method': ['uniform', 'gradient_based']
}

xgb = xgboost.XGBRegressor(nthread=-1) 

grid = GridSearchCV(xgb, param_grid = param_grid, cv = 3, verbose = True, n_jobs = -1)


In [32]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 6144 candidates, totalling 18432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.4s


KeyboardInterrupt: 

In [None]:
grid.best_estimator_

In [None]:
y_pred = grid.best_estimator_.predict(X_test)

In [None]:
y_pred

In [None]:
out_df = pd.DataFrame({'TARGET(PRICE_IN_LACS)': y_pred})

In [None]:
out_df.to_csv('xgb.csv', index=False)

In [None]:
from sklearn import metrics
import numpy as np
from sklearn.metrics import mean_squared_log_error
# We have created a function to print accuracy metrics which can be used
# to get accuracy metrics of all models in upcoming steps
def print_accuracy_report(y_test, y_pred,X_test):
 print('R Squared(Accuracy)', metrics.r2_score(y_test, y_pred))   
 print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
 print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
 print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
 print('Root Mean Squared Log Error',np.sqrt(mean_squared_log_error( y_test, y_pred )))

In [None]:
import pickle
pkl_filename = "xgboost_grid_search.pkl"  
with open(pkl_filename, 'wb') as file:
    pickle.dump(grid, file)