# Delhi House Price Prediction: Model Training

In [24]:
# importing necessary modules
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from sklearn.ensemble import(
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the raw listings; the path is resolved relative to the working directory.
raw_data = pd.read_csv('Delhi_housing_data.csv')

In [7]:
# Remove every column that is not used as a model input in one pass.
# `raw_data` itself is left untouched, so this cell can be re-run safely.
data = raw_data.drop(
    columns=[
        'Address',
        'Status',
        'Landmarks',
        'desc',
        'Price_sqft',
        'Unnamed: 0',
    ]
)
data.head()

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,neworold,parking,Furnished_status,Lift,type_of_building
0,5600000.0,1350.0,28.60885,77.46056,3.0,3.0,,New Property,,,2.0,Flat
1,8800000.0,1490.0,28.374236,76.952416,3.0,3.0,,New Property,,Semi-Furnished,2.0,Flat
2,16500000.0,2385.0,28.645769,77.38511,4.0,5.0,,New Property,1.0,Unfurnished,,Flat
3,3810000.0,1050.0,28.566914,77.436434,2.0,2.0,3.0,New Property,1.0,Unfurnished,2.0,Flat
4,6200000.0,1350.0,28.520732,77.356491,2.0,2.0,3.0,Resale,1.0,,3.0,Flat


In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)
train_data.shape

(6190, 12)

In [9]:
# Feature groups; each group is routed through its own preprocessing pipeline.
numerical_columns = [
    'area',
    'longitude',
    'latitude',
    'Bedrooms',
    'Bathrooms',
    'Balcony',
    'parking',
    'Lift',
]
categorical_columns = ['neworold', 'Furnished_status', 'type_of_building']

In [15]:
# Numeric features: missing values are replaced with 0, then each column is
# standardised.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: missing values take the most frequent category, the
# result is one-hot encoded (categories unseen during fit are ignored rather
# than raising), and the encoded columns are scaled without centring.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
        ('scaler', StandardScaler(with_mean=False)),
    ]
)

In [16]:
# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to the categorical columns within a single transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

In [17]:
# Separate the predictors from the target column ('price') for both splits.
target_feature_train_data = train_data['price']
input_feature_train_data = train_data.drop(columns='price')

target_feature_test_data = test_data['price']
input_feature_test_data = test_data.drop(columns='price')


In [19]:
# Fit the preprocessing on the training split only and reuse the fitted
# transformer for the test split.
input_train_feature_arr = preprocessor.fit_transform(input_feature_train_data)
input_test_feature_arr = preprocessor.transform(input_feature_test_data)

# Append the target as the last column of each transformed feature array.
train_arr = np.c_[
    input_train_feature_arr,
    np.array(target_feature_train_data),
]
test_arr = np.c_[
    input_test_feature_arr,
    np.array(target_feature_test_data),
]

In [21]:
def evaluate_models(X_train, y_train,X_test,y_test,models,param):
    """Tune, fit and score each candidate regressor.

    For every entry in ``models`` a 3-fold grid search over the matching grid
    in ``param`` selects the best hyper-parameters. Those parameters are then
    set on the caller's estimator object, which is fitted on the full training
    data and scored on the test data.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    models : dict mapping a model name to an estimator instance. The
        estimators are modified in place (parameters set, then fitted), so
        the caller can use them directly afterwards.
    param : dict mapping a model name to its parameter grid. A model with no
        entry is evaluated with its current settings.

    Returns
    -------
    dict mapping each model name to its R^2 score on the test data.
    """
    report = {}

    for name, model in models.items():
        # A missing grid means "no tuning" rather than an error.
        grid = param.get(name, {})

        # refit=False: the search only needs to pick the best parameters. The
        # winning configuration is fitted exactly once, below, on the caller's
        # own estimator instead of once inside the search and once again here.
        gs = GridSearchCV(model, grid, cv=3, refit=False)
        gs.fit(X_train, y_train)

        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)

        report[name] = r2_score(y_test, model.predict(X_test))

    return report

In [25]:
# The target was appended as the last column of each array: every other
# column is a feature, the final one is the price.
x_train = train_arr[:, :-1]
y_train = train_arr[:, -1]
x_test = test_arr[:, :-1]
y_test = test_arr[:, -1]
            
# Seed shared by every stochastic estimator so the model comparison gives the
# same ranking on each Restart & Run All.
MODEL_SEED = 42

# Candidate regressors, keyed by display name. LinearRegression is
# deterministic and takes no seed.
models = {
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(random_state=MODEL_SEED),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=MODEL_SEED),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=MODEL_SEED),
}

# Hyper-parameter grids searched for each model; keys must match `models`.
# An empty grid means the model is evaluated with its default settings.
params = {
    "Decision Tree": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    },
    "Random Forest": {
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "Gradient Boosting": {
        'learning_rate': [.1, .01, .05, .001],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "Linear Regression": {},
    "XGBRegressor": {
        'learning_rate': [.1, .01, .05, .001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "CatBoosting Regressor": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100],
    },
    "AdaBoost Regressor": {
        'learning_rate': [.1, .01, 0.5, .001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
}
# Tune, fit and score every candidate; the report maps model name -> test R^2.
model_report: dict = evaluate_models(x_train, y_train, x_test, y_test, models, params)

# NOTE(review): the winner is chosen by its score on the test split, so the
# reported score is optimistic as an estimate of performance on unseen data.
# Selecting on a validation split or the cross-validation score avoids this.

# Take the top scorer directly; ties resolve to the first model in the report.
best_model_name = max(model_report, key=model_report.get)
best_model_score = model_report[best_model_name]

# evaluate_models fits the estimators stored in `models` in place, so this is
# the tuned and fitted winner.
best_model = models[best_model_name]

print("This is the best model: ")
print(best_model_name)

This is the best model: 
Random Forest


In [26]:
# Test-set R^2 of the selected model, as recorded in the evaluation report.
print(best_model_score)

0.9365657721370535


In [27]:
# Sanity check: score the selected, already-fitted model on the test split
# again; the value should match best_model_score.
predicted = best_model.predict(x_test)
r2_square = r2_score(y_test, predicted)
print(r2_square)

0.9365657721370535
