In [46]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle
from xgboost import XGBClassifier

In [47]:
housing = pd.DataFrame(pd.read_csv('Housing.csv'))
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [48]:
Q1 = housing.price.quantile(0.25)
Q3 = housing.price.quantile(0.75)
IQR = Q3-Q1
housing = housing[(housing.price >= Q1 - 1.5*IQR) & (housing.price <= Q3 + 1.5*IQR)]

In [49]:
Q1 = housing.area.quantile(0.25)
Q3 = housing.area.quantile(0.75)
IQR = Q3-Q1
housing = housing[(housing.area >= Q1 - 1.5*IQR) & (housing.area <= Q3 + 1.5*IQR)]

In [50]:
header_list = ['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']
def binary_map(x):
    return x.map({'yes':1 ,'no':0})
housing[header_list] = housing[header_list].apply(binary_map)

In [51]:
status = pd.get_dummies(housing['furnishingstatus'],dtype = int)
status = pd.get_dummies(housing['furnishingstatus'],drop_first = True,dtype = int)
housing = pd.concat([housing,status] , axis = 1)
housing.drop(['furnishingstatus'],inplace = True,axis = 1)

In [52]:
from sklearn.model_selection import train_test_split

np.random.seed(0)
df_train, df_test = train_test_split(housing, train_size = 0.7 , test_size = 0.3 , random_state = 100)

In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_var = ['price','area','bedrooms','bathrooms','stories','parking']
df_train[num_var] = scaler.fit_transform(df_train[num_var])
df_train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
148,0.52381,0.526907,0.4,0.0,0.666667,1,0,0,0,0,0.0,1,1,0
236,0.390476,0.114134,0.2,0.0,0.333333,1,1,1,0,0,0.0,1,1,0
356,0.275238,0.072738,0.8,0.5,0.0,0,0,1,0,1,0.333333,0,0,0
425,0.219048,0.15139,0.2,0.0,0.0,1,0,1,0,0,0.666667,0,0,0
516,0.095238,0.157895,0.2,0.0,0.0,0,1,0,0,0,0.333333,0,0,1


In [54]:
Y_train = df_train.pop('price')
X_train = df_train

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train,Y_train)

rfe = RFE(estimator = lm , n_features_to_select = 6 )
rfe = rfe.fit(X_train,Y_train)

col = X_train.columns[rfe.support_]
X_train.columns[rfe.support_]

X_train_rfe = X_train[col]

import statsmodels.api as sm

X_train_rfe = sm.add_constant(X_train_rfe)

lm = sm.OLS(Y_train,X_train_rfe).fit()



In [55]:
pickle.dump(lm,open('lm.pkl','wb'))

In [56]:
price_pred = lm.predict(X_train_rfe)

# print(price_pred)

# Model Evaluation

num_vars = ['price','area','stories', 'bathrooms', 'airconditioning', 'prefarea','parking']
df_test[num_vars] = scaler.fit_transform(df_test[num_vars])

y_test = df_test.pop('price')
x_test = df_test

print

x_test = sm.add_constant(x_test)

x_test_rfe = x_test[X_train_rfe.columns]

fin_pred = lm.predict(x_test_rfe)
# print(fin_pred)
# print(housing['price'].head())
final_value = pd.DataFrame(fin_pred.values, columns=['predicted_price'], index=x_test_rfe.index)

#Final Prediction done , still needs a few tweaks to make it ready for user to see
#Need to convert scaled values to real values to show prediction
#print(scaler.inverse_transform(final_value))
pickled_model = pickle.load(open('lm.pkl','rb'))
pred = pickled_model.predict(x_test_rfe)
print(pred)

244    0.432293
287    0.425883
166    0.634367
176    0.553494
282    0.290346
         ...   
314    0.303544
36     0.717456
298    0.346144
435    0.207101
421    0.236043
Length: 156, dtype: float64
