In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot as plt

In [2]:
# Load the housing training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train/Housing.csv")
# Preview the first rows.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
# Count missing values per column.
df.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [9]:
# Distinct values of three of the yes/no columns, returned as a tuple of arrays.
tuple(df[name].unique() for name in ("mainroad", "guestroom", "prefarea"))

(array(['yes', 'no'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['yes', 'no'], dtype=object))

In [11]:
# Encode the six yes/no columns as booleans (True where the value is 'yes').
# Columns that are already bool/numeric are skipped so that re-running this
# cell after the conversion does not compare non-strings with 'yes' and
# silently overwrite every flag with False.
for flag in ['mainroad', 'guestroom', 'basement',
             'hotwaterheating', 'airconditioning', 'prefarea']:
    if not pd.api.types.is_numeric_dtype(df[flag]):
        df[flag] = df[flag] == 'yes'

In [13]:
# Cast the six yes/no flag columns to int; every other column keeps its dtype.
flag_columns = ["mainroad", "guestroom", "basement",
                "hotwaterheating", "airconditioning", "prefarea"]
df = df.astype(dict.fromkeys(flag_columns, int))

In [15]:
# List the distinct furnishing categories.
df['furnishingstatus'].unique()

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

In [16]:
# One 0/1 indicator column per furnishing category, named after the category.
# NOTE(review): all three categories get a column, so the indicators are
# redundant with each other (they sum to 1 per row) — confirm this is intended
# for the regression fitted later.
for status in ('furnished', 'semi-furnished', 'unfurnished'):
    df[status] = (df['furnishingstatus'] == status).astype(int)

In [19]:
# Remove the original text column now that it has been expanded into indicators.
# Rebinding (instead of inplace=True) with errors="ignore" keeps the cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["furnishingstatus"], errors="ignore")

In [21]:
# Scale every column (the price target included) by its own maximum.
# A column whose maximum is 0 is left untouched: dividing it would produce
# 0/0 = NaN, which the regression fit cannot handle.
# NOTE(review): the maxima are not stored anywhere, so new inputs cannot be
# scaled the same way and predictions cannot be converted back to prices.
for col in list(df.columns):
    col_max = df[col].max()
    if col_max != 0:
        df[col] = df[col] / col_max

In [22]:
# Preview the frame after scaling.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnished,semi-furnished,unfurnished
0,1.0,0.458025,0.666667,0.5,0.75,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,1.0,0.0,0.0
1,0.921053,0.553086,0.666667,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,0.921053,0.614815,0.5,0.5,0.5,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,0.0,1.0,0.0
3,0.918421,0.462963,0.666667,0.5,0.5,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
4,0.857895,0.458025,0.666667,0.25,0.5,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,1.0,0.0,0.0


In [27]:
# Split into features and target by column name rather than by position, so the
# split stays correct if the column order of the CSV or the preprocessing changes.
X = df.loc[:, df.columns != "price"]
y = df["price"]

In [28]:
# Inspect the feature matrix.
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnished,semi-furnished,unfurnished
0,0.458025,0.666667,0.50,0.75,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,1.0,0.0,0.0
1,0.553086,0.666667,1.00,1.00,1.0,0.0,0.0,0.0,1.0,1.000000,0.0,1.0,0.0,0.0
2,0.614815,0.500000,0.50,0.50,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,0.0,1.0,0.0
3,0.462963,0.666667,0.50,0.50,1.0,0.0,1.0,0.0,1.0,1.000000,1.0,1.0,0.0,0.0
4,0.458025,0.666667,0.25,0.50,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,0.185185,0.333333,0.25,0.25,1.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,0.0,1.0
541,0.148148,0.500000,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.0,0.0
542,0.223457,0.333333,0.25,0.25,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0
543,0.179630,0.500000,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0


In [29]:
# Inspect the target: price after division by its maximum.
y

0      1.000000
1      0.921053
2      0.921053
3      0.918421
4      0.857895
         ...   
540    0.136842
541    0.132868
542    0.131579
543    0.131579
544    0.131579
Name: price, Length: 545, dtype: float64

In [30]:
from sklearn.linear_model import LinearRegression

In [31]:
# Fit a linear regression on the full data set (no rows are held out).
reg = LinearRegression()
reg = reg.fit(X, y)  # fit() returns the fitted estimator itself

In [32]:
# Fitted coefficients, one per column of X in column order.
reg.coef_

array([ 0.29737279,  0.05178386,  0.29704304,  0.13559338,  0.03167463,
        0.02259593,  0.02632383,  0.06431933,  0.06503446,  0.06250536,
        0.04898826,  0.01146815,  0.00798359, -0.01945173])

In [34]:
import pickle

In [35]:
# Save the fitted model. The target directory is created first so the cell
# also works on a fresh checkout where ../model does not exist yet.
model_path = Path('../model/model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(reg, f)

In [36]:
# Reload the pickled model from disk into clf2.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../model/model.pkl', 'rb') as f:
    clf2 = pickle.load(f)

In [37]:
# Predict with the reloaded model on the same rows it was fitted on;
# outputs are in the scaled price units (price divided by its maximum).
clf2.predict(X)

array([0.61152671, 0.79406222, 0.57342765, 0.62625582, 0.50329911,
       0.63366792, 0.73749462, 0.63507565, 0.56476968, 0.57698025,
       0.62716047, 0.62155494, 0.53733867, 0.45596812, 0.46578388,
       0.38611815, 0.55974092, 0.60763101, 0.49084051, 0.5263217 ,
       0.41096818, 0.49924657, 0.4514995 , 0.49559496, 0.54534123,
       0.61039454, 0.61449337, 0.34920471, 0.54168844, 0.54438834,
       0.55789588, 0.50235118, 0.51016813, 0.49960026, 0.48044795,
       0.5800965 , 0.58248392, 0.62543983, 0.47915544, 0.54605112,
       0.46342053, 0.57566645, 0.53963132, 0.50185178, 0.54953568,
       0.48268701, 0.54090503, 0.56831657, 0.37227204, 0.53923146,
       0.52037209, 0.44169021, 0.5721316 , 0.5426527 , 0.49221971,
       0.3679277 , 0.5141298 , 0.67618889, 0.59948571, 0.56350096,
       0.42718525, 0.4057003 , 0.48614451, 0.59206899, 0.5352554 ,
       0.55229844, 0.51409297, 0.39529656, 0.36494923, 0.59831804,
       0.48738805, 0.49778068, 0.44588831, 0.5327307 , 0.38847