# Machine Learning Modelling

In [1]:
import numpy as np
import pandas as pd

In [12]:
# Load the cleaned listings dataset; the path is relative to the notebook's working directory.
# NOTE: Zipcode is parsed as an integer column, so leading zeros are not kept (e.g. 1002 for MA).
data = pd.read_csv('./data/Cleaned_data.csv')

In [14]:
# Preview the rows ordered by Zipcode. The sorted frame is only displayed, not assigned,
# so `data` keeps its original row order.
data.sort_values(by='Zipcode')

Unnamed: 0,State,City,Street,Zipcode,Bedroom,Bathroom,Area,PPSq,LotArea,MarketEstimate,RentEstimate,Latitude,Longitude,ListedPrice
9237,MA,Amherst,Sunderland Rd,1002,4.0,3.0,3120.0,181.089744,2.700000,526800.000000,3499.0,42.422096,-72.538150,565000.0
8926,MA,Belchertown,Gold St,1007,3.0,2.0,1296.0,270.061728,1.730000,355000.000000,2949.0,42.342020,-72.410630,350000.0
8804,MA,Belchertown,Mill Valley Rd,1007,3.0,2.0,1512.0,210.978836,1.760000,324600.000000,2274.0,42.263100,-72.374750,319000.0
8812,MA,Chester,Maple St,1011,5.0,2.0,2136.0,46.816479,0.450000,88900.000000,2800.0,42.278910,-72.979530,100000.0
8915,MA,Chicopee,Fairmont St,1013,2.0,1.0,1008.0,237.996032,0.511387,242000.000000,1800.0,42.185658,-72.598274,239900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,AK,Ketchikan,Schoenbar Rd,99901,4.0,3.0,2112.0,233.428030,0.340000,475400.000000,1949.0,55.347317,-131.637570,493000.0
516,AK,Coffman Cove,Coho Dr,99918,3.0,3.0,1850.0,539.459459,0.400000,942680.641922,2585.0,56.021515,-132.827740,998000.0
685,AK,Coffman Cove,NE Minke,99918,1.0,1.0,1200.0,290.833333,3.900000,346900.000000,1900.0,56.012170,-132.810640,349000.0
751,AK,Wrangell,.2 Mile Zimovia Hwy,99929,3.0,2.0,1800.0,206.666667,0.980000,370600.000000,2500.0,56.363064,-132.356580,372000.0


## *Feature Engineering*

In [15]:
# Count missing values per column before deriving new features.
data.isna().sum()

State             0
City              0
Street            0
Zipcode           0
Bedroom           0
Bathroom          0
Area              0
PPSq              0
LotArea           0
MarketEstimate    0
RentEstimate      0
Latitude          0
Longitude         0
ListedPrice       0
dtype: int64

In [16]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21745 entries, 0 to 21744
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   State           21745 non-null  object 
 1   City            21745 non-null  object 
 2   Street          21745 non-null  object 
 3   Zipcode         21745 non-null  int64  
 4   Bedroom         21745 non-null  float64
 5   Bathroom        21745 non-null  float64
 6   Area            21745 non-null  float64
 7   PPSq            21745 non-null  float64
 8   LotArea         21745 non-null  float64
 9   MarketEstimate  21745 non-null  float64
 10  RentEstimate    21745 non-null  float64
 11  Latitude        21745 non-null  float64
 12  Longitude       21745 non-null  float64
 13  ListedPrice     21745 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 2.3+ MB


In [17]:
def split_zipcode(zipcode):
    """Break a zipcode into its first five digits.

    The value is converted to text and left-padded with zeros to at least
    five characters, so 1002 is treated as "01002".

    Returns a list of five ints, leftmost digit first. Raises ValueError if
    any of the first five characters is not a digit.
    """
    padded = str(zipcode).zfill(5)
    return [int(digit) for digit in padded[:5]]

In [18]:
# Expand every zipcode into its five digits (one list of ints per row).
zipcode_components = data['Zipcode'].apply(split_zipcode)

# Turn the per-row digit lists into five named columns. Reusing data.index keeps
# each digit row attached to its source row: pd.concat(axis=1) aligns on index
# labels, so a default 0..n-1 index here would mis-pair rows (or add NaN-padded
# rows) whenever `data` is not on a fresh RangeIndex, e.g. after filtering.
zipcode_df = pd.DataFrame(
    zipcode_components.tolist(),
    columns=['NationalArea', 'CityPO1', 'CityPO2', 'AssociatePO1', 'AssociatePO2'],
    index=data.index,
)

# Column-wise join of the original features with the zipcode digit features.
df = pd.concat([data, zipcode_df], axis=1)

In [20]:
# Preview the enriched frame ordered by Zipcode to eyeball the new digit columns.
# The sorted frame is only displayed, not assigned, so `df` keeps its row order.
df.sort_values(by='Zipcode')

Unnamed: 0,State,City,Street,Zipcode,Bedroom,Bathroom,Area,PPSq,LotArea,MarketEstimate,RentEstimate,Latitude,Longitude,ListedPrice,NationalArea,CityPO1,CityPO2,AssociatePO1,AssociatePO2
9237,MA,Amherst,Sunderland Rd,1002,4.0,3.0,3120.0,181.089744,2.700000,526800.000000,3499.0,42.422096,-72.538150,565000.0,0,1,0,0,2
8926,MA,Belchertown,Gold St,1007,3.0,2.0,1296.0,270.061728,1.730000,355000.000000,2949.0,42.342020,-72.410630,350000.0,0,1,0,0,7
8804,MA,Belchertown,Mill Valley Rd,1007,3.0,2.0,1512.0,210.978836,1.760000,324600.000000,2274.0,42.263100,-72.374750,319000.0,0,1,0,0,7
8812,MA,Chester,Maple St,1011,5.0,2.0,2136.0,46.816479,0.450000,88900.000000,2800.0,42.278910,-72.979530,100000.0,0,1,0,1,1
8915,MA,Chicopee,Fairmont St,1013,2.0,1.0,1008.0,237.996032,0.511387,242000.000000,1800.0,42.185658,-72.598274,239900.0,0,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,AK,Ketchikan,Schoenbar Rd,99901,4.0,3.0,2112.0,233.428030,0.340000,475400.000000,1949.0,55.347317,-131.637570,493000.0,9,9,9,0,1
516,AK,Coffman Cove,Coho Dr,99918,3.0,3.0,1850.0,539.459459,0.400000,942680.641922,2585.0,56.021515,-132.827740,998000.0,9,9,9,1,8
685,AK,Coffman Cove,NE Minke,99918,1.0,1.0,1200.0,290.833333,3.900000,346900.000000,1900.0,56.012170,-132.810640,349000.0,9,9,9,1,8
751,AK,Wrangell,.2 Mile Zimovia Hwy,99929,3.0,2.0,1800.0,206.666667,0.980000,370600.000000,2500.0,56.363064,-132.356580,372000.0,9,9,9,2,9
