imports

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

read in data from csv

In [37]:
data = pd.read_csv("kc_house_data.csv")
print(data.head())
print(data.info())

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0   

   yr_built  yr_renovated  zipcode      lat     lo

preprocessing

In [38]:
# date time conversion
data['date'] = pd.to_datetime(data['date'])
print(data.info())
print(data.head())
# split into days since the earliest on the dataset. 
# this choice allows me to more accurately represent the change when it trains. 
min_date = data['date'].min()
print(min_date)

days_since = []
for i in range(len(data)):
    purchase_date = data.at[i, 'date']
    delta = (purchase_date - min_date).days
    days_since.append(delta)

data['days_since_min_date'] = days_since

data.drop('date', axis = 1, inplace = True)

print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21613 non-null  int64         
 1   date           21613 non-null  datetime64[ns]
 2   price          21613 non-null  float64       
 3   bedrooms       21613 non-null  int64         
 4   bathrooms      21613 non-null  float64       
 5   sqft_living    21613 non-null  int64         
 6   sqft_lot       21613 non-null  int64         
 7   floors         21613 non-null  float64       
 8   waterfront     21613 non-null  int64         
 9   view           21613 non-null  int64         
 10  condition      21613 non-null  int64         
 11  grade          21613 non-null  int64         
 12  sqft_above     21613 non-null  int64         
 13  sqft_basement  21613 non-null  int64         
 14  yr_built       21613 non-null  int64         
 15  yr_renovated   2161

In [39]:
# don't consider things like condition and lat long, unimportant generally when we have zip code to do locations. 
# condition is subjective, hard to do prediction on more recent homes and test accuracy. 
# in place because im lazy
data.drop(columns = ['condition', 'lat','long', 'id'], inplace = True)

# drop anything with blanks
# in place again because laziness
data.dropna(inplace = True)

Begin training setup

In [40]:
x = data.drop(columns = ['price'])
y = data['price']

In [41]:
# split into training and testing sets. 
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)
print(x_train.head())
print(y_train.head())

       bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
6211          3       2.25         1700      3333     1.5           0     0   
21013         4       2.50         3420      6845     2.0           0     0   
20299         3       2.25         1250       811     3.0           0     0   
12732         3       1.00         1450      6000     1.0           0     0   
6675          5       2.50         2500     13034     1.0           0     0   

       grade  sqft_above  sqft_basement  yr_built  yr_renovated  zipcode  \
6211       7        1100            600      1924             0    98117   
21013      9        3420              0      2009             0    98052   
20299      8        1250              0      2014             0    98136   
12732      7        1450              0      1953             0    98118   
6675       7        1300           1200      1962             0    98092   

       sqft_living15  sqft_lot15  days_since_min_date  
6211        

In [42]:
# make training data
train_data = x_train.join(y_train)
test_data = x_train.join(y_train)
print(train_data.head())

       bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
6211          3       2.25         1700      3333     1.5           0     0   
21013         4       2.50         3420      6845     2.0           0     0   
20299         3       2.25         1250       811     3.0           0     0   
12732         3       1.00         1450      6000     1.0           0     0   
6675          5       2.50         2500     13034     1.0           0     0   

       grade  sqft_above  sqft_basement  yr_built  yr_renovated  zipcode  \
6211       7        1100            600      1924             0    98117   
21013      9        3420              0      2009             0    98052   
20299      8        1250              0      2014             0    98136   
12732      7        1450              0      1953             0    98118   
6675       7        1300           1200      1962             0    98092   

       sqft_living15  sqft_lot15  days_since_min_date     price  
62

make and fit random forest

In [43]:
forest = RandomForestRegressor()
forest.fit(x_train, y_train)

save forest

In [44]:
from joblib import dump
dump(forest, 'model.joblib')

['model.joblib']

evaluate forest

In [65]:
from joblib import load
model = load('model.joblib')
print(model.score(x_test, y_test))

0.7976407724655848


In [84]:
test_dict = [{
    'bedrooms' : 4,
    'bathrooms' :2.5,
    'sqft_living' : 2480,
    'sqft_lot' : 7537,
    'floors' : 2,
    'waterfront' : 0,
    'view' : 0,
    'grade' : 5,
    'sqft_above' : 2480,
    'sqft_basement' : 0,
    'yr_built' : 1999,
    'yr_renovated' : 0,
    'zipcode' : 98034,
    'sqft_living15' : 1700,
    'sqft_lot15' : 6753,
    'days_since_min_date' : 3500
}]
print(test_dict)

[{'bedrooms': 4, 'bathrooms': 2.5, 'sqft_living': 2480, 'sqft_lot': 7537, 'floors': 2, 'waterfront': 0, 'view': 0, 'grade': 5, 'sqft_above': 2480, 'sqft_basement': 0, 'yr_built': 1999, 'yr_renovated': 0, 'zipcode': 98034, 'sqft_living15': 1700, 'sqft_lot15': 6753, 'days_since_min_date': 3500}]


In [85]:
testing = pd.DataFrame(test_dict)
print(testing)

   bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
0         4        2.5         2480      7537       2           0     0   

   grade  sqft_above  sqft_basement  yr_built  yr_renovated  zipcode  \
0      5        2480              0      1999             0    98034   

   sqft_living15  sqft_lot15  days_since_min_date  
0           1700        6753                 3500  


In [86]:
model.predict(testing)

array([484190.5])