In [3]:
# Conventionally people rename the pandas import to pd for brevity
import pandas as pd

In [4]:
# Load in the data and preview it
sales = pd.read_csv('home_data.csv') 
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
[sales['price']]

[0        221900
 1        538000
 2        180000
 3        604000
 4        510000
           ...  
 21608    360000
 21609    400000
 21610    402101
 21611    400000
 21612    325000
 Name: price, Length: 21613, dtype: int64]

In [6]:
# Order by zipcode
sales = sales.sort_values('zipcode')

# Create a list of the unique zipcodes (numpy.ndarray)
unique_zipcodes = sales['zipcode'].unique()

# Create list of dataframes by zipcode
list_of_df = []

for zipcode in unique_zipcodes:
   # Create df for each zipcode
   df = sales[sales['zipcode'] == zipcode]
   # Append to the list_of_df
   list_of_df.append(df)

In [7]:
from sklearn.model_selection import train_test_split

list_of_df_train = []
list_of_df_test = []

# Split each dataframe into train (80%) and test data (20%) 
for df in list_of_df:
   train_data, test_data = train_test_split(df, test_size=0.2)
   list_of_df_train.append(train_data)
   list_of_df_test.append(test_data)

print(f"# of zipcodes:", len(unique_zipcodes))
print(f"# of (train) dataframes:", len(list_of_df_train))
print(f"# of (test) dataframes:", len(list_of_df_test))

# Use a loop to assign labels
list_of_df_train1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_train):
    list_of_df_train1[zipcode] = df

# Use a loop to assign labels
list_of_df_test1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_df_test1[zipcode] = df

# Now you have labeled DataFrames in a dictionary
print(list_of_df_train1)
print(list_of_df_test1)


# of zipcodes: 70
# of (train) dataframes: 70
# of (test) dataframes: 70
{98001:                id             date   price  bedrooms  bathrooms  sqft_living  \
6315   2214800630  20141105T000000  239950         3       2.25         1560   
17496  2817850290  20141201T000000  258000         3       2.00         1790   
19232  3751604653  20140826T000000  205000         3       1.00         1370   
16029  9264030470  20140611T000000  455000         4       2.50         3170   
1943   8159620160  20150424T000000  284200         3       2.50         1570   
...           ...              ...     ...       ...        ...          ...   
20886  8956200530  20140805T000000  457000         4       2.50         2820   
440    3353401710  20140923T000000  227950         3       1.50         1670   
7333   7967600285  20141211T000000  449888         3       2.25         2520   
13888  7967900150  20150430T000000  367950         4       2.50         3030   
2750   2408600160  20150228T000000  352

In [13]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 

# List features to use for model to predict 
basic_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors']

list_of_models = []
list_of_rmse_basic_train = []
# Basic Model
for df in list_of_df_train:
   y = df.price # actual price column of training set
   X = df[basic_features] # rest of dataframe data
   # Create and train the model
   basic_model = linear_model.LinearRegression().fit(X, y)
   # Store the model into list of models
   list_of_models.append(basic_model)
   # Predict prices using the model
   y = df.price # actual price column of training set
   X = df[basic_features]
   y_pred = basic_model.predict(X)
   train_rmse_basic = mean_squared_error(y, y_pred, squared=False) #False = rmse
   list_of_rmse_basic_train.append(train_rmse_basic)

# Use a loop to assign labels
list_of_models1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_models1[zipcode] = df

# Use a loop to assign labels
list_of_rmse_basic_train1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_rmse_basic_train1[zipcode] = df

print(f"# of ML models: ", len(list_of_models1))
print(f"# of Root Mean Squered Error: ", len(list_of_rmse_basic_train1))

list_of_models1


# of ML models:  70
# of Root Mean Squered Error:  70


{98001:                id             date   price  bedrooms  bathrooms  sqft_living  \
 8891   3750603940  20140925T000000  240000         4       1.75         1880   
 19695   629650370  20150123T000000  250000         3       2.50         1750   
 5075   5729000080  20141029T000000  465000         3       3.00         2290   
 912    3329530200  20140910T000000  205000         3       2.00         1410   
 19259  8856004415  20150325T000000  168000         3       1.00         1150   
 ...           ...              ...     ...       ...        ...          ...   
 19745  7967000160  20150316T000000  355000         4       2.75         2050   
 17133  6143000020  20141027T000000  175000         3       1.75         1910   
 5218   7696600020  20150128T000000  260000         4       1.50         1540   
 14027  3874010220  20140624T000000  289000         3       2.50         1970   
 15857  8961950050  20150320T000000  409000         4       2.75         3230   
 
        sqft_lot  f

In [15]:
# Comparing with Test Data
list_of_rmse_basic_test = []

i = 0
while i < len(list_of_models):
   df = list_of_df_test[i]
   y = df.price
   X = df[basic_features]
   y_pred = list_of_models[i].predict(X)
   test_rmse_basic = mean_squared_error(y, y_pred, squared=False) #False = rmse
   list_of_rmse_basic_test.append(test_rmse_basic)
   i += 1

df = pd.DataFrame({
   'unique_zipcodes': unique_zipcodes,
   'list_of_rmse_basic_train': list_of_rmse_basic_train,
   'list_of_rmse_basic_test': list_of_rmse_basic_test
   })

df

Unnamed: 0,unique_zipcodes,list_of_rmse_basic_train,list_of_rmse_basic_test
0,98001,52222.703506,54354.246906
1,98002,29295.343475,34254.626902
2,98003,52641.484185,71287.643762
3,98004,373226.879931,364404.623615
4,98005,159007.023338,184939.069456
...,...,...,...
65,98177,232782.942968,284383.006634
66,98178,137818.291732,66895.400350
67,98188,54784.864758,55991.883429
68,98198,84972.270861,153895.696263


In [10]:
from joblib import dump, load

path = "../ml_models/"

# Create and dump models into designated folder
i = 0
while (i < len(list_of_models)):
   model = list_of_models[i]
   zipcode = unique_zipcodes[i]
   dump(model, path + str(zipcode))
   i += 1
