In [1]:
# Conventionally people rename the pandas import to pd for brevity
import pandas as pd

In [2]:
# Load in the data and preview it
sales = pd.read_csv('home_data.csv') 
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
[sales['price']]

[0        221900
 1        538000
 2        180000
 3        604000
 4        510000
           ...  
 21608    360000
 21609    400000
 21610    402101
 21611    400000
 21612    325000
 Name: price, Length: 21613, dtype: int64]

In [4]:
# Order by zipcode
sales = sales.sort_values('zipcode')

# Create a list of the unique zipcodes (numpy.ndarray)
unique_zipcodes = sales['zipcode'].unique()

# Create list of dataframes by zipcode
list_of_df = []

for zipcode in unique_zipcodes:
   # Create df for each zipcode
   df = sales[sales['zipcode'] == zipcode]
   # Append to the list_of_df
   list_of_df.append(df)

In [5]:
from sklearn.model_selection import train_test_split

list_of_df_train = []
list_of_df_test = []

# Split each dataframe into train (80%) and test data (20%) 
for df in list_of_df:
   train_data, test_data = train_test_split(df, test_size=0.2)
   list_of_df_train.append(train_data)
   list_of_df_test.append(test_data)

print(f"# of zipcodes:", len(unique_zipcodes))
print(f"# of (train) dataframes:", len(list_of_df_train))
print(f"# of (test) dataframes:", len(list_of_df_test))

# Use a loop to assign labels
list_of_df_train1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_train):
    list_of_df_train1[zipcode] = df

# Use a loop to assign labels
list_of_df_test1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_df_test1[zipcode] = df

# Now you have labeled DataFrames in a dictionary
print(list_of_df_train1)
print(list_of_df_test1)


# of zipcodes: 70
# of (train) dataframes: 70
# of (test) dataframes: 70
{98001:                id             date   price  bedrooms  bathrooms  sqft_living  \
966    5462100240  20140625T000000  196500         3       1.00         1320   
21170  6601200020  20150127T000000  235245         4       2.50         1954   
10006  3353402400  20150326T000000  124500         2       1.00          840   
3186   3521049048  20140811T000000  515000         3       2.50         3430   
5045   5647900670  20140620T000000  340000         3       1.75         1880   
...           ...              ...     ...       ...        ...          ...   
1388   1687000270  20140528T000000  267000         3       2.50         2495   
13819  5515600087  20141209T000000  215000         3       1.50         1100   
5126   5412101150  20150203T000000  299000         4       2.50         2400   
16699  6306100080  20140909T000000  234950         3       2.00         1430   
6628   3322049005  20140930T000000  850

In [6]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 

# List features to use for model to predict 
basic_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors']

list_of_models = []
list_of_rmse_basic_train = []
# Basic Model
for df in list_of_df_train:
   y = df.price # actual price column of training set
   X = df[basic_features] # rest of dataframe data
   # Create and train the model
   basic_model = linear_model.LinearRegression().fit(X, y)
   # Store the model into list of models
   list_of_models.append(basic_model)
   # Predict prices using the model
   y = df.price # actual price column of training set
   X = df[basic_features]
   y_pred = basic_model.predict(X)
   train_rmse_basic = mean_squared_error(y, y_pred, squared=False) #False = rmse
   list_of_rmse_basic_train.append(train_rmse_basic)

# Use a loop to assign labels
list_of_models1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_models1[zipcode] = df

# Use a loop to assign labels
list_of_rmse_basic_train1 = {}
for zipcode, df in zip(unique_zipcodes, list_of_df_test):
    list_of_rmse_basic_train1[zipcode] = df

print(f"# of ML models: ", len(list_of_models1))
print(f"# of Root Mean Squered Error: ", len(list_of_rmse_basic_train1))

list_of_models1


# of ML models:  70
# of Root Mean Squered Error:  70


{98001:                id             date   price  bedrooms  bathrooms  sqft_living  \
 3111   3751600176  20150306T000000  196000         3       1.50         1000   
 1033   3353400860  20140717T000000  249900         3       1.75         2080   
 12830  3751600146  20141023T000000  166000         1       1.00         1120   
 2650   1687000200  20150410T000000  259000         3       2.50         2153   
 5672   3914000095  20140718T000000  430000         5       2.50         3860   
 ...           ...              ...     ...       ...        ...          ...   
 20162  7967000130  20150401T000000  370228         4       3.00         2050   
 20962  1278000210  20150311T000000  110000         2       1.00          828   
 16661  9543000896  20140825T000000  237000         3       1.50         1800   
 9687   8159620260  20140711T000000  303000         4       2.25         2560   
 9312   4045700115  20141028T000000  370000         3       1.75         1620   
 
        sqft_lot  f

In [7]:
# Comparing with Test Data
list_of_rmse_basic_test = []

i = 0
while i < len(list_of_models):
   df = list_of_df_test[i]
   y = df.price
   X = df[basic_features]
   y_pred = list_of_models[i].predict(X)
   test_rmse_basic = mean_squared_error(y, y_pred, squared=False) #False = rmse
   list_of_rmse_basic_test.append(test_rmse_basic)
   i += 1

df = pd.DataFrame({
   'unique_zipcodes': unique_zipcodes,
   'list_of_rmse_basic_train': list_of_rmse_basic_train,
   'list_of_rmse_basic_test': list_of_rmse_basic_test
   })

df

Unnamed: 0,unique_zipcodes,list_of_rmse_basic_train,list_of_rmse_basic_test
0,98001,50864.153555,60359.081639
1,98002,28732.168135,36019.848457
2,98003,56430.187010,57123.300890
3,98004,391832.676634,269909.869790
4,98005,165315.777725,160888.566819
...,...,...,...
65,98177,248077.983989,223919.385683
66,98178,120108.330856,161813.826210
67,98188,52518.123941,62100.266560
68,98198,101273.757838,102369.408884


In [8]:
from joblib import dump, load

path = "../ml_models/"

# Create and dump models into designated folder
i = 0
while (i < len(list_of_models)):
   model = list_of_models[i]
   zipcode = unique_zipcodes[i]
   dump(model, path + str(zipcode))
   i += 1
