In [1]:
# Conventionally people rename the pandas import to pd for brevity
import pandas as pd

In [3]:
# Read the home sales CSV into a DataFrame and show its first five rows
DATA_FILE = 'home_data.csv'
sales = pd.read_csv(DATA_FILE)
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [None]:
# Display the price column, wrapped in a one-element list
[sales.loc[:, 'price']]

In [None]:
# Order by zipcode. mergesort is a stable sort, so re-running this cell on the
# already-sorted frame cannot reshuffle rows that share a zipcode.
sales = sales.sort_values('zipcode', kind='mergesort')

# Create a list of the unique zipcodes (numpy.ndarray); ascending because
# `sales` was just sorted on that column
unique_zipcodes = sales['zipcode'].unique()

# Create list of dataframes by zipcode.
# One groupby pass replaces filtering the whole of `sales` once per zipcode.
# groupby orders its keys ascending by default, which matches unique_zipcodes.
list_of_df = []

for zipcode, df in sales.groupby('zipcode'):
    # df holds every row of `sales` for this zipcode
    list_of_df.append(df)

# Later cells pair list_of_df[i] with unique_zipcodes[i]; fail loudly if the
# two ever disagree (e.g. missing zipcodes, which groupby drops by default).
assert len(list_of_df) == len(unique_zipcodes), \
    "list_of_df and unique_zipcodes are out of step"

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same split, and therefore
# the same models and RMSE values downstream. Change it to draw another split.
RANDOM_STATE = 42

list_of_df_train = []
list_of_df_test = []

# Split each dataframe into train (80%) and test data (20%)
for df in list_of_df:
    train_data, test_data = train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE
    )
    list_of_df_train.append(train_data)
    list_of_df_test.append(test_data)

# Sanity check: the three counts are expected to be equal
print("# of zipcodes:", len(unique_zipcodes))
print("# of (train) dataframes:", len(list_of_df_train))
print("# of (test) dataframes:", len(list_of_df_test))


In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# List features to use for model to predict
basic_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors']

list_of_models = []
list_of_rmse_basic_train = []

# Basic Model: fit one linear regression per training dataframe and record
# its RMSE on the same rows it was trained on
for df in list_of_df_train:
    y = df.price  # actual price column of training set
    X = df[basic_features]  # feature columns only
    # Create and train the model
    basic_model = linear_model.LinearRegression().fit(X, y)
    # Store the model into list of models (same order as list_of_df_train)
    list_of_models.append(basic_model)
    # Predict prices for the training rows using the model
    y_pred = basic_model.predict(X)
    # RMSE = sqrt(MSE). Taking the root here avoids the `squared=False`
    # keyword, which newer scikit-learn releases deprecate and then drop.
    train_rmse_basic = mean_squared_error(y, y_pred) ** 0.5
    list_of_rmse_basic_train.append(train_rmse_basic)

print("# of ML models: ", len(list_of_models))
print("# of Root Mean Squered Error: ", len(list_of_rmse_basic_train))


In [None]:
# Comparing with Test Data
list_of_rmse_basic_test = []

# Score each model on the held-out dataframe at the same list position
for model, test_df in zip(list_of_models, list_of_df_test):
    y = test_df.price
    X = test_df[basic_features]
    y_pred = model.predict(X)
    # RMSE = sqrt(MSE). Taking the root here avoids the `squared=False`
    # keyword, which newer scikit-learn releases deprecate and then drop.
    test_rmse_basic = mean_squared_error(y, y_pred) ** 0.5
    list_of_rmse_basic_test.append(test_rmse_basic)

# Train vs. test RMSE side by side, one row per zipcode
df = pd.DataFrame({
    'unique_zipcodes': unique_zipcodes,
    'list_of_rmse_basic_train': list_of_rmse_basic_train,
    'list_of_rmse_basic_test': list_of_rmse_basic_test
})

# Last expression of the cell, so the comparison table is actually displayed
df


In [None]:
import os

from joblib import dump, load

path = "../ml_models/"

# dump() writes to the given file name but does not create missing folders,
# so make sure the target folder exists first (no-op when it already does)
os.makedirs(path, exist_ok=True)

# Dump each model into the designated folder; the file name is the zipcode
# found at the same position in unique_zipcodes
for model, zipcode in zip(list_of_models, unique_zipcodes):
    dump(model, os.path.join(path, str(zipcode)))

In [None]:
# Use plotly to display awesome charts
import plotly.express as px

# One marker per row of `sales`: positioned by long/lat, coloured by zipcode,
# sized by price
scatter_options = dict(
    x="long",
    y="lat",
    color='zipcode',
    size='price',
    title='King County Housing Dataset 2014',
)
fig = px.scatter(sales, **scatter_options)
fig.show()


In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go

zipcode_list = sales['zipcode'].unique()

# Initialize the Dash app
app = dash.Dash(__name__)

# Define the app layout: a zipcode dropdown above the chart it controls
app.layout = html.Div([
    dcc.Dropdown(
        id='data-dropdown',
        # Plain str/int rather than numpy scalars, to keep the props JSON-friendly
        options=[
            {'label': str(zipcode), 'value': int(zipcode)} for zipcode in zipcode_list],
        # Initial dropdown value: the first option. Taken from zipcode_list
        # itself instead of a `zipcode` variable left over from an earlier
        # cell's loop, so this cell no longer depends on that hidden state.
        value=int(zipcode_list[0]) if len(zipcode_list) else None
    ),
    dcc.Graph(id='line-chart')
])

# Define a callback to update the line chart based on dropdown selection
@app.callback(
    Output('line-chart', 'figure'),
    [Input('data-dropdown', 'value')]
)
def update_line_chart(selected_data):
    """Build the sqft_living vs. price scatter for the selected zipcode.

    Args:
        selected_data: current value of the 'data-dropdown' component (a
            zipcode), or None when the dropdown has no selection.

    Returns:
        A figure dict with 'data' and 'layout' keys for the 'line-chart' graph.

    Raises:
        PreventUpdate: when nothing is selected, so the chart is left as it is
            instead of failing on int(None).
    """
    from dash.exceptions import PreventUpdate

    if selected_data is None:
        raise PreventUpdate

    xaxis = "sqft_living"
    yaxis = "price"

    # Rows of `sales` for the chosen zipcode only
    df = sales[sales['zipcode'] == int(selected_data)]

    trace = go.Scatter(
        x=df[xaxis],
        y=df[yaxis],
        mode='markers',
        name=str(selected_data))

    return {
        'data': [trace],
        'layout': go.Layout(
            title=f'{xaxis} vs {yaxis} <br> zipcode {selected_data}',
            xaxis={'title': xaxis},
            yaxis={'title': yaxis}
        )
    }

# Run the Dash app
if __name__ == '__main__':
    # Prefer app.run where the installed Dash provides it; fall back to the
    # older run_server spelling otherwise. NOTE(review): run_server is
    # reported as removed in newer Dash releases — confirm against the
    # installed version.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)

In [None]:
# Export Jupyter Notebook to .py
# TODO: e.g. run `jupyter nbconvert --to script <notebook>.ipynb` from a terminal