# 2nd Notebook for multiple linear regression models

In [20]:
# Standard Packages
import pandas as pd
import numpy as np

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt

# Scipy Stats
import scipy.stats as stats 

# Statsmodel Api
import statsmodels.api as sm
from statsmodels.formula.api import ols

# SKLearn Modules
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.feature_selection import f_regression

# Notebook Options
import warnings
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

pd.options.display.max_columns = None
pd.options.display.width = None

In [22]:
mlr_baseline_df = pd.read_csv('../data/baseline_500k_to_15mil.csv')
mlr_baseline_df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,condition,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,lat,long,renovated,zip,month,year
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,0,4,7,Gas,PUBLIC,1180,0,0,40,1969,47.461975,-122.190520,0,98055,5,2022
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,2,3,7,Oil,PUBLIC,1570,1570,0,240,1950,47.711525,-122.355910,0,98133,12,2021
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,2,3,9,Gas,PUBLIC,1090,1070,200,270,2010,47.566110,-122.290200,0,98118,12,2021
3,592500.0,2,2.0,1120,758,2.0,0,0,1,0,3,7,Electricity,PUBLIC,1120,550,550,30,2012,47.532470,-122.071880,0,98027,8,2021
4,625000.0,2,1.0,1190,5688,1.0,0,0,1,0,3,7,Electricity,PUBLIC,1190,0,300,0,1948,47.763470,-122.340155,0,98133,7,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26116,719000.0,3,2.5,1270,1141,2.0,0,0,0,0,3,8,Gas,PUBLIC,1050,420,200,60,2007,47.690440,-122.370620,0,98117,10,2021
26117,1555000.0,5,2.0,1910,4000,1.5,0,0,0,0,4,8,Oil,PUBLIC,1600,1130,0,210,1921,47.664740,-122.329400,0,98103,11,2021
26118,1313000.0,3,2.0,2020,5800,2.0,0,0,0,1,3,7,Gas,PUBLIC,2020,0,0,520,2011,47.565610,-122.388510,0,98116,6,2021
26119,800000.0,3,2.0,1620,3600,1.0,0,0,1,0,3,7,Gas,PUBLIC,940,920,240,110,1995,47.610395,-122.295850,0,98122,5,2022


In [23]:
school_scores_df = pd.read_csv('../data/school_scores_by_zip.csv')
school_scores_df

Unnamed: 0.1,Unnamed: 0,ZIPCODE,percent_met_standard,district
0,0,98034,0.720261,LAKE WASHINGTON
1,1,98074,0.720261,LAKE WASHINGTON
2,2,98034,0.720261,LAKE WASHINGTON
3,3,98053,0.720261,LAKE WASHINGTON
4,4,98074,0.720261,LAKE WASHINGTON
...,...,...,...,...
649,649,98070,0.589803,VASHON ISLAND
650,650,98070,0.589803,VASHON ISLAND
651,651,98070,0.589803,VASHON ISLAND
652,652,98070,0.589803,VASHON ISLAND


In [24]:
school_scores_df = school_scores_df[['ZIPCODE', 'percent_met_standard']]
school_scores_df

Unnamed: 0,ZIPCODE,percent_met_standard
0,98034,0.720261
1,98074,0.720261
2,98034,0.720261
3,98053,0.720261
4,98074,0.720261
...,...,...
649,98070,0.589803
650,98070,0.589803
651,98070,0.589803
652,98070,0.589803


In [25]:
school_scores_df['zip'] = school_scores_df['ZIPCODE']
school_scores_df = school_scores_df[['percent_met_standard','zip']]
school_scores_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_scores_df['zip'] = school_scores_df['ZIPCODE']


Unnamed: 0,percent_met_standard,zip
0,0.720261,98034
1,0.720261,98074
2,0.720261,98034
3,0.720261,98053
4,0.720261,98074
...,...,...
649,0.589803,98070
650,0.589803,98070
651,0.589803,98070
652,0.589803,98070


In [27]:
school_scores_df.drop_duplicates(subset='zip', inplace=True)
school_scores_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_scores_df.drop_duplicates(subset='zip', inplace=True)


Unnamed: 0,percent_met_standard,zip
0,0.720261,98034
1,0.720261,98074
3,0.720261,98053
7,0.720261,98033
14,0.720261,98052
...,...,...
626,0.633615,98051
629,0.690543,98045
635,0.690543,98024
648,0.589803,98070


In [28]:
merged_df = mlr_baseline_df.join(school_scores_df.set_index('zip'), on='zip', how='inner')



merged_df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,condition,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,lat,long,renovated,zip,month,year,percent_met_standard
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,0,4,7,Gas,PUBLIC,1180,0,0,40,1969,47.461975,-122.190520,0,98055,5,2022,0.368621
36,750000.0,3,2.0,1830,7969,1.0,0,0,0,0,3,7,Gas,PUBLIC,930,930,240,90,1950,47.466730,-122.214000,1,98055,3,2022,0.368621
97,728000.0,4,2.0,2170,7520,1.0,0,0,0,0,3,7,Gas,PUBLIC,1240,1240,490,60,1973,47.463930,-122.189740,0,98055,3,2022,0.368621
200,565000.0,4,2.0,1400,10364,1.5,0,0,0,0,4,6,Electricity,PUBLIC,1400,0,330,330,1971,47.448450,-122.212430,0,98055,3,2022,0.368621
271,645000.0,3,2.0,1520,8250,1.0,0,0,0,0,3,8,Gas,PUBLIC,1190,590,420,200,1981,47.460870,-122.188690,0,98055,12,2021,0.368621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4120,950000.0,3,2.0,2600,64626,1.5,0,0,0,3,3,8,Gas,PRIVATE,2600,0,0,360,2009,47.718260,-121.405660,0,98288,10,2021,0.476190
8053,619000.0,4,1.0,1350,21640,1.5,1,0,0,2,4,5,Electricity,PRIVATE,1350,0,0,280,1964,47.694940,-121.298765,0,98288,8,2021,0.476190
12868,560000.0,2,2.0,1170,63217,1.5,0,0,0,0,3,8,Gas,PRIVATE,1170,0,0,190,2002,47.717600,-121.404800,0,98288,9,2021,0.476190
15196,869300.0,3,2.5,3610,44686,2.0,0,0,1,0,5,7,Gas,PUBLIC,2310,1300,0,440,1923,47.708820,-121.354160,0,98288,11,2021,0.476190


In [34]:
mlr_baseline_df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,condition,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,lat,long,renovated,zip,month,year
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,0,4,7,Gas,PUBLIC,1180,0,0,40,1969,47.461975,-122.190520,0,98055,5,2022
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,2,3,7,Oil,PUBLIC,1570,1570,0,240,1950,47.711525,-122.355910,0,98133,12,2021
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,2,3,9,Gas,PUBLIC,1090,1070,200,270,2010,47.566110,-122.290200,0,98118,12,2021
3,592500.0,2,2.0,1120,758,2.0,0,0,1,0,3,7,Electricity,PUBLIC,1120,550,550,30,2012,47.532470,-122.071880,0,98027,8,2021
4,625000.0,2,1.0,1190,5688,1.0,0,0,1,0,3,7,Electricity,PUBLIC,1190,0,300,0,1948,47.763470,-122.340155,0,98133,7,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26116,719000.0,3,2.5,1270,1141,2.0,0,0,0,0,3,8,Gas,PUBLIC,1050,420,200,60,2007,47.690440,-122.370620,0,98117,10,2021
26117,1555000.0,5,2.0,1910,4000,1.5,0,0,0,0,4,8,Oil,PUBLIC,1600,1130,0,210,1921,47.664740,-122.329400,0,98103,11,2021
26118,1313000.0,3,2.0,2020,5800,2.0,0,0,0,1,3,7,Gas,PUBLIC,2020,0,0,520,2011,47.565610,-122.388510,0,98116,6,2021
26119,800000.0,3,2.0,1620,3600,1.0,0,0,1,0,3,7,Gas,PUBLIC,940,920,240,110,1995,47.610395,-122.295850,0,98122,5,2022
