In [94]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [72]:
# Read the housing dataset and discard every row that has a missing value.
df = pd.read_csv('housing_analysis.csv').dropna()

# Regression target, kept as a one-column DataFrame (hence the list selector).
y = df.loc[:, ['Median home value']]

In [73]:
# Show the column labels of the cleaned frame; the feature columns used for
# the regression are picked from this list.
df.columns

Index(['Zip Code', 'Population below poverty level', 'Median household income',
       'Non-White, Non-Hispanic or Latino', 'Hispanic or Latino, of any race',
       'Population with disability', 'Unemployment',
       'Large households (5+ members)',
       'Homes affordable to people earning less than $50,000',
       'Rentals affordable to people earning less than $25,000',
       'Rent-restricted units', 'Housing Choice Voucher holders',
       'Median rent', 'Median home value',
       'Percentage of rental units in poor condition',
       'Percent change in number of housing units, 2000-2012',
       'Owner units affordable to average retail/service worker',
       'Rental units affordable to average retail/service worker',
       'Rental units affordable to average artist',
       'Owner units affordable to average artist',
       'Rental units affordable to average teacher',
       'Owner units affordable to average teacher',
       'Rental units affordable to average tech work

In [74]:
# Predictor columns for the regression, one per line.
# NOTE(review): 'Zip Code' is handed to the model as a plain number —
# confirm that treating it as a numeric feature is intended.
x = df.loc[:, [
    'Zip Code',
    'Median household income',
    'Unemployment',
    'Large households (5+ members)',
    'Median rent',
    'Percentage of housing and transportation costs that is transportation-related',
]]


In [77]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [78]:
# Create the linear-regression estimator, then fit it on the training split.
# fit() returns the estimator itself, so `reg` ends up as the fitted model.
reg = linear_model.LinearRegression()
reg = reg.fit(X_train, y_train)

In [86]:
# Score the fitted model on the held-out split. For scikit-learn regressors
# `score` reports the coefficient of determination (R^2).
reg.score(X_test,y_test)

0.7605904686720812

In [83]:
# Predict the target for the held-out rows. The result has one column because
# the model was fitted against a one-column DataFrame target.
predictions = reg.predict(X_test)
# Bare name as the last expression so the notebook displays the array.
predictions

array([[391841.67732827],
       [279497.00765918],
       [180739.42607687],
       [ 89156.2470006 ],
       [464633.25571448],
       [ 80139.13282184],
       [ 98357.27125503],
       [249334.4687157 ],
       [ 84170.19144835],
       [410476.86363412],
       [324859.50142126],
       [143349.96517949]])

In [84]:
# Actual target values for the same held-out rows, as a NumPy array, for a
# side-by-side comparison with the predictions.
y_test.values

array([[303100],
       [192300],
       [185500],
       [108100],
       [436800],
       [134900],
       [120200],
       [168600],
       [121000],
       [388600],
       [265100],
       [144200]])

In [95]:
# Root-mean-squared error on the held-out rows, in the units of the target.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# actual values go first and the model's predictions second.
np.sqrt(mean_squared_error(y_test.values, predictions))

51668.40326129765

In [87]:
import pickle

In [96]:
# Persist the fitted regressor to disk so it can be reloaded without refitting.
with open('saved_model.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [97]:
# Reload the pickled regressor and predict on the held-out rows as a
# round-trip check against the in-memory model's predictions.
# Unpickling can execute arbitrary code: only load files from a trusted source.
with open('saved_model.pkl', 'rb') as model_file:
    saved_model = pickle.load(model_file)
saved_model.predict(X_test)

array([[391841.67732827],
       [279497.00765918],
       [180739.42607687],
       [ 89156.2470006 ],
       [464633.25571448],
       [ 80139.13282184],
       [ 98357.27125503],
       [249334.4687157 ],
       [ 84170.19144835],
       [410476.86363412],
       [324859.50142126],
       [143349.96517949]])

array([[7.87500e+04, 7.59580e+04, 6.00000e+00, 6.00000e+00, 1.01200e+03,
        3.30000e+01],
       [7.87290e+04, 5.73580e+04, 7.00000e+00, 7.00000e+00, 1.00800e+03,
        3.70000e+01],
       [7.87480e+04, 6.58890e+04, 6.00000e+00, 8.00000e+00, 1.09500e+03,
        4.40000e+01],
       [7.87440e+04, 4.10560e+04, 9.00000e+00, 2.30000e+01, 9.46000e+02,
        4.00000e+01],
       [7.87310e+04, 7.82650e+04, 4.00000e+00, 2.00000e+00, 1.01600e+03,
        3.20000e+01],
       [7.87530e+04, 3.95930e+04, 9.00000e+00, 1.60000e+01, 8.26000e+02,
        4.40000e+01],
       [7.87410e+04, 3.01830e+04, 9.00000e+00, 1.00000e+01, 8.35000e+02,
        4.30000e+01],
       [7.87280e+04, 4.74050e+04, 6.00000e+00, 4.00000e+00, 9.01000e+02,
        4.00000e+01],
       [7.87210e+04, 3.21310e+04, 1.60000e+01, 1.40000e+01, 8.70000e+02,
        4.00000e+01],
       [7.87320e+04, 1.27726e+05, 5.00000e+00, 1.50000e+01, 1.68800e+03,
        3.30000e+01],
       [7.87220e+04, 4.49170e+04, 8.00000e+00, 3.0

In [93]:
# Display the held-out feature rows that the predictions above were made from.
X_test

Unnamed: 0,Zip Code,Median household income,Unemployment,Large households (5+ members),Median rent,Percentage of housing and transportation costs that is transportation-related
28,78750.0,75958,6,6,1012,33
15,78729.0,57358,7,7,1008,37
26,78748.0,65889,6,8,1095,44
23,78744.0,41056,9,23,946,40
17,78731.0,78265,4,2,1016,32
31,78753.0,39593,9,16,826,44
21,78741.0,30183,9,10,835,43
14,78728.0,47405,6,4,901,40
10,78721.0,32131,16,14,870,40
18,78732.0,127726,5,15,1688,33
