## Importing dependencies and environmental variables

In [1]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Reading in csv

In [3]:
hist_complete_data = pd.read_csv("../Project3/data/complete_data/other_hist_complete_data.csv")

## Shuffling and splitting the dataset

In [4]:
hist_complete_data = hist_complete_data[hist_complete_data["next_year_housing_price"] != 0]
hist_complete_data_shuffled = shuffle(hist_complete_data)

In [5]:
len(hist_complete_data_shuffled)

910

In [6]:
set1 = hist_complete_data_shuffled[0:227]
set2 = hist_complete_data_shuffled[228:455]
set3 = hist_complete_data_shuffled[456:682]
set4 = hist_complete_data_shuffled[683:910]

In [7]:
set1.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
729,2016,90049,84.747478,17.360389,59.483994,0.923285,-0.25845,0.329119,3,7.78097,2384450.0,2569983.0
205,2012,91344,64.909269,10.670766,23.548497,-0.738709,0.750932,-1.344743,3,15.105856,390466.7,449450.0
834,2017,90001,32.367462,15.453369,2.449862,-0.599565,0.247111,-9.88249,1,7.623352,333733.3,359175.0
35,2011,90057,31.765888,20.913988,10.084916,0.0,0.0,0.0,0,2.462424,234525.0,240300.0
460,2014,90077,87.247839,7.98511,54.238713,30.61798,4.568108,-0.983307,1,5.993923,2004808.0,2124975.0


In [8]:
set2.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
779,2016,91607,78.388792,20.165579,33.80035,5.380399,-0.232534,1.489146,2,10.492689,749966.666667,828658.3
490,2014,91402,47.15531,17.347876,9.762507,-0.495045,5.118627,12.121886,1,6.938776,345041.666667,368983.3
641,2015,94102,44.084906,22.495333,32.596972,5.785087,1.414812,2.638836,13,0.585748,843650.0,848591.7
163,2012,90035,70.124047,19.8933,44.09387,2.523216,0.242451,-5.06318,1,15.631005,758908.333333,877533.3
418,2014,90004,34.542937,16.872551,23.321321,-1.76071,1.171364,2.429233,2,12.74346,892941.666667,1006733.0


In [9]:
set3.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
573,2015,90026,61.777583,22.187682,25.691769,3.13495,3.642117,-5.541319,2,7.587794,728033.333333,783275.0
354,2013,91406,50.677651,16.468623,14.636878,-0.809265,5.758559,2.871391,3,11.408406,389625.0,434075.0
781,2016,94103,45.632501,22.200645,39.958053,8.346812,5.39717,1.937187,24,1.047389,957141.666667,967166.7
366,2013,94105,55.285431,39.132001,77.741761,-8.799643,-1.882704,3.439344,19,16.002085,879208.333333,1019900.0
548,2014,98122,64.629706,27.474012,43.519117,-0.166061,1.354629,-2.462262,8,15.509759,466641.666667,539016.7


In [10]:
set4.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
574,2015,90027,67.062556,24.4201,41.379754,2.717521,5.372534,0.884267,2,13.670993,1166033.0,1325442.0
283,2013,90013,32.595554,18.545242,28.461388,18.602995,6.208738,-4.69866,1,0.0,0.0,444350.0
180,2012,90069,83.771153,21.374459,54.973436,8.289597,-0.432975,-2.318817,0,17.085546,563341.7,659591.7
417,2014,90001,49.893455,16.367156,2.145115,-2.970073,10.297175,-16.920101,1,9.232484,239191.7,261275.0
491,2014,91403,81.242557,15.587402,46.363898,-6.325215,-0.505286,0.416344,3,7.442494,801366.7,861008.3


## Initialzing X and y variables for each set

In [11]:
set1_X = set1[['pct_25_34', 'pct_college_deg', 'pct_wht', 'current_year_housing_price', 'num_coffee_shops']]
set1_y = set1['next_year_housing_price'].values.reshape(-1, 1)

print("Shape: ", set1_X.shape, set1_y.shape)

Shape:  (227, 5) (227, 1)


In [12]:
set2_X = set2[['pct_25_34', 'pct_college_deg', 'pct_wht', 'current_year_housing_price', 'num_coffee_shops']]
set3_X = set3[['pct_25_34', 'pct_college_deg', 'pct_wht', 'current_year_housing_price', 'num_coffee_shops']]
set4_X = set4[['pct_25_34', 'pct_college_deg', 'pct_wht', 'current_year_housing_price', 'num_coffee_shops']]

In [13]:
set2_y = set2['next_year_housing_price'].values.reshape(-1, 1)
set3_y = set3['next_year_housing_price'].values.reshape(-1, 1)
set4_y = set4['next_year_housing_price'].values.reshape(-1, 1)

## Initializing a linear regression model and fitting it to set1

In [14]:
non_scaled_model = LinearRegression()

In [15]:
non_scaled_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
non_scaled_model.fit(set1_X, set1_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Scoring the model's ability to predict the values of the other sets

In [17]:
set2_score = non_scaled_model.score(set2_X, set2_y)
set3_score = non_scaled_model.score(set3_X, set3_y)
set4_score = non_scaled_model.score(set4_X, set4_y)

print(f"Set2 Score: {set2_score}")
print(f"Set3 Score: {set3_score}")
print(f"Set4 Score: {set4_score}")

Set2 Score: 0.9612279245383888
Set3 Score: 0.9877715388539107
Set4 Score: 0.987039503437102
