In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [24]:
data = pd.read_csv("California_Houses.csv")
target = data.pop("Median_House_Value")
data["Median_House_Value"] = target 
data

Unnamed: 0,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco,Median_House_Value
0,8.3252,41,880,129,322,126,37.88,-122.23,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767,452600.0
1,8.3014,21,7099,1106,2401,1138,37.86,-122.22,10225.733072,554279.850069,733236.884360,65049.908574,20880.600400,358500.0
2,7.2574,52,1467,190,496,177,37.85,-122.24,8259.085109,554610.717069,733525.682937,64867.289833,18811.487450,352100.0
3,5.6431,52,1274,235,558,219,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568,341300.0
4,3.8462,52,1627,280,565,259,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25,1665,374,845,330,39.48,-121.09,162031.481121,654530.186299,830631.543047,248510.058162,222619.890417,78100.0
20636,2.5568,18,697,150,356,114,39.49,-121.21,160445.433537,659747.068444,836245.915229,246849.888948,218314.424634,77100.0
20637,1.7000,17,2254,485,1007,433,39.43,-121.22,153754.341182,654042.214020,830699.573163,240172.220489,212097.936232,92300.0
20638,1.8672,18,1860,409,741,349,39.43,-121.32,152005.022239,657698.007703,834672.461887,238193.865909,207923.199166,84700.0


In [25]:
X = data.loc[:, data.columns != "Median_House_Value"]
y = data["Median_House_Value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

print(mean_squared_error(y_test, y_pred, squared = False)) #Standard Deviation of 69,329.9

69329.90859379723


In [26]:
names = [col for col in data]
coefficents = lin_reg.coef_

for i in range(0, 13):
    print(names[i], ": ", coefficents[i])

Median_Income :  39549.112273163584
Median_Age :  876.7007982201466
Tot_Rooms :  -5.827335552040392
Tot_Bedrooms :  100.51410262302889
Population :  -36.97031250068348
Households :  42.25966460162566
Latitude :  -45321.803386700354
Longitude :  -26922.414595456237
Distance_to_coast :  -0.23133513726176985
Distance_to_LA :  -0.15086889429805683
Distance_to_SanDiego :  0.2508466999132291
Distance_to_SanJose :  0.17968551876991418
Distance_to_SanFrancisco :  -0.15741537279603937


In [27]:
### We overrepresent some columns:
### Distance to LA can also be viewed by looking at Latitude and Longitude
### The higher north you are, the further away from LA and SF you are 
### the further you are from LA and SF, the more north you are
### This is a double penilization that is unfavorable. 
### To fix this lets see what happens when we eliminate the Latitude and Longitude columns

In [28]:
D1 = data.drop(columns = ["Latitude", "Longitude"])
D1   #   

Unnamed: 0,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco,Median_House_Value
0,8.3252,41,880,129,322,126,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767,452600.0
1,8.3014,21,7099,1106,2401,1138,10225.733072,554279.850069,733236.884360,65049.908574,20880.600400,358500.0
2,7.2574,52,1467,190,496,177,8259.085109,554610.717069,733525.682937,64867.289833,18811.487450,352100.0
3,5.6431,52,1274,235,558,219,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568,341300.0
4,3.8462,52,1627,280,565,259,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25,1665,374,845,330,162031.481121,654530.186299,830631.543047,248510.058162,222619.890417,78100.0
20636,2.5568,18,697,150,356,114,160445.433537,659747.068444,836245.915229,246849.888948,218314.424634,77100.0
20637,1.7000,17,2254,485,1007,433,153754.341182,654042.214020,830699.573163,240172.220489,212097.936232,92300.0
20638,1.8672,18,1860,409,741,349,152005.022239,657698.007703,834672.461887,238193.865909,207923.199166,84700.0


In [29]:
y1 = D1["Median_House_Value"]
X1 = D1.drop(columns = ["Median_House_Value"])
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.30, random_state = 0)
lin_reg1 = LinearRegression()
lin_reg1.fit(X_train1, y_train1)
y_pred1 = lin_reg1.predict(X_test1)

print(mean_squared_error(y_test1, y_pred1, squared = False)) #Standard Deviation of 70404.2

70404.1852483494


In [30]:
names1 = [col for col in D1]
coefficents1 = lin_reg1.coef_

for i in range(0, 11):
    print(names1[i], ": ", coefficents1[i])

Median_Income :  39869.84365466141
Median_Age :  961.4735723830627
Tot_Rooms :  -6.630392240931829
Tot_Bedrooms :  98.28727592378586
Population :  -37.156187116197586
Households :  49.605179836429286
Distance_to_coast :  -0.6187252133027819
Distance_to_LA :  -0.14515502799150884
Distance_to_SanDiego :  0.05803903050748452
Distance_to_SanJose :  0.04146937995033765
Distance_to_SanFrancisco :  -0.10410452257887008


In [31]:
### Interestingly, the loss of Latitude and Longitude hurts the model by 1000
### This is a shocking result. What is not shocking is the change in most numbers
###
###                                   With Lat/Long         Without Lat/Long
###   Median_Income :              39549.112273157945   |  39549.112273157945
###   Median_Age :                 876.7007982201371    |  961.4735723836396
###   Tot_Rooms :                 -5.827335552016403    | -6.6303922409236655
###   Tot_Bedrooms :               100.51410262308274   |  98.28727592376198
###   Population :                -36.97031250072135    | -37.156187116187844
###   Households :                 42.25966460161104    |  49.605179836383066
###   Latitude :                  -45321.803386773325
###   Longitude :                 -26922.414595504193
###   Distance_to_coast :         -0.23133513726105087  | -0.6187252133031086
###   Distance_to_LA :            -0.1508688942982352   | -0.14515502799154625
###   Distance_to_SanDiego :       0.25084669991324166  |  0.058039030507382086
###   Distance_to_SanJose :        0.17968551877124983  |  0.04146937995032039
###   Distance_to_SanFrancisco :  -0.15741537279749718  | -0.10410452257898443
###
### More Specifically, lets look at the Distances, as nothing else changes
###
###   Distance_to_coast :         -0.23133513726105087  | -0.6187252133031086
###   Distance_to_LA :            -0.1508688942982352   | -0.14515502799154625
###   Distance_to_SanDiego :       0.25084669991324166  |  0.058039030507382086
###   Distance_to_SanJose :        0.17968551877124983  |  0.04146937995032039
###   Distance_to_SanFrancisco :  -0.15741537279749718  | -0.10410452257898443
###
### The numbers for Los Angeles and coast both drastically changed by a scale of at least 100000x. But,
### distance to San Diego, San Jose, and San Francisco barely changed, with Distance from SF now being 
### positive. This is most likely still due to the overrepresentation by LA and the coast. 

In [32]:
### There is still one more solution. Instead of just eliminating Latitude and Longitude, 
### lets make the distances a binary number based on how big the individual counties are. 
### We will almost one hot encode these columns, with the only difference being that 
### they are already seperated and that they are both are numerical. So, lets start with D1

In [33]:
### Length of (dataset uses meters): 
### LA = 42862.5305 m, 
### SF = 7510.4244 m , 
### SD = 18214 m , 
### SJ = 11503.0606 m
###           
### All measurements were taken by Radius Map https://sfplanninggis.org/RadiusMap/

In [58]:
D2 = D1.drop(columns = ["Median_House_Value"])
rad = {"Distance_to_coast": 10000, "Distance_to_LA": 42862.5305, "Distance_to_SanDiego":18214,"Distance_to_SanFrancisco": 7510.4244,"Distance_to_SanJose": 11503.0606 }
for i in rad.keys():
    for j in range(0, D2.shape[0]):
        if(D2[i][j] < rad[i]):
            D2[i][j] = 1
        else:
            D2[i][j] = 0

temp0 = D2.pop("Distance_to_coast")            
temp1 = D2.pop("Distance_to_LA")
temp2 = D2.pop("Distance_to_SanFrancisco")
temp3 = D2.pop("Distance_to_SanDiego")
temp4 = D2.pop("Distance_to_SanJose")
D2["onCoast"] = temp0
D2["inLA"] = temp1
D2["inSanFrancisco"] = temp2
D2["inSanDiego"] = temp3
D2["inSanJose"] = temp4
D2["Median_House_Value"] = target
D2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D2[i][j] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D2[i][j] = 0


Unnamed: 0,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,onCoast,inLA,inSanFrancisco,inSanDiego,inSanJose,Median_House_Value
0,8.3252,41,880,129,322,126,1.0,0.0,0.0,0.0,0.0,452600.0
1,8.3014,21,7099,1106,2401,1138,0.0,0.0,0.0,0.0,0.0,358500.0
2,7.2574,52,1467,190,496,177,1.0,0.0,0.0,0.0,0.0,352100.0
3,5.6431,52,1274,235,558,219,1.0,0.0,0.0,0.0,0.0,341300.0
4,3.8462,52,1627,280,565,259,1.0,0.0,0.0,0.0,0.0,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25,1665,374,845,330,0.0,0.0,0.0,0.0,0.0,78100.0
20636,2.5568,18,697,150,356,114,0.0,0.0,0.0,0.0,0.0,77100.0
20637,1.7000,17,2254,485,1007,433,0.0,0.0,0.0,0.0,0.0,92300.0
20638,1.8672,18,1860,409,741,349,0.0,0.0,0.0,0.0,0.0,84700.0


In [59]:
y2 = D2["Median_House_Value"]
X2 = D2.drop(columns = ["Median_House_Value"])
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.30, random_state = 0)
lin_reg2 = LinearRegression()
lin_reg2.fit(X_train2, y_train2)
y_pred2 = lin_reg2.predict(X_test2)

print(mean_squared_error(y_test2, y_pred2, squared = False)) #Standard Deviation of 70417.7

70417.6923226112


In [60]:
names2 = [col for col in D2]
coefficents2 = lin_reg2.coef_
for i in range(0, 11):
    print(names2[i], ": ", coefficents2[i])

Median_Income :  42191.10755132617
Median_Age :  650.5955276507768
Tot_Rooms :  -6.850889652463237
Tot_Bedrooms :  51.67253605750665
Population :  -32.48444042846152
Households :  90.12606073998191
onCoast :  53615.87181300517
inLA :  47089.32292163445
inSanFrancisco :  91952.73061425587
inSanDiego :  -10876.981406364015
inSanJose :  46774.17674537472


In [43]:
### Here is the final Table
###                                       inCity             With Lat/Long          Without Lat/Long
###   Median_Income :               42191.10755132617   |  39549.112273157945   |  39549.112273157945
###   Median_Age :                  650.5955276507768   |  876.7007982201371    |  961.4735723836396
###   Tot_Rooms :                  -6.850889652463237   | -5.827335552016403    | -6.6303922409236655
###   Tot_Bedrooms :                51.67253605750665   |  100.51410262308274   |  98.28727592376198
###   Population :                 -32.48444042846152   | -36.97031250072135    | -37.156187116187844
###   Households :                  90.12606073998191   |  42.25966460161104    |  49.605179836383066
###   Latitude :                                          -45321.803386773325
###   Longitude :                                         -26922.414595504193
###   Distance_to_/onCoast :        53615.87181300517   | -0.23133513726105087  | -0.6187252133031086
###   Distance_to_/inLA :           47089.32292163445   | -0.1508688942982352   | -0.14515502799154625
###   Distance_to_/inSanDiego :    -10876.981406364015  |  0.25084669991324166  |  0.058039030507382086
###   Distance_to_/inSanJose :      46774.17674537472   |  0.17968551877124983  |  0.04146937995032039
###   Distance_to_/inSanFrancisco : 91952.73061425587   | -0.15741537279749718  | -0.10410452257898443

In [38]:
### When converted, all distance to's became more powerful with their coefficents showing greater strength. 
### However, even so, the model still lost power when compared to the original model with Lat/Long