## I am analyzing the 2011-2015 PUMS housing data, found [here](https://www.census.gov/programs-surveys/acs/data/pums.html). The data dictionary which explains the meaning of each column and the values contained within is available [here.](https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2015.html) I want to find an algorithm that can predict the value of the home.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier

In [2]:
#need to be able to read larger outputs
pd.options.display.max_rows = 250

In [3]:
dat = pd.read_csv('ss15hnd.csv')
dat = pd.concat([dat, pd.read_csv('ss15hny.csv'),
                 pd.read_csv('ss15hwy.csv'),
                 pd.read_csv('ss15htx.csv')], ignore_index = True)
dat.rename(columns = {"insp": "INSP"}, inplace = True)

In [4]:
colnames = ["ST", "NP", "ACR", "BATH", "BDSP", "ELEP", "GASP", "INSP",
              "RMSP", "RWAT", "SINK", "VALP", "VEH", "WATP", "YBL",
              "FINCP", "HINCP"]
readable_names = ["State", "Num_People", "Lot_Size", "Has_Bathtub", "Num_Bedrooms",
                  "Monthly_Electric", "Monthly_Gas", "Yearly_Insurance_Cost", "Num_Rooms",
                  "Has_Hot_Water", "Has_Sink", "Value", "Num_Vehicles",
                  "Yearly_Water", "Year_Built", "Family_Income", "Household_Income"]

cols = dat[colnames].copy()

cols.columns = readable_names

cols.dropna(inplace = True)
cols.describe()

Unnamed: 0,State,Num_People,Lot_Size,Has_Bathtub,Num_Bedrooms,Monthly_Electric,Monthly_Gas,Yearly_Insurance_Cost,Num_Rooms,Has_Hot_Water,Has_Sink,Value,Num_Vehicles,Yearly_Water,Year_Built,Family_Income,Household_Income
count,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0
mean,43.520216,3.101925,1.299669,1.001683,3.316505,186.376954,59.355973,1134.024498,6.977479,1.0021,1.001688,253706.3,2.255585,558.435847,5.277415,104792.4,105892.3
std,5.980571,1.356574,0.569388,0.040985,0.934867,106.674852,83.926798,1061.516282,2.264988,0.045773,0.041045,356243.0,0.961443,578.861019,3.101509,102128.9,102320.1
min,36.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,100.0,0.0,1.0,1.0,-16800.0,-16800.0
25%,36.0,2.0,1.0,1.0,3.0,110.0,3.0,500.0,5.0,1.0,1.0,90000.0,2.0,80.0,3.0,46000.0,47100.0
50%,48.0,3.0,1.0,1.0,3.0,160.0,30.0,980.0,7.0,1.0,1.0,160000.0,2.0,450.0,5.0,79607.0,80100.0
75%,48.0,4.0,1.0,1.0,4.0,250.0,80.0,1500.0,8.0,1.0,1.0,300000.0,3.0,840.0,7.0,126440.0,128000.0
max,56.0,20.0,3.0,2.0,12.0,650.0,570.0,8100.0,22.0,2.0,2.0,5216000.0,6.0,3600.0,19.0,2060000.0,2090000.0


### We now have 16 features and a target column ("Value"). We will need to clean the data a little further. The next cell will separate the state column into a column with each state.

In [5]:
cols = pd.get_dummies(cols, columns = ["State"])
readable_names = readable_names[1:]
readable_names.extend(["NY", "ND", "TX", "WY"])
cols.columns = readable_names

### The documentation tells us that some of the values in the year built column correspond to a year and some of them a range of years. We want to change it to the age of the house in years since it is a more reasonable range to work with.

In [6]:
ybl = cols.Year_Built

ybl.loc[ybl == 1.0] = 90
ybl.loc[ybl == 2.0] = 70
ybl.loc[ybl == 3.0] = 60
ybl.loc[ybl == 4.0] = 50
ybl.loc[ybl == 5.0] = 40
ybl.loc[ybl == 6.0] = 30
ybl.loc[ybl == 7.0] = 20
ybl.loc[ybl == 8.0] = 13
ybl.loc[ybl == 9.0] = 10
ybl.loc[ybl == 10.0] = 9
ybl.loc[ybl == 11.0] = 8
ybl.loc[ybl == 12.0] = 7
ybl.loc[ybl == 13.0] = 6
ybl.loc[ybl == 14.0] = 5
ybl.loc[ybl == 15.0] = 4
ybl.loc[ybl == 16.0] = 3
ybl.loc[ybl == 17.0] = 2
ybl.loc[ybl == 18.0] = 1
ybl.loc[ybl == 19.0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
cols.rename(columns={"Year_Built":"Age"}, inplace = True)

In [8]:
Y = cols.Value.copy()
X_names = [c for c in cols.columns if c != "Value"]
X = cols[X_names].copy()

In [9]:
X = pd.DataFrame(preprocessing.normalize(X))
X.columns = X_names
X.describe()

Unnamed: 0,Num_People,Lot_Size,Has_Bathtub,Num_Bedrooms,Monthly_Electric,Monthly_Gas,Yearly_Insurance_Cost,Num_Rooms,Has_Hot_Water,Has_Sink,Num_Vehicles,Yearly_Water,Age,Family_Income,Household_Income,NY,ND,TX,WY
count,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0,407700.0
mean,6.627692e-05,2.944008e-05,2.199643e-05,6.5e-05,0.003344604,0.001067395,0.014136,0.000132064,2.207931e-05,2.2019e-05,3.8e-05,0.008856642,0.000888,0.697569,0.707965,7e-06,4.442247e-07,1.4e-05,3.94673e-07
std,0.001888253,0.0007003365,0.0004113219,0.000807,0.02202977,0.0105688,0.049879,0.001563581,0.0004133322,0.0004112528,0.000405,0.03813497,0.008559,0.064953,0.055243,0.000218,2.868221e-05,0.000198,4.188746e-05
min,7.319914e-07,3.40765e-07,3.40765e-07,0.0,5.528577e-07,7.262047e-07,0.0,5.987355e-07,3.40765e-07,3.40765e-07,0.0,4.524008e-07,0.0,-0.970353,-0.707107,0.0,0.0,0.0,0.0
25%,1.545568e-05,6.164258e-06,5.567537e-06,1.8e-05,0.0008122103,3.558896e-05,0.003581,3.778339e-05,5.567538e-06,5.567538e-06,1.2e-05,0.0006512817,0.000131,0.706977,0.706999,0.0,0.0,0.0,0.0
50%,2.570398e-05,1.063305e-05,8.838377e-06,2.8e-05,0.001425573,0.0002286502,0.007448,5.878211e-05,8.838388e-06,8.838381e-06,1.9e-05,0.003326583,0.000327,0.707067,0.707072,0.0,0.0,5e-06,0.0
75%,4.419414e-05,1.963975e-05,1.510492e-05,4.7e-05,0.00258053,0.0007592265,0.013539,9.706633e-05,1.510539e-05,1.510531e-05,3.1e-05,0.008345852,0.000689,0.707094,0.707096,6e-06,0.0,1.1e-05,0.0
max,0.7474351,0.2621112,0.1747408,0.246632,0.9998752,0.9730085,0.999985,0.4439373,0.1747408,0.1747408,0.167248,0.9996247,0.979535,0.992709,0.999999,0.08737,0.009256879,0.083624,0.01986929


In [10]:
Y.reset_index(inplace = True, drop = True)

In [11]:
sum(Y < 10000)

6108

### There are quite a few observations where the home is valued at less than $10,000. I first want to see if these have any predictive power or if they are errors. Since extremely low prices are quite low in proportion to the total dataset I am undersampling the larger class.

In [12]:
%%time
#split into test and train
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4)
#the random state here is chosen explicitly so that the code is reproducible

CPU times: user 670 ms, sys: 112 ms, total: 782 ms
Wall time: 840 ms


In [13]:
ythis = Y[Y < 10000].copy()
ythis = pd.concat([ythis, y_train[y_train >= 10000].head(len(ythis)).copy()])
ythis[ythis < 10000] = 1
ythis[ythis > 1] = 0

xthis = X.loc[ythis.index, :]

In [14]:
xtrainthis, xtestthis, ytrainthis, ytestthis= train_test_split(xthis,
                                                               ythis,
                                                               test_size = .3,
                                                               random_state=4)

In [15]:
%%time

clf = RandomForestClassifier(n_estimators = 50, n_jobs = -1)
clf.fit(xtrainthis, ytrainthis)

CPU times: user 1.45 s, sys: 29.8 ms, total: 1.48 s
Wall time: 894 ms


In [16]:
%%time
clf.score(xtestthis, ytestthis)

CPU times: user 65.8 ms, sys: 8.08 ms, total: 73.9 ms
Wall time: 120 ms


0.90013642564802188

### Apparently there is a relatively large level of predictability with the extremely low priced homes. Let's look at how well we can predict low (<\$100,000), medium (\$100,000-\$500,000), and high (>\$500,000) priced homes.

In [17]:
sum(Y < 100000)/len(Y)

0.28621535442727497

In [18]:
ythis = Y[Y < 100000].copy()
ythis = pd.concat([ythis, y_train[y_train >= 100000].head(len(ythis)).copy()])
ythis[ythis < 100000] = 1
ythis[ythis > 1] = 0

xthis = X.loc[ythis.index, :]

In [19]:
xtrainthis, xtestthis, ytrainthis, ytestthis= train_test_split(xthis,
                                                               ythis,
                                                               test_size = .3,
                                                               random_state=4)

In [20]:
%%time

clf = RandomForestClassifier(n_estimators = 50, n_jobs = -1)
clf.fit(xtrainthis, ytrainthis)

CPU times: user 57.7 s, sys: 898 ms, total: 58.6 s
Wall time: 31.8 s


In [21]:
%%time
clf.score(xtestthis, ytestthis)

CPU times: user 1.84 s, sys: 116 ms, total: 1.95 s
Wall time: 1.2 s


0.80998086097066302

The majority of the homes are between \$100,000 and \$500,000. We will not want to undersample here.

In [22]:
sum((Y>=100000) & (Y<500000))/len(Y)

0.60302918812852591

In [23]:
ythis = Y.copy()
ythis[(ythis>=100000) & (ythis<500000)] = 1
ythis[ythis > 1] = 0

xthis = X.loc[ythis.index, :]

In [24]:
xtrainthis, xtestthis, ytrainthis, ytestthis= train_test_split(xthis,
                                                               ythis,
                                                               test_size = .3,
                                                               random_state=4)

In [25]:
%%time

clf = RandomForestClassifier(n_estimators = 50, n_jobs = -1)
clf.fit(xtrainthis, ytrainthis)

CPU times: user 2min 27s, sys: 1.97 s, total: 2min 29s
Wall time: 1min 20s


In [26]:
%%time
clf.score(xtestthis, ytestthis)

CPU times: user 5.1 s, sys: 231 ms, total: 5.33 s
Wall time: 3.12 s


0.74539285422287627

In [27]:
sum(Y >= 500000)/len(Y)

0.11075545744419917

In [28]:
ythis = Y[Y >= 500000].copy()
ythis = pd.concat([ythis, y_train[y_train < 500000].head(len(ythis)).copy()])
ythis[ythis >= 500000] = 1
ythis[ythis > 1] = 0

xthis = X.loc[ythis.index, :]

In [29]:
xtrainthis, xtestthis, ytrainthis, ytestthis= train_test_split(xthis,
                                                               ythis,
                                                               test_size = .3,
                                                               random_state=4)

In [30]:
%%time

clf = RandomForestClassifier(n_estimators = 50, n_jobs = -1)
clf.fit(xtrainthis, ytrainthis)

CPU times: user 17.3 s, sys: 419 ms, total: 17.7 s
Wall time: 9.64 s


In [31]:
%%time
clf.score(xtestthis, ytestthis)

CPU times: user 579 ms, sys: 44.4 ms, total: 624 ms
Wall time: 436 ms


0.83110028420625259

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
%%time

reg = RandomForestRegressor(n_estimators=50, n_jobs = -1)
reg.fit(X_train, y_train)

CPU times: user 8min 37s, sys: 5.72 s, total: 8min 42s
Wall time: 5min 33s


In [34]:
%%time
print("Train score:", reg.score(X_train, y_train))
print("Test score:", reg.score(X_test, y_test))

Train score: 0.916940514293
Test score: 0.402066828148
CPU times: user 24.5 s, sys: 3.43 s, total: 27.9 s
Wall time: 31.5 s


### This regressor is incredibly overfit. Further work is necessary to get a less overfitted model.