# Machine Learning 3 - Progress Report 2
## Group 22: Kai Mei, Anshul Subramanya, David Martell

# Processing the training set

In [1]:
import numpy as np 
import pandas as pd
from scipy.stats import norm
import matplotlib
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("Training_Data.csv")
train_data = train_data.transpose() # Transposing so that features are in columns and time in rows
idx_missing_data = np.where(train_data.isnull().any() == True) # Getting which features have missing data

In [3]:
train_data.iloc[:,idx_missing_data[0][11:]] # Starting from 11 because the first 11 features are
# the information downloaded from Bbg and they are missing "RegionName","RegionType","StateName",
# which are not relevant for the first 11 features

Unnamed: 0,8270,8281,8336,8427,8637,8967,9099,9124,9233,9285,...,23068,23085,23095,23122,23126,23127,23179,23182,23187,23211
Feature,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,...,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot
RegionName,10456,11212,19143,11233,60085,35758,30032,63136,48858,30721,...,19964,44817,67070,43462,43013,33854,68933,56016,50581,37878
RegionType,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip,...,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip,Zip
StateName,NY,NY,PA,NY,IL,AL,GA,MO,MI,GA,...,DE,OH,KS,OH,OH,FL,NE,MN,IA,TN
2010-11,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-3,198,261,,358,57,90,44,,113,62,...,,53,28,62,142,50,,74,24,84
2015-4,197,262,,373,58,90,45,,112,63,...,,53,28,62,139,49,,74,24,84
2015-5,197,263,,385,59,90,46,,110,62,...,,53,29,63,137,47,,75,24,85
2015-6,197,264,,388,59,90,47,,109,62,...,,53,29,63,136,47,,75,24,85


In [4]:
# Since some regions have missing data for a large period of the training set, remove them from the data set
train_data_final = train_data.drop(labels=idx_missing_data[0][11:],axis=1)
train_data_final = train_data_final.drop(["RegionName","RegionType","StateName"]) 

In [5]:
train_data_final.iloc[0:20,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23224,23225,23226,23227,23228,23229,23230,23231,23232,23233
Feature,DR Horton Positive Return Next Month,Lennar Corp Positive Return Next Month,PulteGroup Inc Positive Return Next Month,Real Estate Outperformance vs. Market Next Month,US Home Mortgage 30 Year Fixed National Avg Ra...,U-3 US Unemployement Rate (%),US Average Hourly Earnings (YoY %),MBA US US Mortgage Market Applications Index W...,Home Builders Confidence Index (Units),US GDP QoQ SAAR,...,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot
2010-11,1,1,1,0,4.57,9.8,1.7,-16.5,16,3,...,142,141,135,219,105,278,120,123,67,88
2010-12,1,1,1,1,4.99,9.3,1.8,2.3,16,2,...,141,139,134,214,105,279,119,122,66,88
2011-1,0,1,0,1,4.78,9.1,1.9,11.3,16,2,...,142,137,134,208,103,276,117,120,66,87
2011-2,0,0,1,0,4.85,9,1.9,-6.5,16,2,...,142,136,133,201,101,270,113,119,66,86
2011-3,1,1,1,1,4.84,9,1.9,-7.5,17,-1,...,140,135,130,191,100,265,111,117,66,86
2011-4,0,0,1,1,4.7,9.1,1.9,4,16,-1,...,139,135,127,185,98,261,111,118,65,86
2011-5,0,0,0,0,4.55,9,2,-4,16,-1,...,140,135,125,183,97,261,112,123,65,85
2011-6,1,0,0,1,4.57,9.1,2.1,-2.7,13,2.9,...,140,134,125,184,96,263,113,125,64,83
2011-7,0,0,0,1,4.47,9,2.3,7.1,15,2.9,...,141,135,125,186,95,265,113,123,65,82


In [6]:
# Confirming that there are no further missing values
np.where(train_data_final.isnull().any() == True)

(array([], dtype=int64),)

In [7]:
# Final training features
y_DHI_train = train_data_final.iloc[:,0]
y_LEN_train = train_data_final.iloc[:,1]
y_PHM_train = train_data_final.iloc[:,2]
y_REoutperf_train = train_data_final.iloc[:,3]
X_train = train_data_final.iloc[:,4:]

# Processing the validation set

In [8]:
valid_data = pd.read_csv("Validation_Data.csv")
valid_data = valid_data.transpose() # Transposing so that features are in columns and time in rows
valid_data_final = valid_data.drop(labels=idx_missing_data[0][11:],axis=1) # Dropping the same regions as in the train set
valid_data_final = valid_data_final.drop(["RegionName","RegionType","StateName"]) 
np.where(valid_data_final.isnull().any() == True)

(array([], dtype=int64),)

In [9]:
valid_data_final.iloc[0:20,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23224,23225,23226,23227,23228,23229,23230,23231,23232,23233
Feature,DR Horton Positive Return Next Month,Lennar Corp Positive Return Next Month,PulteGroup Inc Positive Return Next Month,Real Estate Outperformance vs. Market Next Month,US Home Mortgage 30 Year Fixed National Avg Ra...,U-3 US Unemployement Rate (%),US Average Hourly Earnings (YoY %),MBA US US Mortgage Market Applications Index W...,Home Builders Confidence Index (Units),US GDP QoQ SAAR,...,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot
2015-8,0,0,0,1,3.9,5.1,2.2,11.3,61,3,...,136,137,111,210,102,305,122,115,69,121
2015-9,1,1,0,0,3.84,5,2.3,-6.7,61,1.3,...,135,137,112,218,103,308,121,115,69,122
2015-10,1,1,1,0,4.01,5,2.5,-0.8,65,1.3,...,134,139,113,223,103,309,121,114,68,123
2015-11,0,0,0,1,3.82,5.1,2.4,-0.2,62,1.3,...,134,141,113,225,104,311,121,114,68,123
2015-12,0,0,0,1,3.9,5,2.5,-17.4,60,0.1,...,135,143,112,226,101,311,120,114,68,124
2016-1,0,0,1,0,3.76,4.9,2.5,-2.6,61,0.1,...,137,144,110,226,95,310,120,113,69,125
2016-2,1,1,1,1,3.84,4.9,2.4,-4.8,58,0.1,...,138,144,110,228,89,309,120,113,69,125
2016-3,0,0,0,0,3.65,5,2.5,-1,58,2,...,139,143,109,231,87,309,120,113,69,125
2016-4,1,1,1,1,3.63,5,2.6,-3.4,58,2,...,140,143,109,232,85,311,120,114,69,126


In [10]:
# Final validation features
y_DHI_valid = valid_data_final.iloc[:,0]
y_LEN_valid = valid_data_final.iloc[:,1]
y_PHM_valid = valid_data_final.iloc[:,2]
y_REoutperf_valid = valid_data_final.iloc[:,3]
X_valid = valid_data_final.iloc[:,4:]

# Processing the test set

In [11]:
test_data = pd.read_csv("Test_Data.csv")
test_data = test_data.transpose() # Transposing so that features are in columns and time in rows
test_data_final = test_data.drop(labels=idx_missing_data[0][11:],axis=1) # Dropping the same regions as in the train set
test_data_final = test_data_final.drop(["RegionName","RegionType","StateName"]) 
np.where(test_data_final.isnull().any() == True)

(array([], dtype=int64),)

In [12]:
test_data_final.iloc[0:20,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23224,23225,23226,23227,23228,23229,23230,23231,23232,23233
Feature,DR Horton Positive Return Next Month,Lennar Corp Positive Return Next Month,PulteGroup Inc Positive Return Next Month,Real Estate Outperformance vs. Market Next Month,US Home Mortgage 30 Year Fixed National Avg Ra...,U-3 US Unemployement Rate (%),US Average Hourly Earnings (YoY %),MBA US US Mortgage Market Applications Index W...,Home Builders Confidence Index (Units),US GDP QoQ SAAR,...,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot,Zillow's Median Value per Sq. Foot
2017-8,1,1,1,0,3.77,4.4,2.6,-2.3,67,2.2,...,147,155,125,252,101,328,134,124,77,141
2017-9,1,1,1,0,3.8,4.2,2.8,-0.4,64,3.2,...,148,157,127,250,101,330,135,125,79,142
2017-10,1,1,1,0,3.83,4.1,2.3,-2.6,68,3.2,...,149,159,128,249,100,331,135,125,80,143
2017-11,1,1,0,0,3.82,4.2,2.5,-3.1,69,3.2,...,149,162,129,249,101,330,136,126,81,144
2017-12,0,0,0,0,3.85,4.1,2.7,-1.6,74,3.5,...,150,165,131,251,103,327,136,127,82,145
2018-1,0,0,0,0,4.18,4.1,2.8,-2.6,72,3.5,...,152,167,132,257,105,327,134,127,82,146
2018-2,1,1,1,1,4.34,4.1,2.6,2.7,71,3.5,...,154,169,132,262,107,332,133,128,83,148
2018-3,1,0,1,1,4.27,4,2.8,-3.3,70,2.5,...,154,171,132,265,106,334,132,128,84,148
2018-4,0,0,0,1,4.42,3.9,2.8,-2.5,68,2.5,...,153,172,133,267,103,334,133,128,83,148


In [13]:
# Final test features
y_DHI_test = valid_data_final.iloc[:,0]
y_LEN_test = valid_data_final.iloc[:,1]
y_PHM_test = valid_data_final.iloc[:,2]
y_REoutperf_test = valid_data_final.iloc[:,3]
X_test = valid_data_final.iloc[:,4:]

In [14]:
print(X_train.shape,X_valid.shape,X_test.shape)

(58, 22966) (25, 22966) (25, 22966)


In [15]:
y_LEN_test

Feature    Lennar Corp Positive Return Next Month
2015-8                                          0
2015-9                                          1
2015-10                                         1
2015-11                                         0
2015-12                                         0
2016-1                                          0
2016-2                                          1
2016-3                                          0
2016-4                                          1
2016-5                                          1
2016-6                                          1
2016-7                                          1
2016-8                                          0
2016-9                                          0
2016-10                                         1
2016-11                                         1
2016-12                                         1
2017-1                                          1
2017-2                                          1
