# Split Data into Training and Test Sets

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Location of the processed input CSV, relative to this notebook.
filename = '../../data/processed/na-data-int.csv'
# Read the whole file into a single DataFrame with pandas' default parsing.
df = pd.read_csv(filepath_or_buffer=filename)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,year,zipcode,EQI_zip,SFR_zip,RECPI_zip,EQI_MSA,SFR_MSA,RECPI_MSA,EQI_state,SFR_state,RECPI_state
0,1988,1001,0.000815,48.0,0.039108,0.001021,1235.0,1.260888,0.001476,17558.0,25.92194
1,1989,1001,0.001116,44.0,0.0491,0.001168,1049.0,1.225384,0.001751,15343.0,26.866861
2,1990,1001,0.001629,45.0,0.073317,0.001243,841.0,1.045161,0.001857,13556.0,25.172453
3,1991,1001,0.000826,27.0,0.022298,0.001375,714.0,0.981724,0.001823,12798.0,23.330479
4,1992,1001,0.002216,22.0,0.048744,0.001549,760.0,1.176877,0.002111,13289.0,28.052156


In [4]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,year,zipcode,EQI_zip,SFR_zip,RECPI_zip,EQI_MSA,SFR_MSA,RECPI_MSA,EQI_state,SFR_state,RECPI_state
493981,2012,99901,0.000107,30.0,0.003195,0.0001,36.0,0.003602,0.000123,1708.0,0.209606
493982,2013,99901,5.6e-05,26.0,0.001469,6e-05,30.0,0.00181,8.8e-05,2114.0,0.185809
493983,2014,99901,7e-05,32.0,0.002242,6.9e-05,34.0,0.002346,8.8e-05,2260.0,0.198317
493984,2015,99901,7.7e-05,50.0,0.003867,7.6e-05,52.0,0.003957,9.3e-05,3179.0,0.295212
493985,2016,99901,7.6e-05,47.0,0.003571,7.3e-05,54.0,0.003929,8.2e-05,3847.0,0.315002


In [5]:
# The split is performed over distinct zipcodes rather than over rows, so
# both the "feature" and the "label" array are simply the unique zipcodes.
X = pd.unique(df['zipcode'])
y = X.copy()

In [6]:
# Hold out 20% of the zipcodes for testing; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Wrap each of the four split arrays in a single-column DataFrame named
# 'zipcode' so they can be merged on that key later. The tuple of inputs is
# evaluated before any name is rebound, so every frame is built from the
# original array.
X_train, X_test, y_train, y_test = (
    pd.DataFrame(zipcodes, columns=['zipcode'])
    for zipcodes in (X_train, X_test, y_train, y_test)
)

In [8]:
# Five-year windows as [upper_year, lower_year] pairs, newest first.
# Each window is later selected as lower_year < year <= upper_year.
five_years = [[upper, upper - 5] for upper in range(2016, 1991, -5)]

In [9]:
def _pivot_window(panel, upper, lower):
    """Reshape one multi-year window of the panel into one wide row per zipcode.

    Parameters
    ----------
    panel : pandas.DataFrame
        Long-format frame with at least 'year' and 'zipcode' columns; every
        other column is treated as a numeric variable to pivot.
    upper, lower : int
        Window bounds; rows with ``lower < year <= upper`` are selected.

    Returns
    -------
    pandas.DataFrame
        One row per zipcode with columns ``year_<k>_<variable>`` (k is the
        year's offset from ``lower``), plus 'start_year' (the year value in
        slot ``year_1``), 'prediction_start_year' (``start_year - 5``) and
        'prediction_value' (row mean of all columns ending in 'EQI_zip').
    """
    window = panel.loc[(panel['year'] > lower) & (panel['year'] <= upper)]
    # Label each row by its position inside the window (year_1 ... year_5).
    # Deriving the label from the offset to `lower` does not depend on which
    # years happen to be present in the data, and works on a copy so the
    # caller's frame is not modified.
    window = window.assign(
        year_label='year_' + (window['year'] - lower).astype(int).astype(str)
    )
    melted = window.melt(id_vars=['zipcode', 'year_label'])
    pivoted = pd.pivot_table(
        melted,
        values='value',
        index=['zipcode'],
        columns=['year_label', 'variable'],
    )
    # Flatten the (year_label, variable) column MultiIndex into single names.
    pivoted.columns = ['_'.join(col).strip() for col in pivoted.columns.values]
    # Only the first slot's year is kept; it identifies the window's start.
    pivoted = (
        pivoted
        .drop(['year_2_year', 'year_3_year', 'year_4_year', 'year_5_year'], axis=1)
        .rename(columns={'year_1_year': 'start_year'})
        .reset_index()
    )
    # The mean EQI_zip of this window serves as the target for the window
    # that starts five years earlier.
    pivoted['prediction_start_year'] = pivoted['start_year'] - 5
    pivoted['prediction_value'] = pivoted.filter(regex='EQI_zip$', axis=1).mean(axis=1)
    return pivoted


# Collect the per-window frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration. The loop variables are named so they do not
# overwrite the `y` array created for the split.
window_frames = []
for upper, lower in five_years:
    print(upper, lower)
    window_frames.append(_pivot_window(df, upper, lower))

combined = (
    pd.concat(window_frames, ignore_index=True) if window_frames else pd.DataFrame()
)

2016 2011
2011 2006
2006 2001
2001 1996
1996 1991


In [10]:
# Give every training zipcode its wide window rows from `combined`
# (inner join on 'zipcode'), then remove the two target-related columns so
# only feature columns remain.
X_train = (
    X_train
    .merge(combined, on='zipcode')
    .drop(columns=['prediction_start_year', 'prediction_value'])
)

In [12]:
# Same treatment for the held-out zipcodes: inner join on 'zipcode' with
# `combined`, then remove the two target-related columns.
X_test = (
    X_test
    .merge(combined, on='zipcode')
    .drop(columns=['prediction_start_year', 'prediction_value'])
)

In [16]:
# Build the training labels: join the training zipcodes to `combined`, keep
# only the key and the two target columns, and rename them so the frame can
# be joined to the features on ['zipcode', 'start_year'].
y_train = (
    y_train
    .merge(combined, on='zipcode')
    .loc[:, ['zipcode', 'prediction_start_year', 'prediction_value']]
    .rename(columns={'prediction_start_year': 'start_year', 'prediction_value': 'y'})
)

In [17]:
# `train` must be assembled before it can be displayed or saved; without this
# merge a fresh Restart & Run All stops here with a NameError. Labels and
# features are joined on the zipcode and the window start year, the same keys
# used for the test frame.
train = pd.merge(left=y_train, right=X_train, on=['zipcode', 'start_year'])
train.head()

Unnamed: 0,zipcode,start_year,y,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_1_SFR_MSA,...,year_4_SFR_zip,year_5_EQI_MSA,year_5_EQI_state,year_5_EQI_zip,year_5_RECPI_MSA,year_5_RECPI_state,year_5_RECPI_zip,year_5_SFR_MSA,year_5_SFR_state,year_5_SFR_zip
0,13843,2007.0,0.000179,0.000175,0.000334,7.4e-05,0.029841,37.070427,0.000442,171.0,...,7.0,0.000181,0.000312,0.000207,0.026593,35.643444,0.00062,147.0,114296.0,3.0
1,13843,2002.0,0.000127,0.000231,0.000362,0.000306,0.029814,34.266712,0.000917,129.0,...,2.0,0.000183,0.000319,6.8e-05,0.036619,35.56701,0.000271,200.0,111622.0,4.0
2,13843,1997.0,0.000164,0.000243,0.000434,0.000254,0.023309,32.540657,0.000762,96.0,...,1.0,0.000227,0.000394,0.000198,0.024286,32.927654,0.000396,107.0,83609.0,2.0
3,13843,1992.0,0.000282,0.000305,0.000438,0.000344,0.019549,24.280985,0.001033,64.0,...,1.5,0.000472,0.00043,0.000491,0.031155,30.707523,0.000491,66.0,71477.0,1.0
4,7021,2007.0,0.000159,0.000281,0.000296,0.000125,15.110121,21.331076,0.003114,53837.0,...,24.0,0.000244,0.000253,0.000174,13.529889,18.01945,0.004339,55456.0,71184.0,25.0


In [18]:
# Build the test labels the same way as the training labels. Until this
# point `y_test` only holds the held-out zipcodes, so it has no 'start_year'
# or 'y' column to join on. Starting from the de-duplicated zipcode column
# keeps this cell safe to re-run.
y_test = (
    pd.merge(
        left=y_test[['zipcode']].drop_duplicates(),
        right=combined,
        on='zipcode',
    )
    .loc[:, ['zipcode', 'prediction_start_year', 'prediction_value']]
    .rename(columns={'prediction_start_year': 'start_year', 'prediction_value': 'y'})
)
# Join labels to features on the zipcode and the window start year.
test = pd.merge(left=y_test, right=X_test, on=['zipcode', 'start_year'])

In [19]:
# Preview the first five rows of the assembled test frame.
test.head(n=5)

Unnamed: 0,zipcode,start_year,y,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_1_SFR_MSA,...,year_4_SFR_zip,year_5_EQI_MSA,year_5_EQI_state,year_5_EQI_zip,year_5_RECPI_MSA,year_5_RECPI_state,year_5_RECPI_zip,year_5_SFR_MSA,year_5_SFR_state,year_5_SFR_zip
0,61080,2007.0,0.000336,0.000352,0.000356,0.000236,0.301666,25.898994,0.006616,857.0,...,20.0,0.00024,0.000366,0.000232,0.143874,24.32311,0.004402,600.0,66378.0,19.0
1,61080,2002.0,0.00026,0.000617,0.000412,0.000258,0.000617,24.874615,0.008269,1.0,...,30.0,0.000281,0.000353,0.000292,0.251313,25.447918,0.004962,893.0,72015.0,17.0
2,61080,1997.0,0.00033,0.000445,0.000483,0.00051,0.213866,20.994663,0.007135,481.0,...,24.0,0.000423,0.000475,0.000257,0.263686,23.736517,0.004109,623.0,50014.0,16.0
3,61080,1992.0,0.000383,0.00048,0.000496,0.002105,0.222667,16.774063,0.025263,464.0,...,13.0,0.000422,0.000468,0.000351,0.232828,19.821539,0.004908,552.0,42325.0,14.0
4,32839,2007.0,0.000213,0.000233,0.000252,0.000249,6.884016,65.80577,0.090474,29503.0,...,354.0,0.000225,0.000243,0.000207,6.132314,60.965862,0.083882,27265.0,251216.0,405.0


In [20]:
# Write the training frame to disk; the row index is not written.
train_path = '../../data/processed/train.csv'
train.to_csv(path_or_buf=train_path, index=False)

In [21]:
# Write the test frame to disk; the row index is not written.
test_path = '../../data/processed/test.csv'
test.to_csv(path_or_buf=test_path, index=False)