## 1. Trimming Raw Data - Extract Los Angeles metropolitan data only

Note: Raw data is not uploaded to GitHub. Do not run this section without downloading the raw_data folder!

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Set some options for pandas and to have matplotlib show the charts in the notebook
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline

# Import raw data
house_raw = pd.read_csv('raw_data/house_value_raw.csv', index_col=False)
print(house_raw.head())
house_raw.shape

   RegionID  SizeRank  RegionName RegionType StateName State      City  \
0     61639         0       10025        Zip        NY    NY  New York   
1     84654         1       60657        Zip        IL    IL   Chicago   
2     61637         2       10023        Zip        NY    NY  New York   
3     91982         3       77494        Zip        TX    TX      Katy   
4     84616         4       60614        Zip        IL    IL   Chicago   

                              Metro       CountyName  1996-01-31  ...  \
0       New York-Newark-Jersey City  New York County         nan  ...   
1          Chicago-Naperville-Elgin      Cook County  360,618.00  ...   
2       New York-Newark-Jersey City  New York County         nan  ...   
3  Houston-The Woodlands-Sugar Land    Harris County  200,194.00  ...   
4          Chicago-Naperville-Elgin      Cook County  547,552.00  ...   

    2020-05-31   2020-06-30   2020-07-31   2020-08-31   2020-09-30  \
0 1,251,067.00 1,243,911.00 1,242,393.00 1,243

(30311, 311)

In [2]:
# Trimming data to only include California.
# Filter first, then copy: this avoids duplicating the full unfiltered frame
# and makes the result an explicit, independent copy, so later assignments
# to it do not trigger SettingWithCopyWarning.
house_raw = house_raw.loc[house_raw.State == 'CA'].copy()
print(house_raw.head())
print(house_raw.shape)

    RegionID  SizeRank  RegionName RegionType StateName State           City  \
13     97564        13       94109        Zip        CA    CA  San Francisco   
21     96107        22       90250        Zip        CA    CA      Hawthorne   
39     97771        40       94565        Zip        CA    CA      Pittsburg   
43     96027        44       90046        Zip        CA    CA    Los Angeles   
87     97711        88       94501        Zip        CA    CA        Alameda   

                             Metro            CountyName  1996-01-31  ...  \
13   San Francisco-Oakland-Hayward  San Francisco County  559,314.00  ...   
21  Los Angeles-Long Beach-Anaheim    Los Angeles County  173,716.00  ...   
39   San Francisco-Oakland-Hayward   Contra Costa County         nan  ...   
43  Los Angeles-Long Beach-Anaheim    Los Angeles County  172,655.00  ...   
87   San Francisco-Oakland-Hayward        Alameda County  170,395.00  ...   

     2020-05-31   2020-06-30   2020-07-31   2020-08-31  

In [3]:
# Trimming data to only include the Los Angeles metropolitan area.
# Filter first, then copy: this avoids duplicating the whole frame before
# most rows are discarded, and makes the result an explicit, independent copy.
house_raw = house_raw.loc[house_raw.Metro == 'Los Angeles-Long Beach-Anaheim'].copy()
print(house_raw.head())
print(house_raw.shape)
# Number of rows per county within the selected metro area
print(house_raw.CountyName.value_counts())

# Export trimmed data to csv (index=False: the row index is not written)
house_raw.to_csv('data/1_house_value_LA.csv', index=False)

     RegionID  SizeRank  RegionName RegionType StateName State         City  \
21      96107        22       90250        Zip        CA    CA    Hawthorne   
43      96027        44       90046        Zip        CA    CA  Los Angeles   
98      96015        99       90034        Zip        CA    CA  Los Angeles   
111     96239       112       90805        Zip        CA    CA   Long Beach   
113     96193       114       90650        Zip        CA    CA      Norwalk   

                              Metro          CountyName  1996-01-31  ...  \
21   Los Angeles-Long Beach-Anaheim  Los Angeles County  173,716.00  ...   
43   Los Angeles-Long Beach-Anaheim  Los Angeles County  172,655.00  ...   
98   Los Angeles-Long Beach-Anaheim  Los Angeles County  260,983.00  ...   
111  Los Angeles-Long Beach-Anaheim  Los Angeles County  137,652.00  ...   
113  Los Angeles-Long Beach-Anaheim  Los Angeles County  147,316.00  ...   

      2020-05-31   2020-06-30   2020-07-31   2020-08-31   2020-09-30

## 2. Process data - extract average house values by year

In [4]:
# Reload the trimmed Los Angeles metro data written by the previous section
house = pd.read_csv('data/1_house_value_LA.csv', index_col=False)

# Sanity checks, one per line: preview, dimensions, rows per county, column labels
print(
    house.head(),
    house.shape,
    house.CountyName.value_counts(),
    house.columns,
    sep='\n',
)

   RegionID  SizeRank  RegionName RegionType StateName State         City  \
0     96107        22       90250        Zip        CA    CA    Hawthorne   
1     96027        44       90046        Zip        CA    CA  Los Angeles   
2     96015        99       90034        Zip        CA    CA  Los Angeles   
3     96239       112       90805        Zip        CA    CA   Long Beach   
4     96193       114       90650        Zip        CA    CA      Norwalk   

                            Metro          CountyName  1996-01-31  ...  \
0  Los Angeles-Long Beach-Anaheim  Los Angeles County  173,716.00  ...   
1  Los Angeles-Long Beach-Anaheim  Los Angeles County  172,655.00  ...   
2  Los Angeles-Long Beach-Anaheim  Los Angeles County  260,983.00  ...   
3  Los Angeles-Long Beach-Anaheim  Los Angeles County  137,652.00  ...   
4  Los Angeles-Long Beach-Anaheim  Los Angeles County  147,316.00  ...   

    2020-05-31   2020-06-30   2020-07-31   2020-08-31   2020-09-30  \
0   703,189.00   705,3

In [5]:
# Collect the month labels from the header row; the monthly value columns
# start at column index 9
years = pd.Series(house.columns[9:])
print(years.head(), years.describe(), sep='\n')

# Reduce each month label to its year, i.e. its first 4 characters
years = years.str[:4]
print(years.head(), years.value_counts().head(), sep='\n')

# Relabel the house dataset: the first 9 columns keep their names and every
# monthly column now carries its year, so labels repeat within a year
house.columns = [*house.columns[:9], *years]
print(house.head())

0    1996-01-31
1    1996-02-29
2    1996-03-31
3    1996-04-30
4    1996-05-31
dtype: object
count            302
unique           302
top       2011-10-31
freq               1
dtype: object
0    1996
1    1996
2    1996
3    1996
4    1996
dtype: object
2006    12
2008    12
2010    12
1999    12
2003    12
dtype: int64
   RegionID  SizeRank  RegionName RegionType StateName State         City  \
0     96107        22       90250        Zip        CA    CA    Hawthorne   
1     96027        44       90046        Zip        CA    CA  Los Angeles   
2     96015        99       90034        Zip        CA    CA  Los Angeles   
3     96239       112       90805        Zip        CA    CA   Long Beach   
4     96193       114       90650        Zip        CA    CA      Norwalk   

                            Metro          CountyName       1996  ...  \
0  Los Angeles-Long Beach-Anaheim  Los Angeles County 173,716.00  ...   
1  Los Angeles-Long Beach-Anaheim  Los Angeles County 172,655.00  .

In [7]:
# Create a new data set to store house value by mean value of each year.
# Selecting the identifier columns directly (instead of stacking Series into
# rows and transposing) keeps each column's own dtype rather than casting
# everything to object.
house_yearly = house[['RegionName', 'City', 'CountyName']].copy()
house_yearly.columns = ['zip', 'city', 'county']

# Calculate mean house values for each year and add to house_yearly dataset.
# The monthly columns of `house` are labelled by year, so several columns share
# one label. A boolean column mask always yields a DataFrame, even for a year
# with a single monthly column, where `house.loc[:, year]` would return a
# Series and `.mean(axis=1)` would raise.
for year in years.unique():
    house_yearly[year] = house.loc[:, house.columns == year].mean(axis=1)

print(house_yearly.head())
# Export house_yearly values to csv
house_yearly.to_csv('data/7_house_by_year.csv', index=False)

     zip         city              county       1996       1997       1998  \
0  90250    Hawthorne  Los Angeles County 175,496.25 174,246.42 184,904.75   
1  90046  Los Angeles  Los Angeles County 175,135.00 187,405.25 214,114.33   
2  90034  Los Angeles  Los Angeles County 262,943.58 272,706.58 309,756.17   
3  90805   Long Beach  Los Angeles County 138,491.42 136,872.75 144,182.75   
4  90650      Norwalk  Los Angeles County 146,735.25 144,758.58 151,095.92   

        1999       2000       2001       2002  ...       2012       2013  \
0 199,459.33 218,407.17 241,722.08 277,739.08  ... 372,409.25 432,330.00   
1 242,637.42 271,987.17 296,312.17 327,704.83  ... 627,627.67 749,164.75   
2 351,412.92 384,412.67 422,400.00 476,366.08  ... 732,152.00 854,976.50   
3 152,110.58 166,593.08 183,235.33 208,686.08  ... 261,307.33 312,950.17   
4 161,769.75 174,862.00 191,337.92 220,191.92  ... 290,623.00 336,330.08   

        2014       2015         2016         2017         2018         201