In [4]:
# Import statements
import pandas as pd
import numpy as np


In [30]:
# HPI Data
# Load the house-price-index master file and keep only the rows whose place
# name mentions one of the study cities (case-insensitive substring match;
# rows with a missing place name are excluded).
df = pd.read_csv('datasets/hpi_master.csv')
cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
pattern = "|".join(cities)
mentions_city = df['place_name'].str.contains(pattern, case=False, na=False)
# NOTE(review): these descriptor columns are dropped without filtering on them
# first; if the file carries several hpi_type / hpi_flavor / frequency / level
# variants per place, a city-period may appear more than once -- confirm.
unused_columns = ['hpi_type', 'hpi_flavor', 'frequency', 'level', 'place_id', 'index_sa']
filtered_df = df.loc[mentions_city].drop(columns=unused_columns)
def extract_city(name):
    """Map a raw place name to the first study city it mentions.

    The comparison is case-insensitive and follows the order of the global
    `cities` list. If no city is contained in `name`, `name` is returned
    unchanged.
    """
    lowered = name.lower()
    matching_cities = (city for city in cities if city.lower() in lowered)
    return next(matching_cities, name)

# Collapse each place name to its study city, keep 1990 onwards, and switch to
# the year / city / quarter column names.
filtered_df['place_name'] = filtered_df['place_name'].map(extract_city)
filtered_df = (
    filtered_df.loc[filtered_df['yr'] >= 1990]
    .rename(columns={'yr': 'year', 'place_name': 'city', 'period': 'quarter'})
)
hpi = filtered_df
hpi.head()

Unnamed: 0,city,year,quarter,index_nsa
16354,Chicago,1990,1,83.83
16355,Chicago,1990,2,84.81
16356,Chicago,1990,3,85.8
16357,Chicago,1990,4,85.87
16358,Chicago,1991,1,86.95


In [None]:
#Demographic Data
import pandas as pd
import openpyxl

# Load the yearly income workbook and keep only the columns used below.
income_columns = ['Median Household Income', 'City', 'Year', 'Total Households']
df = pd.read_excel('datasets/income_data.xlsx', sheet_name='Sheet1')

df_less_income = df[income_columns]

def expand_to_quarters(row_1, row_2):
    """Spread the income change between two yearly rows over four quarters.

    Parameters
    ----------
    row_1, row_2 : mapping-like (e.g. a DataFrame row)
        Must provide 'Median Household Income'; `row_1` must also provide
        'City' and 'Year'.

    Returns
    -------
    pandas.DataFrame
        Four rows labelled with `row_1`'s city and year (year as a string) and
        quarters "1".."4". Quarter k holds start + k * (end - start) / 4,
        truncated to int, so quarter 4 equals `row_2`'s income. An empty frame
        is returned when either income is 0.
    """
    start_income = int(row_1['Median Household Income'])
    end_income = int(row_2['Median Household Income'])

    # A zero on either side yields no rows at all.
    if 0 in (start_income, end_income):
        return pd.DataFrame()

    quarterly_step = (end_income - start_income) / 4
    quarter_numbers = range(1, 5)

    rows = {
        'Median Household Income': [
            int(start_income + quarterly_step * quarter) for quarter in quarter_numbers
        ],
        'City': [row_1['City'] for _ in quarter_numbers],
        'Year': [f"{row_1['Year']}" for _ in quarter_numbers],
        'Quarter': [f"{quarter}" for quarter in quarter_numbers],
    }
    return pd.DataFrame(rows)
# Build the quarterly income table by interpolating between consecutive yearly
# rows of the same city.
# Sort first so that row i and row i + 1 really are consecutive years of one
# city (the population pipeline further down does the same before pairing rows).
income_by_city_year = df_less_income.sort_values(by=['City', 'Year']).reset_index(drop=True)

df_new = []
for i in range(len(income_by_city_year) - 1):
    current_row = income_by_city_year.iloc[i]
    next_row = income_by_city_year.iloc[i + 1]
    if current_row['City'] != next_row['City']:
        continue  # Skip if not the same city
    expanded = expand_to_quarters(current_row, next_row)
    if not expanded.empty:
        df_new.append(expanded)
df_new = pd.concat(df_new, ignore_index=True)
df_new = df_new.sort_values(by=['City', 'Year', 'Quarter']).reset_index(drop=True)
#df_new.to_excel('expanded_income_data.xlsx', index=False)
df_new_income = df_new

# Hard-coded anchor values: median household income per city for 1990 and 2000.
data = {
    'Median Household Income': [38909, 26301, 28327, 40328, 45600, 38293, 37625, 41207, 36616, 36687],
    'City': ['New York', 'Chicago', 'Phoenix', 'Houston', 'Los Angeles', 'New York', 'Chicago', 'Phoenix', 'Houston', 'Los Angeles'],
    'Year': ['1990', '1990', '1990', '1990', '1990', '2000', '2000', '2000', '2000', '2000'],
}

df = pd.DataFrame(data)

# Place each anchor on the first quarter-end of its year (1990 -> 1990-03-31).
df['Date'] = pd.to_datetime(df['Year'], format='%Y') + pd.offsets.QuarterEnd(1)

# Step 3: Set up new full date range (1990Q1 to 2009Q4)
# An offset object is used instead of the 'Q' string alias, which newer pandas
# versions deprecate.
quarters = pd.date_range(start='1990-03-31', end='2009-12-31', freq=pd.offsets.QuarterEnd())

# Step 4: Create a multi-index of all City-Quarter combinations
cities = df['City'].unique()
multi_index = pd.MultiIndex.from_product([cities, quarters], names=['City', 'Date'])

# Step 5: Reindex the original DataFrame
df = df.set_index(['City', 'Date'])
df = df.reindex(multi_index)

# Step 6: Interpolate missing values by linear method, one city at a time.
# Interpolating the stacked column as a whole would blend the tail of one city
# into the first anchor of the next. limit_area='inside' restricts filling to
# quarters that lie between two anchors of the same city.
df['Median Household Income'] = (
    df.groupby(level='City')['Median Household Income']
    .transform(lambda incomes: incomes.interpolate(method='linear', limit_area='inside'))
)

# Optional: Reset index to flatten the DataFrame
df = df.reset_index()

# The reindex left 'Year' empty on every non-anchor row and there was no
# 'Quarter' column at all; rebuild both from the date. They are stored as
# strings because the expand_to_quarters* helpers emit string years/quarters.
df['Year'] = df['Date'].dt.year.astype(str)
df['Quarter'] = df['Date'].dt.quarter.astype(str)

# Quarters after a city's last anchor have nothing to interpolate towards.
df = df.dropna(subset=['Median Household Income']).reset_index(drop=True)

# Display final result
#df.to_excel('interpolated_income_data.xlsx', index=False)

df_interpolated_income = df.copy()



# Load the yearly table whose 'Total Population' column is expanded to quarters below.
# NOTE(review): unlike the other inputs this path has no 'datasets/' prefix -- confirm the file location.
quarter_age = pd.read_excel('less_age_data.xlsx', sheet_name='Sheet1')

def expand_to_quarters_age(row_1, row_2):
    """Spread the population change between two yearly rows over four quarters.

    Parameters
    ----------
    row_1, row_2 : mapping-like (e.g. a DataFrame row)
        Must provide 'Total Population'; `row_1` must also provide 'City' and
        'Year'.

    Returns
    -------
    pandas.DataFrame
        Four rows labelled with `row_1`'s city and year (year as a string) and
        quarters "1".."4". Quarter k holds start + k * (end - start) / 4,
        truncated to int, so quarter 4 equals `row_2`'s population. An empty
        frame is returned when either population is 0.
    """
    start_population = int(row_1['Total Population'])
    end_population = int(row_2['Total Population'])

    # A zero on either side yields no rows at all.
    if 0 in (start_population, end_population):
        return pd.DataFrame()

    quarterly_step = (end_population - start_population) / 4
    quarter_numbers = range(1, 5)

    rows = {
        'Total Population': [
            int(start_population + quarterly_step * quarter) for quarter in quarter_numbers
        ],
        'City': [row_1['City'] for _ in quarter_numbers],
        'Year': [f"{row_1['Year']}" for _ in quarter_numbers],
        'Quarter': [f"{quarter}" for quarter in quarter_numbers],
    }
    return pd.DataFrame(rows)
# Pair every yearly population row with the next row of the same city and
# expand each pair into four quarterly rows.
quarter_age = quarter_age.sort_values(by=['City', 'Year']).reset_index(drop=True)
yearly_rows = [quarter_age.iloc[position] for position in range(len(quarter_age))]

df_new_age = []
for current_row, next_row in zip(yearly_rows, yearly_rows[1:]):
    if current_row['City'] == next_row['City']:
        expanded = expand_to_quarters_age(current_row, next_row)
        if not expanded.empty:
            df_new_age.append(expanded)

df_new_age = (
    pd.concat(df_new_age, ignore_index=True)
    .sort_values(by=['City', 'Year', 'Quarter'])
    .reset_index(drop=True)
)
#df_new_age.to_excel('expanded_age_data.xlsx', index=False)
df_new_age_population = df_new_age




#df_income_combined = pd.read_excel('datasets/interpolated_income_data.xlsx', sheet_name='Sheet1')
#df_income_combined_2 = pd.read_excel('datasets/expanded_income_data.xlsx', sheet_name='Sheet1')

KEY_COLUMNS = ['City', 'Year', 'Quarter']
PERIOD_COLUMNS = ['Year', 'Quarter']


def _with_integer_periods(frame):
    """Return a copy of `frame` with Year/Quarter as ints.

    Rows missing either value are dropped; they could never match in the
    inner merge below. The expand_to_quarters* helpers emit Year/Quarter as
    strings, whereas the later GDP cell merges the exported file on integer
    Year/Quarter, so the keys are normalised to int here.
    """
    usable = frame.dropna(subset=PERIOD_COLUMNS).copy()
    usable[PERIOD_COLUMNS] = usable[PERIOD_COLUMNS].astype(int)
    return usable


# Stack the anchor-interpolated incomes on top of the expanded yearly incomes.
combined_income = pd.concat([df_interpolated_income, df_new_income], ignore_index=True)
combined_income = combined_income.drop(columns=['Date'])
combined_income = _with_integer_periods(combined_income)
# Keep one row per city-quarter. When both sources cover the same quarter the
# first one (the anchor-interpolated value) wins, so the merge below cannot
# emit duplicated city-quarters.
combined_income = combined_income.drop_duplicates(subset=KEY_COLUMNS, keep='first').reset_index(drop=True)

#total_age = pd.read_excel('datasets/expanded_age_data.xlsx', sheet_name='Sheet1')
total_age = _with_integer_periods(df_new_age_population)
final_combined = pd.merge(combined_income, total_age, on=KEY_COLUMNS, how='inner')
final_combined = final_combined.sort_values(by=KEY_COLUMNS).reset_index(drop=True)
final_combined.to_excel('datasets/final_combined_data.xlsx', index=False)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# GDP Data

In [11]:
# Unemployment data

In [5]:
# Industrial production
# Average the series within each calendar quarter from 1990 onwards, then
# repeat every quarterly row once per study city.
df = pd.read_csv('datasets/industrial_production.csv')
df = df.assign(observation_date=pd.to_datetime(df['observation_date']))
df = df.assign(year=df['observation_date'].dt.year)
df = df.loc[df['year'] >= 1990]
# Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
df = df.assign(quarter=(df['observation_date'].dt.month - 1) // 3 + 1)

quarterly_avg = df.groupby(['year', 'quarter']).mean(numeric_only=True).reset_index()

cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
repeated_labels = quarterly_avg.index.repeat(len(cities))
quarterly_avg_expanded = quarterly_avg.reindex(repeated_labels).reset_index(drop=True)
# The city list cycles in step with the repeated rows: each quarter gets all cities.
quarterly_avg_expanded['City'] = cities * len(quarterly_avg)
quarterly_avg_expanded.head()

Unnamed: 0,year,quarter,INDPRO,City
0,1990,1,62.1073,Houston
1,1990,1,62.1073,Phoenix
2,1990,1,62.1073,Los Angeles
3,1990,1,62.1073,Chicago
4,1990,1,62.1073,New York


In [6]:
# Volatility Data
# Average the VIX close within each calendar quarter and attach the same
# quarterly series to every study city.
df = pd.read_csv('datasets/VIXCLS.csv')
df['observation_date'] = pd.to_datetime(df['observation_date'])

# Fill gaps in the series by linear interpolation before averaging.
df['VIXCLS'] = df['VIXCLS'].interpolate(method='linear')

df['year'] = df['observation_date'].dt.year
df['quarter'] = df['observation_date'].dt.quarter

quarterly_df = df.groupby(['year', 'quarter'], as_index=False)['VIXCLS'].mean()

cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]

# Build all per-city copies first and concatenate once. Growing a frame with
# pd.concat inside a loop re-copies it on every pass, and starting from an
# empty DataFrame triggers a concat deprecation warning in recent pandas.
vix = pd.concat(
    [quarterly_df.assign(city=city) for city in cities],
    ignore_index=True,
)

vix = vix[['city', 'year', 'quarter', 'VIXCLS']]

vix.head()

Unnamed: 0,city,year,quarter,VIXCLS
0,Houston,1990,1,22.168437
1,Houston,1990,2,18.735385
2,Houston,1990,3,25.134462
3,Houston,1990,4,26.005909
4,Houston,1991,1,22.427344


In [14]:
# Inflation Data

In [1]:
#add national gdp data to the csv file
import pandas as pd

city_file = "datasets/final_combined_data.xlsx"
gdp_file = "datasets/National GDP.csv"

city_df = pd.read_excel(city_file)
gdp_df = pd.read_csv(gdp_file)

# Derive the Year / Quarter merge keys from the observation date.
observation_dates = pd.to_datetime(gdp_df["observation_date"])
gdp_df = gdp_df.assign(
    DATE=observation_dates,
    Year=observation_dates.dt.year,
    Quarter=observation_dates.dt.quarter,
)

# NOTE(review): the column is written as "National GDP1", while the regression
# cell further down reads "National GDP_Billion" from final_combined_data.xlsx
# -- confirm which name (and which file) is the intended hand-off.
gdp_quarterly = gdp_df.loc[:, ["Year", "Quarter", "GDP"]].rename(columns={"GDP": "National GDP1"})

# Left join keeps every city-quarter even when no GDP observation matches.
merged = city_df.merge(gdp_quarterly, on=["Year", "Quarter"], how="left")
merged.to_excel("datasets/test_output.xlsx", index=False)

print(merged.head(20))



       City  Median Household Income  Year  Quarter  Total Population  \
0   Chicago                  26301.0  1990        1           2784500   
1   Chicago                  26584.1  1990        2           2785000   
2   Chicago                  26867.2  1990        3           2785500   
3   Chicago                  27150.3  1990        4           2786000   
4   Chicago                  27433.4  1991        1           2787000   
5   Chicago                  27716.5  1991        2           2788000   
6   Chicago                  27999.6  1991        3           2789000   
7   Chicago                  28282.7  1991        4           2790000   
8   Chicago                  28565.8  1992        1           2790500   
9   Chicago                  28848.9  1992        2           2791000   
10  Chicago                  29132.0  1992        3           2791500   
11  Chicago                  29415.1  1992        4           2792000   
12  Chicago                  29698.2  1993        1

In [11]:
#predict the gdp for 5 cities from 1990 to 2000 based on the national gdp and cities'population

import pandas as pd
from sklearn.linear_model import LinearRegression

pd.set_option('display.float_format', '{:,.0f}'.format)

city_gdp_file = "datasets/5cities (1) (1).csv"
quarter_file = "datasets/final_combined_data.xlsx"
test_file = "datasets/test_with_city_gdp.xlsx"

# The inputs are loaded, reshaped and merged once. The original cell repeated
# this whole pass and round-tripped an intermediate Excel file that was never
# used before being overwritten.
city_gdp_df = pd.read_csv(city_gdp_file)
quarter_df = pd.read_excel(quarter_file)

# Wide (one column per year) -> long (one row per city-year).
city_gdp_long = city_gdp_df.melt(
    id_vars=["GeoFips", "GeoName"],
    var_name="Year",
    value_name="City GDP"
)
city_gdp_long["Year"] = city_gdp_long["Year"].astype(int)
city_gdp_long = city_gdp_long.rename(columns={"GeoName": "City"})

# Every quarter of a year receives that year's city GDP; unmatched years stay NaN.
merged = quarter_df.merge(city_gdp_long, on=["City", "Year"], how="left")

# Same content the original read back from the intermediate file (merge result
# before any prediction); kept in case another cell refers to it.
test_df = merged.drop(columns=["GeoFips"])

# NOTE(review): "National GDP_Billion" must already exist in
# final_combined_data.xlsx; no visible cell writes a column of that name -- confirm.
feature_columns = ["Total Population", "National GDP_Billion"]

# Fit on the rows that have an observed city GDP.
train = merged.dropna(subset=["City GDP"])
X_train = train[feature_columns]
y_train = train["City GDP"]

model = LinearRegression()
model.fit(X_train, y_train)

# Overwrite City GDP for every row before 2001 with the model's prediction.
before_2001 = merged["Year"] < 2001
predict_data = merged[before_2001]
X_pred = predict_data[feature_columns]
merged.loc[before_2001, "City GDP"] = model.predict(X_pred)
merged = merged.drop(columns=["GeoFips"])

merged.to_excel(test_file, index=False)

print(merged.head(10))





      City  Median Household Income  Year  Quarter  Total Population  \
0  Chicago                   26,301  1990        1           2784500   
1  Chicago                   26,584  1990        2           2785000   
2  Chicago                   26,867  1990        3           2785500   
3  Chicago                   27,150  1990        4           2786000   
4  Chicago                   27,433  1991        1           2787000   
5  Chicago                   27,716  1991        2           2788000   
6  Chicago                   28,000  1991        3           2789000   
7  Chicago                   28,283  1991        4           2790000   
8  Chicago                   28,566  1992        1           2790500   
9  Chicago                   28,849  1992        2           2791000   

   National GDP_Billion    City GDP  
0                 5,873  94,145,077  
1                 5,960  97,685,275  
2                 6,015  99,950,725  
3                 6,005  99,627,413  
4                

# Unemployment and Inflation Data

In [None]:
# Import all packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read in all data: the inflation workbook plus one unemployment workbook per city
cpi = pd.read_excel("consumer-price-inflation.xlsx")
(
    chicago_unemployment,
    houston_unemployment,
    phoenix_unemployment,
    la_unemployment,
    ny_unemployment,
) = [
    pd.read_excel(workbook)
    for workbook in (
        "chicago-unemp.xlsx",
        "houston-unemp.xlsx",
        "phoenix-unemp.xlsx",
        "los-angeles-unemp.xlsx",
        "new-york-unemp.xlsx",
    )
]

In [None]:
# Preview the first rows of the raw inflation table.
cpi.head()

Unnamed: 0,Country Code,IMF Country Code,Country,Indicator Type,Series Name,1970,1971,1972,1973,1974,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Note
0,ABW,314.0,Aruba,Inflation,Headline Consumer Price Inflation,,,,,,...,-0.89,-0.474,3.58,4.257,1.222,0.744,5.52,3.363,1.712,Annual average inflation
1,AFG,512.0,Afghanistan,Inflation,Headline Consumer Price Inflation,25.51,25.51,-12.52,-10.68,10.23,...,4.38,4.976,0.63,2.302,5.443,5.062,10.6,-7.714,-6.601186,Annual average inflation
2,AGO,614.0,Angola,Inflation,Headline Consumer Price Inflation,7.97,5.78,15.8,15.67,27.42,...,32.377731,29.844,19.63,17.079,21.024,23.846111,23.826819,13.639,28.240495,Annual average inflation
3,ALB,914.0,Albania,Inflation,Headline Consumer Price Inflation,,,,,,...,1.291234,1.987,2.03,1.411,1.620887,2.041472,6.725203,4.769,2.21449,Annual average inflation
4,ARE,466.0,United Arab Emirates,Inflation,Headline Consumer Price Inflation,21.984699,21.984699,21.984699,21.984699,21.984699,...,1.619795,1.967,3.06,-1.931,-2.079403,0.18,4.827889,1.624,1.663365,Annual average inflation


In [None]:
# Keep only the United States row and discard the descriptive (non-year) columns
metadata_columns = ['Country Code', 'IMF Country Code', 'Indicator Type', 'Series Name', 'Note', 'Country']
cpi_us = cpi.loc[cpi['Country'] == 'United States'].drop(columns=metadata_columns)
cpi_us.head()

Unnamed: 0,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
190,5.8953,4.292765,3.272283,6.177756,11.054799,9.143152,5.744812,6.501684,7.63096,11.254475,...,0.118625,1.261581,2.136,2.44,1.812,1.233584,4.697859,8.0028,4.128,2.949511


In [None]:
# Transpose the dataframe
# After the transpose the year labels form the index and the single US row is
# the only column. The columns are named by position instead of by the row
# label of the US record, so a different row order in the source workbook
# cannot leave the value column unnamed.
assert len(cpi_us) == 1, "expected exactly one United States row in the CPI table"
cpi_us_t = cpi_us.T.reset_index()
cpi_us_t.columns = ['Year', 'CPI']
cpi_us_t.head()

Unnamed: 0,Year,CPI
0,1970,5.8953
1,1971,4.292765
2,1972,3.272283
3,1973,6.177756
4,1974,11.054799


In [None]:
# (rows, columns) of the transposed US table: one row per year column.
cpi_us_t.shape

(55, 2)

In [None]:
# Print the (rows, columns) shape of every unemployment dataset
labelled_frames = (
    ("Chicago Unemployment Shape:", chicago_unemployment),
    ("Houston Unemployment Shape:", houston_unemployment),
    ("Phoenix Unemployment Shape:", phoenix_unemployment),
    ("Los Angeles Unemployment Shape:", la_unemployment),
    ("New York Unemployment Shape:", ny_unemployment),
)
for label, frame in labelled_frames:
    print(label, frame.shape)

Chicago Unemployment Shape: (427, 6)
Houston Unemployment Shape: (427, 6)
Phoenix Unemployment Shape: (427, 6)
Los Angeles Unemployment Shape: (427, 6)
New York Unemployment Shape: (595, 8)


In [None]:
# Print the first rows of every unemployment dataset
previews = (
    ("Chicago", chicago_unemployment),
    ("Houston", houston_unemployment),
    ("Phoenix", phoenix_unemployment),
    ("LA", la_unemployment),
    ("NY", ny_unemployment),
)
for label, frame in previews:
    print(f"{label}: \n {frame.head()}")

Chicago: 
    Year Period  labor force  employment  unemployment  unemployment rate
0  1990    Jan      4165050     3877704        287346                6.9
1  1990    Feb      4158474     3883640        274834                6.6
2  1990    Mar      4158961     3895016        263945                6.3
3  1990    Apr      4159901     3887001        272900                6.6
4  1990    May      4168759     3910275        258484                6.2
Houston: 
    Year Period  labor force  employment  unemployment  unemployment rate
0  1990    Jan      1984076     1881055        103021                5.2
1  1990    Feb      1988590     1886513        102077                5.1
2  1990    Mar      1997856     1903040         94816                4.7
3  1990    Apr      2012121     1916293         95828                4.8
4  1990    May      2032188     1933128         99060                4.9
Phoenix: 
    Year Period  labor force  employment  unemployment  unemployment rate
0  1990    Jan    

In [None]:
# Share of missing values per column (in percent) for each dataset
dfs = [
    chicago_unemployment,
    houston_unemployment,
    ny_unemployment,
    la_unemployment,
    phoenix_unemployment,
    cpi_us_t,
]

for df in dfs:
    missing_counts = df.isna().sum()
    print(missing_counts.div(len(df)).mul(100))

Year                 0.0
Period               0.0
labor force          0.0
employment           0.0
unemployment         0.0
unemployment rate    0.0
dtype: float64
Year                 0.0
Period               0.0
labor force          0.0
employment           0.0
unemployment         0.0
unemployment rate    0.0
dtype: float64
Year                              0.0
Period                            0.0
labor force participation rate    0.0
employment-population ratio       0.0
labor force                       0.0
employment                        0.0
unemployment                      0.0
unemployment rate                 0.0
dtype: float64
Year                 0.0
Period               0.0
labor force          0.0
employment           0.0
unemployment         0.0
unemployment rate    0.0
dtype: float64
Year                 0.0
Period               0.0
labor force          0.0
employment           0.0
unemployment         0.0
unemployment rate    0.0
dtype: float64
Year    0.0
CPI     0

In [None]:
# Unemployment Data Preparation for Merging
def _rate_column_for(frame, city):
    """Keep Year, Period and the unemployment rate, naming the rate column after `city`."""
    selected = frame[['Year', 'Period', 'unemployment rate']]
    return selected.rename(columns={'unemployment rate': city})


chicago_unemp = _rate_column_for(chicago_unemployment, 'Chicago')
houston_unemp = _rate_column_for(houston_unemployment, 'Houston')
phoenix_unemp = _rate_column_for(phoenix_unemployment, 'Phoenix')
la_unemp = _rate_column_for(la_unemployment, 'Los Angeles')
ny_unemp = _rate_column_for(ny_unemployment, 'New York')


In [None]:
# Convert monthly data to quarter in each dataframe
def period_to_quarter(period):
    """Translate a period label to its quarter ('1'..'4') using its first three letters.

    Returns None when the three-letter prefix is not a known month abbreviation.
    """
    quarter_by_month = {
        'Jan': '1', 'Feb': '1', 'Mar': '1',
        'Apr': '2', 'May': '2', 'Jun': '2',
        'Jul': '3', 'Aug': '3', 'Sep': '3',
        'Oct': '4', 'Nov': '4', 'Dec': '4'
    }
    return quarter_by_month.get(period[:3], None)


# Adds a 'Quarter' column to each city frame in place.
for df in [chicago_unemp, houston_unemp, phoenix_unemp, la_unemp, ny_unemp]:
    df['Quarter'] = df['Period'].apply(period_to_quarter)

# Aggregate to quarterly average for each city
def _quarterly_mean(frame, city):
    """Average the `city` column of `frame` within each (Year, Quarter) pair."""
    return frame.groupby(['Year', 'Quarter'], as_index=False)[city].mean()


chicago_quarterly = _quarterly_mean(chicago_unemp, 'Chicago')
houston_quarterly = _quarterly_mean(houston_unemp, 'Houston')
phoenix_quarterly = _quarterly_mean(phoenix_unemp, 'Phoenix')
la_quarterly = _quarterly_mean(la_unemp, 'Los Angeles')
ny_quarterly = _quarterly_mean(ny_unemp, 'New York')


In [None]:
# Merge all quarterly dataframes
from functools import reduce


def _outer_join_on_period(left, right):
    """Outer-join two city tables on Year and Quarter so no period is lost."""
    return pd.merge(left, right, on=['Year', 'Quarter'], how='outer')


data_frames = [chicago_quarterly, houston_quarterly, phoenix_quarterly, la_quarterly, ny_quarterly]
# Fold the list left to right into one wide table with a column per city.
quarterly_unemployment = reduce(_outer_join_on_period, data_frames)
quarterly_unemployment.head()

Unnamed: 0,Year,Quarter,Chicago,Houston,Phoenix,Los Angeles,New York
0,1976,1,,,,,10.266667
1,1976,2,,,,,10.2
2,1976,3,,,,,10.2
3,1976,4,,,,,10.166667
4,1977,1,,,,,9.966667


In [None]:
# Restrict the wide unemployment table to 1990 onwards.
from_1990 = quarterly_unemployment['Year'].ge(1990)
final_unemployment = quarterly_unemployment.loc[from_1990]
final_unemployment

Unnamed: 0,Year,Quarter,Chicago,Houston,Phoenix,Los Angeles,New York
56,1990,1,6.600000,5.000000,4.400000,5.000000,5.133333
57,1990,2,6.400000,5.000000,4.400000,4.900000,5.000000
58,1990,3,6.466667,5.366667,4.533333,5.633333,5.266667
59,1990,4,6.133333,5.100000,4.066667,5.766667,5.933333
60,1991,1,7.166667,5.500000,4.266667,6.833333,6.700000
...,...,...,...,...,...,...,...
194,2024,3,5.500000,4.733333,3.600000,5.933333,4.400000
195,2024,4,4.700000,4.233333,3.266667,5.400000,4.400000
196,2025,1,5.233333,4.366667,3.566667,5.300000,4.300000
197,2025,2,4.700000,4.133333,3.600000,5.133333,4.066667


In [None]:
# Splitting years in quarters for CPI
# Each annual row is repeated four times, then labelled with quarters 1-4;
# every quarter carries the unchanged annual value.
repeated_labels = cpi_us_t.index.repeat(4)
cpi_us_t_quarters = cpi_us_t.loc[repeated_labels].reset_index(drop=True)
cpi_us_t_quarters['Quarter'] = np.tile(np.arange(1, 5), len(cpi_us_t))
cpi_us_t_quarters.head(12)


Unnamed: 0,Year,CPI,Quarter
0,1970,5.8953,1
1,1970,5.8953,2
2,1970,5.8953,3
3,1970,5.8953,4
4,1971,4.292765,1
5,1971,4.292765,2
6,1971,4.292765,3
7,1971,4.292765,4
8,1972,3.272283,1
9,1972,3.272283,2


In [None]:
# Save relevant dataframes to csv (written without the index column)
exports = (
    (final_unemployment, "quarterly_unemployment.csv"),
    (cpi_us_t_quarters, "cpi_us_quarters.csv"),
    (chicago_quarterly, "chicago_quarterly_unemployment.csv"),
    (houston_quarterly, "houston_quarterly_unemployment.csv"),
    (phoenix_quarterly, "phoenix_quarterly_unemployment.csv"),
    (la_quarterly, "la_quarterly_unemployment.csv"),
    (ny_quarterly, "ny_quarterly_unemployment.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)