In [1]:
# Import statements
import pandas as pd
import numpy as np


In [2]:
# HPI Data
# Load the HPI master file and keep only the rows whose place name mentions
# one of the five target cities.
df = pd.read_csv('datasets/hpi_master.csv')
cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
# A single alternation regex lets one pass match any of the city names.
pattern = "|".join(cities)
# Case-insensitive match; rows with a missing place name count as non-matches.
is_target_city = df['place_name'].str.contains(pattern, case=False, na=False)
unused_columns = ['hpi_type', 'hpi_flavor', 'frequency', 'level', 'place_id', 'index_sa']
filtered_df = df.loc[is_target_city].drop(columns=unused_columns)
def extract_city(name, candidates=None):
    """Map a place name to the first candidate city it mentions.

    Parameters
    ----------
    name : str
        Place name to inspect.
    candidates : iterable of str, optional
        City names to look for, in priority order. When omitted, the
        notebook-level ``cities`` list is used.

    Returns
    -------
    str
        The first candidate whose lower-cased form is a substring of the
        lower-cased ``name``; ``name`` unchanged when no candidate matches.
    """
    if candidates is None:
        candidates = cities
    # Lower-case the haystack once instead of on every loop iteration.
    lowered = name.lower()
    for city in candidates:
        if city.lower() in lowered:
            return city
    return name

# Collapse each matched place name to its city label.
filtered_df['place_name'] = filtered_df['place_name'].map(extract_city)
# Keep 1990 onwards, then switch to the column names used for the HPI table.
hpi_column_names = {'yr': 'year', 'place_name': 'city', 'period': 'quarter'}
filtered_df = filtered_df.loc[filtered_df['yr'] >= 1990].rename(columns=hpi_column_names)
hpi = filtered_df
hpi.head()

Unnamed: 0,city,year,quarter,index_nsa
16354,Chicago,1990,1,83.83
16355,Chicago,1990,2,84.81
16356,Chicago,1990,3,85.8
16357,Chicago,1990,4,85.87
16358,Chicago,1991,1,86.95


In [4]:
#Demographic Data
import pandas as pd
import numpy as np

# Load the raw demographic workbooks.
df_income = pd.read_excel('datasets/income_data.xlsx', sheet_name='Sheet1')
df_age_before = pd.read_excel('datasets/less_age_data.xlsx', sheet_name='Sheet1')

income_columns = ['Median Household Income', 'City', 'Year']
df_less_income = df_income[income_columns]

# Hard-coded 1990 and 2000 values appended to the workbook data
# (presumably because the workbook does not cover those years - confirm).
# NOTE(review): for New York, Houston and Los Angeles the 1990 figure is
# higher than the 2000 figure - verify these numbers against their source.
census_cities = ['New York', 'Chicago', 'Phoenix', 'Houston', 'Los Angeles']
income_1990 = [38909, 26301, 28327, 40328, 45600]
income_2000 = [38293, 37625, 41207, 36616, 36687]
data = {
    'Median Household Income': income_1990 + income_2000,
    'City': census_cities * 2,
    'Year': [1990] * len(census_cities) + [2000] * len(census_cities),
}

df_income_before = pd.DataFrame(data)

# Stack the workbook rows on top of the hard-coded rows.
df_income_combined = pd.concat([df_less_income, df_income_before], axis=0)



def _with_year_end_date(annual):
    """Return ``annual`` sorted by City/Year with a year-end ``Date`` column.

    ``Year`` is parsed as 1 January of that year and rolled forward four
    quarter-ends, which lands on 31 December of the same year.
    """
    dated = annual.sort_values(by=['City', 'Year']).reset_index(drop=True)
    dated['Date'] = pd.to_datetime(dated['Year'], format='%Y') + pd.offsets.QuarterEnd(4)
    return dated


def _quarterly_panel(dated, value_col, quarters):
    """Expand per-city dated observations of ``value_col`` onto ``quarters``.

    For each city the observations are left-joined onto the quarterly index,
    gaps are filled by linear interpolation, and any remaining edge gaps are
    forward- then back-filled (so quarters before a city's first observation
    take that first value).

    Returns a frame with columns ``[value_col, 'City', 'Year', 'Quarter']``.
    """
    # NOTE(review): a city with two rows for the same year would duplicate that
    # quarter in the join - confirm (City, Year) is unique in the inputs.
    panels = []
    for city, group in dated.groupby('City'):
        observed = group.set_index('Date').sort_index()[value_col]
        city_quarterly = pd.DataFrame(index=quarters).join(observed, how='left')
        city_quarterly[value_col] = (
            city_quarterly[value_col].interpolate(method='linear').ffill().bfill()
        )
        city_quarterly['City'] = city
        panels.append(city_quarterly)

    panel = pd.concat(panels).reset_index().rename(columns={'index': 'Date'})
    panel['Year'] = panel['Date'].dt.year
    panel['Quarter'] = panel['Date'].dt.quarter
    return panel[[value_col, 'City', 'Year', 'Quarter']]


# Quarter-end dates (Mar/Jun/Sep/Dec) from 1990 Q1 to 2022 Q4. An explicit
# offset object is used instead of the 'Q' string alias, which newer pandas
# versions flag as deprecated.
quarters = pd.date_range(
    start='1990-03-31',
    end='2022-12-31',
    freq=pd.offsets.QuarterEnd(startingMonth=12),
)

# Income Section
income_annual = _with_year_end_date(df_income_combined)
df_income_combined = _quarterly_panel(income_annual, 'Median Household Income', quarters)


# Population Section

age_data_combined = _with_year_end_date(df_age_before)
df_age_combined = _quarterly_panel(age_data_combined, 'Total Population', quarters)

# Merging Section
# Keep only the city/quarter rows present in both panels, ordered by city and
# then chronologically, and export the result to Excel.
merge_keys = ['City', 'Year', 'Quarter']
final_combined = (
    df_income_combined
    .merge(df_age_combined, on=merge_keys, how='inner')
    .sort_values(by=merge_keys)
    .reset_index(drop=True)
)
final_combined.to_excel('datasets/final_combined_data.xlsx', index=False)


  quarters = pd.date_range(start='1990-03-31', end='2022-12-31', freq='Q')


In [4]:
# Industrial production
# Average the industrial production series per calendar quarter (1990 onwards)
# and repeat every quarterly row once per city.
indpro_raw = pd.read_csv('datasets/industrial_production.csv')
observation_dates = pd.to_datetime(indpro_raw['observation_date'])

# Derive year/quarter before filtering: assigning new columns onto a
# boolean-filtered slice is what triggers pandas' SettingWithCopyWarning.
df = indpro_raw.assign(
    observation_date=observation_dates,
    year=observation_dates.dt.year,
    quarter=observation_dates.dt.quarter,
)
df = df.loc[df['year'] >= 1990]

# numeric_only=True keeps the datetime column out of the average.
quarterly_avg = df.groupby(['year', 'quarter']).mean(numeric_only=True).reset_index()

cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
# Each quarterly row is repeated len(cities) times in place, so cycling the
# city list lines up one city per repeated row.
quarterly_avg_expanded = (
    quarterly_avg
    .loc[quarterly_avg.index.repeat(len(cities))]
    .reset_index(drop=True)
)
quarterly_avg_expanded['City'] = cities * len(quarterly_avg)
quarterly_avg_expanded.head()

Unnamed: 0,year,quarter,INDPRO,City
0,1990,1,62.1073,Houston
1,1990,1,62.1073,Phoenix
2,1990,1,62.1073,Los Angeles
3,1990,1,62.1073,Chicago
4,1990,1,62.1073,New York


In [5]:
# Volatility Data
# Quarterly mean of VIXCLS, replicated once per city.
df = pd.read_csv('datasets/VIXCLS.csv')
df['observation_date'] = pd.to_datetime(df['observation_date'])

# Fill missing VIXCLS values linearly before averaging.
df['VIXCLS'] = df['VIXCLS'].interpolate(method='linear')

df['year'] = df['observation_date'].dt.year
df['quarter'] = df['observation_date'].dt.quarter

quarterly_df = df.groupby(['year', 'quarter'], as_index=False)['VIXCLS'].mean()

cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]

# Build every per-city copy first and concatenate once. Growing a frame with
# pd.concat inside a loop re-copies it on each pass, and starting from an empty
# DataFrame goes through pandas' deprecated empty-entry concat path.
vix = pd.concat(
    [quarterly_df.assign(city=city) for city in cities],
    ignore_index=True,
)[['city', 'year', 'quarter', 'VIXCLS']]

vix.head()

Unnamed: 0,city,year,quarter,VIXCLS
0,Houston,1990,1,22.168437
1,Houston,1990,2,18.735385
2,Houston,1990,3,25.134462
3,Houston,1990,4,26.005909
4,Houston,1991,1,22.427344


In [6]:
# Inflation Data - CPI 

# Import necessary libraries
import pandas as pd
import numpy as np

# Load CPI data
cpi = pd.read_excel("datasets/consumer-price-inflation.xlsx")

# Keep only the United States row and drop the descriptive columns, leaving
# one column per year.
metadata_columns = ['Country Code', 'IMF Country Code', 'Indicator Type', 'Series Name', 'Note', 'Country']
cpi_us = cpi.loc[cpi['Country'] == 'United States'].drop(columns=metadata_columns)

# The reshaping below assumes exactly one US series. Fail loudly otherwise
# instead of producing a frame with unexpected columns.
if len(cpi_us) != 1:
    raise ValueError(
        f"Expected exactly one 'United States' row in the CPI sheet, found {len(cpi_us)}"
    )

# Transpose the dataframe: one row per year, with the year label in 'Year'.
cpi_us_t = cpi_us.T.reset_index()
# Name the columns by position rather than by the original row label, so the
# result does not depend on where the US row sits in the spreadsheet.
cpi_us_t.columns = ['Year', 'CPI']

# Splitting years in quarters for CPI: every annual value is repeated for
# quarters 1-4 of its year.
cpi_us_t_quarters = cpi_us_t.loc[cpi_us_t.index.repeat(4)].reset_index(drop=True)
cpi_us_t_quarters['Quarter'] = np.tile([1, 2, 3, 4], len(cpi_us_t))
cpi_us_t_quarters.head()


Unnamed: 0,Year,CPI,Quarter
0,1970,5.8953,1
1,1970,5.8953,2
2,1970,5.8953,3
3,1970,5.8953,4
4,1971,4.292765,1


In [9]:
# Unemployment Data

# Import all packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load unemployment data for 5 cities
unemployment_paths = [
    "datasets/chicago-unemp.xlsx",
    "datasets/houston-unemp.xlsx",
    "datasets/phoenix-unemp.xlsx",
    "datasets/los-angeles-unemp.xlsx",
    "datasets/new-york-unemp.xlsx",
]
(
    chicago_unemployment,
    houston_unemployment,
    phoenix_unemployment,
    la_unemployment,
    ny_unemployment,
) = [pd.read_excel(path) for path in unemployment_paths]


# Unemployment Data Preparation for Merging
def _city_rate(raw, city):
    """Keep Year/Period plus the rate column, with the rate renamed to ``city``."""
    return raw[['Year', 'Period', 'unemployment rate']].rename(columns={'unemployment rate': city})


chicago_unemp = _city_rate(chicago_unemployment, 'Chicago')
houston_unemp = _city_rate(houston_unemployment, 'Houston')
phoenix_unemp = _city_rate(phoenix_unemployment, 'Phoenix')
la_unemp = _city_rate(la_unemployment, 'Los Angeles')
ny_unemp = _city_rate(ny_unemployment, 'New York')

# Convert monthly data to quarter in each dataframe

# Month abbreviation -> quarter label. Built once here instead of on every
# call inside the loop.
# NOTE(review): the labels are strings ('1'..'4'), while the other tables in
# this notebook carry integer quarters - cast before merging on Quarter.
MONTH_TO_QUARTER = {
    'Jan': '1', 'Feb': '1', 'Mar': '1',
    'Apr': '2', 'May': '2', 'Jun': '2',
    'Jul': '3', 'Aug': '3', 'Sep': '3',
    'Oct': '4', 'Nov': '4', 'Dec': '4'
}


def period_to_quarter(period):
    """Return the quarter label for a period string.

    Only the first three characters of ``period`` are used as the month
    abbreviation. Returns '1'..'4', or None when the abbreviation is not a
    recognised month.
    """
    return MONTH_TO_QUARTER.get(period[:3], None)


# Adds the 'Quarter' column to each city's frame in place.
for df in [chicago_unemp, houston_unemp, phoenix_unemp, la_unemp, ny_unemp]:
    df['Quarter'] = df['Period'].apply(period_to_quarter)

# Aggregate to quarterly average for each city
def _quarterly_mean(monthly, city):
    """Mean of the ``city`` column per (Year, Quarter), returned as a flat frame."""
    return monthly.groupby(['Year', 'Quarter'])[city].mean().reset_index()


chicago_quarterly = _quarterly_mean(chicago_unemp, 'Chicago')
houston_quarterly = _quarterly_mean(houston_unemp, 'Houston')
phoenix_quarterly = _quarterly_mean(phoenix_unemp, 'Phoenix')
la_quarterly = _quarterly_mean(la_unemp, 'Los Angeles')
ny_quarterly = _quarterly_mean(ny_unemp, 'New York')

# Merge all quarterly dataframes
from functools import reduce


def _outer_join(left, right):
    """Outer-join two quarterly frames on (Year, Quarter)."""
    return pd.merge(left, right, on=['Year', 'Quarter'], how='outer')


data_frames = [chicago_quarterly, houston_quarterly, phoenix_quarterly, la_quarterly, ny_quarterly]
# Fold the list left to right into one wide table with a column per city.
quarterly_unemployment = reduce(_outer_join, data_frames)

# Restrict to 1990 onwards.
from_1990 = quarterly_unemployment['Year'] >= 1990
final_unemployment = quarterly_unemployment[from_1990]
final_unemployment.head()


Unnamed: 0,Year,Quarter,Chicago,Houston,Phoenix,Los Angeles,New York
56,1990,1,6.6,5.0,4.4,5.0,5.133333
57,1990,2,6.4,5.0,4.4,4.9,5.0
58,1990,3,6.466667,5.366667,4.533333,5.633333,5.266667
59,1990,4,6.133333,5.1,4.066667,5.766667,5.933333
60,1991,1,7.166667,5.5,4.266667,6.833333,6.7


In [None]:
# Add national GDP data to the combined city dataset (Excel workbook)
import pandas as pd

# Inputs: the combined city workbook and the national GDP series.
city_file = "datasets/final_combined_data.xlsx"
gdp_file = "datasets/National GDP.csv"

city_df = pd.read_excel(city_file)
gdp_df = pd.read_csv(gdp_file)

# Derive the (Year, Quarter) join keys from each observation date.
gdp_dates = pd.to_datetime(gdp_df["observation_date"])
gdp_df = gdp_df.assign(
    DATE=gdp_dates,
    Year=gdp_dates.dt.year,
    Quarter=gdp_dates.dt.quarter,
)

gdp_quarterly = gdp_df[["Year", "Quarter", "GDP"]].rename(columns={"GDP": "National GDP1"})

# Left join: every city/quarter row is kept, with the GDP value attached where
# a matching (Year, Quarter) exists.
merged = pd.merge(city_df, gdp_quarterly, how="left", on=["Year", "Quarter"])

# Export is currently disabled:
#merged.to_excel("datasets/test_output.xlsx", index=False)



       City  Median Household Income  Year  Quarter  Total Population  \
0   Chicago                  26301.0  1990        1           2784500   
1   Chicago                  26584.1  1990        2           2785000   
2   Chicago                  26867.2  1990        3           2785500   
3   Chicago                  27150.3  1990        4           2786000   
4   Chicago                  27433.4  1991        1           2787000   
5   Chicago                  27716.5  1991        2           2788000   
6   Chicago                  27999.6  1991        3           2789000   
7   Chicago                  28282.7  1991        4           2790000   
8   Chicago                  28565.8  1992        1           2790500   
9   Chicago                  28848.9  1992        2           2791000   
10  Chicago                  29132.0  1992        3           2791500   
11  Chicago                  29415.1  1992        4           2792000   
12  Chicago                  29698.2  1993        1

In [12]:
# Predict GDP for the 5 cities from 1990 to 2000 based on national GDP and each city's population

import pandas as pd
pd.set_option('display.float_format', '{:,.0f}'.format)

from sklearn.linear_model import LinearRegression

city_gdp_file = "datasets/5cities (1) (1).csv"
quarter_file = "datasets/final_combined_data.xlsx"
test_file = "datasets/test_with_city_gdp.xlsx"

# Regressors used to fit and predict city GDP.
feature_columns = ["Total Population", "National GDP_Billion"]

city_gdp_df = pd.read_csv(city_gdp_file)
quarter_df = pd.read_excel(quarter_file)

# Fail early with an actionable message: the regression needs both feature
# columns to be present in the quarterly workbook.
missing_columns = [col for col in feature_columns if col not in quarter_df.columns]
if missing_columns:
    raise KeyError(
        f"{quarter_file} is missing required column(s) {missing_columns}; "
        "add them to the workbook before running this cell"
    )

# Wide (one column per year) -> long (one row per city and year).
city_gdp_long = city_gdp_df.melt(
    id_vars=["GeoFips", "GeoName"],
    var_name="Year",
    value_name="City GDP"
)
city_gdp_long["Year"] = city_gdp_long["Year"].astype(int)
city_gdp_long.rename(columns={"GeoName": "City"}, inplace=True)

# Attach the annual city GDP to every quarter of the matching city/year; rows
# with no matching city/year get NaN.
merged = (
    quarter_df
    .merge(city_gdp_long, on=["City", "Year"], how="left")
    .drop(columns=["GeoFips"])
)

# Snapshot of the merged data before any values are predicted.
merged.to_excel(test_file, index=False)

# Fit on the rows that have an observed city GDP.
train = merged.dropna(subset=["City GDP"])
X_train = train[feature_columns]
y_train = train["City GDP"]

model = LinearRegression()
model.fit(X_train, y_train)

# Replace City GDP for every row before 2001 with the model's prediction.
# NOTE(review): this also overwrites any observed pre-2001 values - confirm
# the source file contains no city GDP before 2001.
before_2001 = merged["Year"] < 2001
predict_data = merged[before_2001]
X_pred = predict_data[feature_columns]
merged.loc[before_2001, "City GDP"] = model.predict(X_pred)

#merged.to_excel(test_file, index=False)

print(merged.head(10))





      City  Median Household Income  Year  Quarter  Total Population  \
0  Chicago                   26,301  1990        1           2784500   
1  Chicago                   26,584  1990        2           2785000   
2  Chicago                   26,867  1990        3           2785500   
3  Chicago                   27,150  1990        4           2786000   
4  Chicago                   27,433  1991        1           2787000   
5  Chicago                   27,716  1991        2           2788000   
6  Chicago                   28,000  1991        3           2789000   
7  Chicago                   28,283  1991        4           2790000   
8  Chicago                   28,566  1992        1           2790500   
9  Chicago                   28,849  1992        2           2791000   

   National GDP_Billion    City GDP  
0                 5,873  94,145,077  
1                 5,960  97,685,275  
2                 6,015  99,950,725  
3                 6,005  99,627,413  
4                