In [4]:
# Import statements
import pandas as pd
import numpy as np


In [30]:
# HPI Data
# Target cities; this list also feeds the place-name clean-up further down the cell.
cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
pattern = "|".join(cities)  # regex alternation over the city names

df = pd.read_csv('datasets/hpi_master.csv')

# Case-insensitive match on place_name; rows with a missing name count as non-matches.
mentions_target_city = df['place_name'].str.contains(pattern, case=False, na=False)
unused_columns = ['hpi_type', 'hpi_flavor', 'frequency', 'level', 'place_id', 'index_sa']
filtered_df = df.loc[mentions_target_city].drop(columns=unused_columns)
def extract_city(name):
    """Return the first entry of the module-level ``cities`` list found in ``name``.

    The comparison is case-insensitive and substring-based. When no city
    matches, ``name`` is returned unchanged.
    """
    return next(
        (city for city in cities if city.lower() in name.lower()),
        name,
    )

# Collapse raw place names to bare city labels, keep 1990 onward, and switch
# to the city / year / quarter column names.
hpi = (
    filtered_df
    .assign(place_name=filtered_df['place_name'].apply(extract_city))
    .loc[lambda frame: frame['yr'] >= 1990]
    .rename(columns={'yr': 'year', 'place_name': 'city', 'period': 'quarter'})
)
filtered_df = hpi  # both names refer to the same cleaned frame
hpi.head()

Unnamed: 0,city,year,quarter,index_nsa
16354,Chicago,1990,1,83.83
16355,Chicago,1990,2,84.81
16356,Chicago,1990,3,85.8
16357,Chicago,1990,4,85.87
16358,Chicago,1991,1,86.95


In [None]:
#Demographic Data
import pandas as pd
import openpyxl

# Household-income workbook, worksheet 'Sheet1'.
df = pd.read_excel('datasets/income_data.xlsx', sheet_name='Sheet1')

# Narrow the sheet to the four columns kept for the income pipeline.
income_columns = ['Median Household Income', 'City', 'Year', 'Total Households']
df_less_income = df.loc[:, income_columns]

def expand_to_quarters(row_1, row_2, value_column='Median Household Income'):
    """Spread the change between two consecutive annual rows over four quarters.

    Parameters
    ----------
    row_1, row_2 : mapping-like (e.g. a pandas Series or dict)
        Two annual observations. ``row_1`` supplies ``'City'`` and ``'Year'``
        for the output; both rows supply ``value_column``.
    value_column : str, default 'Median Household Income'
        Name of the annual measure to interpolate. The default keeps the
        original income behaviour; any other numeric column (for example
        ``'Total Population'``) can be expanded with the same routine.

    Returns
    -------
    pandas.DataFrame
        Four rows with columns ``value_column``, ``'City'``, ``'Year'`` and
        ``'Quarter'`` (year and quarter are strings), or an empty frame when
        either annual value is 0.

    Notes
    -----
    Quarter k receives ``int(start + k * (end - start) / 4)`` for k = 1..4,
    so quarter 4 equals the value of ``row_2`` and results are truncated
    toward zero.
    """
    start_value = int(row_1[value_column])
    end_value = int(row_2[value_column])

    # Zero values are skipped entirely.
    # NOTE(review): presumably 0 marks a missing observation; nothing below
    # divides by these values, so this is not a division-by-zero guard.
    if start_value == 0 or end_value == 0:
        return pd.DataFrame()

    slope = (end_value - start_value) / 4
    quarter_values = [int(start_value + slope * (i + 1)) for i in range(4)]

    quarters = [f"{i+1}" for i in range(4)]

    data = {
        value_column: quarter_values,
        'City': [row_1['City']] * 4,
        'Year': [f"{row_1['Year']}"] * 4,
        'Quarter': quarters
    }

    return pd.DataFrame(data)
# Expand every pair of consecutive annual rows of the same city into four
# quarterly rows.
# NOTE(review): the loop previously read `df_less`, a name no cell in this
# notebook defines (NameError on a fresh kernel). `df_less_income`, built
# just above, is assumed to be the intended frame - confirm.
# Rows are sorted by city and year first so that neighbouring rows really are
# consecutive years of one city, mirroring the age/population loop below.
income_sorted = df_less_income.sort_values(by=['City', 'Year']).reset_index(drop=True)

df_new = []
for i in range(len(income_sorted) - 1):
    row_1 = income_sorted.iloc[i]
    row_2 = income_sorted.iloc[i + 1]
    if row_1['City'] != row_2['City']:
        continue  # Skip if not the same city (checked before expanding)
    expanded = expand_to_quarters(row_1, row_2)
    if not expanded.empty:
        df_new.append(expanded)
df_new = pd.concat(df_new, ignore_index=True)
df_new = df_new.sort_values(by=['City', 'Year', 'Quarter']).reset_index(drop=True)
#df_new.to_excel('expanded_income_data.xlsx', index=False)
df_new_income = df_new

# Known median household incomes per city for the years 1990 and 2000.
data = {
    'Median Household Income': [38909, 26301, 28327, 40328, 45600, 38293, 37625, 41207, 36616, 36687],
    'City': ['New York', 'Chicago', 'Phoenix', 'Houston', 'Los Angeles', 'New York', 'Chicago', 'Phoenix', 'Houston', 'Los Angeles'],
    'Year': ['1990', '1990', '1990', '1990', '1990', '2000', '2000', '2000', '2000', '2000'],
}

df = pd.DataFrame(data)

# Anchor each annual figure at the end of that year's first quarter
# ('1990' -> 1990-01-01 -> 1990-03-31).
df['Date'] = pd.to_datetime(df['Year']) + pd.offsets.QuarterEnd(1)

# Step 3: Set up new full date range (1990Q1 to 2009Q4)
# An offset object is used instead of the 'Q' alias, which newer pandas
# releases deprecate; the resulting quarter-end dates are the same.
quarters = pd.date_range(start='1990-03-31', end='2009-12-31', freq=pd.offsets.QuarterEnd())

# Step 4: Create a multi-index of all City-Quarter combinations
cities = df['City'].unique()
multi_index = pd.MultiIndex.from_product([cities, quarters], names=['City', 'Date'])

# Step 5: Reindex the original DataFrame
df = df.set_index(['City', 'Date'])
df = df.reindex(multi_index)

# Step 6: Interpolate missing values by linear method, one city at a time.
# Interpolating the whole column at once would blend the quarters after one
# city's 2000 value into the next city's 1990 value.
# NOTE(review): quarters after the last known value (2000Q1) are held flat at
# that value by pandas' default forward fill; confirm that is wanted and
# that these rows do not overlap the expanded annual income rows.
df['Median Household Income'] = (
    df.groupby(level='City')['Median Household Income']
      .transform(lambda series: series.interpolate(method='linear'))
)

# Optional: Reset index to flatten the DataFrame
df = df.reset_index()

# Reindexing leaves 'Year' empty on every inserted quarter and there is no
# 'Quarter' column at all, so derive both from the date. They are stored as
# strings to match the keys produced by expand_to_quarters.
df['Year'] = df['Date'].dt.year.astype(str)
df['Quarter'] = df['Date'].dt.quarter.astype(str)

# Display final result
#df.to_excel('interpolated_income_data.xlsx', index=False)

df_interpolated_income = df.copy()



# Load worksheet 'Sheet1' of the workbook that supplies the 'City', 'Year' and
# 'Total Population' columns used by the quarterly expansion below.
# NOTE(review): unlike the other inputs in this notebook, this path has no
# 'datasets/' prefix - confirm the file location.
quarter_age = pd.read_excel('less_age_data.xlsx', sheet_name='Sheet1')

def expand_to_quarters_age(row_1, row_2):
    """Spread the population change between two annual rows over four quarters.

    ``row_1`` and ``row_2`` are mapping-like rows exposing 'Total Population';
    ``row_1`` also provides the 'City' and 'Year' written to the output.
    Returns a four-row DataFrame (quarters "1".."4", year as a string), or an
    empty DataFrame when either population value is 0. Quarter k holds
    ``int(first + k * (second - first) / 4)``, so quarter 4 equals the second
    row's value.
    """
    first_total = int(row_1['Total Population'])
    second_total = int(row_2['Total Population'])

    # Rows with a zero total are not expanded.
    if 0 in (first_total, second_total):
        return pd.DataFrame()

    quarterly_step = (second_total - first_total) / 4

    population_by_quarter = []
    quarter_labels = []
    for i in range(4):
        population_by_quarter.append(int(first_total + quarterly_step * (i + 1)))
        quarter_labels.append(f"{i+1}")

    return pd.DataFrame({
        'Total Population': population_by_quarter,
        'City': [row_1['City']] * 4,
        'Year': [f"{row_1['Year']}"] * 4,
        'Quarter': quarter_labels,
    })
# Order by city then year so that neighbouring rows are successive years of
# the same city.
quarter_age = quarter_age.sort_values(by=['City', 'Year']).reset_index(drop=True)

df_new_age = []
for position in range(1, len(quarter_age)):
    earlier_row = quarter_age.iloc[position - 1]
    later_row = quarter_age.iloc[position]
    # Only pairs within one city are expanded; city boundaries are skipped.
    if earlier_row['City'] == later_row['City']:
        quarterly_rows = expand_to_quarters_age(earlier_row, later_row)
        if not quarterly_rows.empty:
            df_new_age.append(quarterly_rows)

df_new_age = (
    pd.concat(df_new_age, ignore_index=True)
    .sort_values(by=['City', 'Year', 'Quarter'])
    .reset_index(drop=True)
)
#df_new_age.to_excel('expanded_age_data.xlsx', index=False)
df_new_age_population = df_new_age




# The income and population tables are taken from the in-memory frames built
# above (earlier versions read them back from exported workbooks).

# Stack both income tables, remove exact duplicate rows, and drop the helper
# 'Date' column.
combined_income = (
    pd.concat([df_interpolated_income, df_new_income])
    .drop_duplicates()
    .reset_index(drop=True)
    .drop(columns=['Date'])
)

total_age = df_new_age_population

# Inner join: only City/Year/Quarter keys present in both tables survive.
merge_keys = ['City', 'Year', 'Quarter']
final_combined = (
    combined_income
    .merge(total_age, on=merge_keys, how='inner')
    .sort_values(by=merge_keys)
    .reset_index(drop=True)
)
final_combined.to_excel('datasets/final_combined_data.xlsx', index=False)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# GDP Data

In [11]:
# Unemployment data

In [5]:
# Industrial production
# Load the industrial production series and parse its observation dates.
df = pd.read_csv('datasets/industrial_production.csv')
df['observation_date'] = pd.to_datetime(df['observation_date'])

# Build the year/quarter keys in one chain. Assigning new columns onto a
# boolean-filtered slice (as before) can raise SettingWithCopyWarning;
# .assign returns a fresh frame instead. `.dt.quarter` equals the previous
# ((month - 1) // 3) + 1 arithmetic.
df = (
    df.assign(year=df['observation_date'].dt.year)
      .loc[lambda frame: frame['year'] >= 1990]
      .assign(quarter=lambda frame: frame['observation_date'].dt.quarter)
)

# Average the numeric columns within each quarter.
quarterly_avg = df.groupby(['year', 'quarter']).mean(numeric_only=True).reset_index()

# Repeat every quarterly row once per city so the series can be joined to
# city-level tables.
# NOTE(review): the key column is 'City' here while the HPI and VIX frames
# use lowercase 'city' - confirm which spelling downstream merges expect.
cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
quarterly_avg_expanded = (
    quarterly_avg.loc[quarterly_avg.index.repeat(len(cities))].reset_index(drop=True)
)
quarterly_avg_expanded['City'] = cities * len(quarterly_avg)
quarterly_avg_expanded.head()

Unnamed: 0,year,quarter,INDPRO,City
0,1990,1,62.1073,Houston
1,1990,1,62.1073,Phoenix
2,1990,1,62.1073,Los Angeles
3,1990,1,62.1073,Chicago
4,1990,1,62.1073,New York


In [6]:
# Volatility Data
# Load the VIX close series and parse its observation dates.
df = pd.read_csv('datasets/VIXCLS.csv')
df['observation_date'] = pd.to_datetime(df['observation_date'])

# Fill gaps in the series linearly before averaging.
df['VIXCLS'] = df['VIXCLS'].interpolate(method='linear')

df['year'] = df['observation_date'].dt.year
df['quarter'] = df['observation_date'].dt.quarter

# Mean VIX per calendar quarter.
quarterly_df = df.groupby(['year', 'quarter'], as_index=False)['VIXCLS'].mean()

# Copy the quarterly series once per city and stack the copies in a single
# concat. Growing a frame inside a loop re-copies it on every pass, and
# seeding it with an empty DataFrame triggers a deprecation warning on recent
# pandas versions.
cities = ["Houston", "Phoenix", "Los Angeles", "Chicago", "New York"]
vix = pd.concat(
    [quarterly_df.assign(city=city) for city in cities],
    ignore_index=True,
)

vix = vix[['city', 'year', 'quarter', 'VIXCLS']]

vix.head()

Unnamed: 0,city,year,quarter,VIXCLS
0,Houston,1990,1,22.168437
1,Houston,1990,2,18.735385
2,Houston,1990,3,25.134462
3,Houston,1990,4,26.005909
4,Houston,1991,1,22.427344


In [14]:
# Inflation Data

In [1]:
# Add national GDP data to the combined city dataset and save the result as an Excel workbook
import pandas as pd

# Inputs: the combined city workbook and the national GDP series.
city_file = "datasets/final_combined_data.xlsx"
gdp_file = "datasets/National GDP.csv"

city_df = pd.read_excel(city_file)
gdp_df = pd.read_csv(gdp_file)

# Derive the Year/Quarter join keys from each observation date.
gdp_df["DATE"] = pd.to_datetime(gdp_df["observation_date"])
observation_dates = gdp_df["DATE"].dt
gdp_df["Year"] = observation_dates.year
gdp_df["Quarter"] = observation_dates.quarter

gdp_quarterly = gdp_df.loc[:, ["Year", "Quarter", "GDP"]].rename(
    columns={"GDP": "National GDP1"}
)

# Left join: every city row is kept, with GDP missing where no quarter matches.
merged = pd.merge(city_df, gdp_quarterly, how="left", on=["Year", "Quarter"])
merged.to_excel("datasets/test_output.xlsx", index=False)

print(merged.head(20))



       City  Median Household Income  Year  Quarter  Total Population  \
0   Chicago                  26301.0  1990        1           2784500   
1   Chicago                  26584.1  1990        2           2785000   
2   Chicago                  26867.2  1990        3           2785500   
3   Chicago                  27150.3  1990        4           2786000   
4   Chicago                  27433.4  1991        1           2787000   
5   Chicago                  27716.5  1991        2           2788000   
6   Chicago                  27999.6  1991        3           2789000   
7   Chicago                  28282.7  1991        4           2790000   
8   Chicago                  28565.8  1992        1           2790500   
9   Chicago                  28848.9  1992        2           2791000   
10  Chicago                  29132.0  1992        3           2791500   
11  Chicago                  29415.1  1992        4           2792000   
12  Chicago                  29698.2  1993        1

In [11]:
# Predict the GDP of the 5 cities from 1990 to 2000 based on the national GDP and each city's population

import pandas as pd
# Show floats as whole numbers with thousands separators (notebook-wide option).
pd.set_option('display.float_format', '{:,.0f}'.format)

city_gdp_file = "datasets/5cities (1) (1).csv"
quarter_file = "datasets/final_combined_data.xlsx"

city_gdp_df = pd.read_csv(city_gdp_file)
quarter_df = pd.read_excel(quarter_file)

# Wide -> long: one row per (GeoFips, GeoName, Year), with the year columns
# turned into an integer 'Year' key and GeoName renamed to match 'City'.
city_gdp_long = (
    city_gdp_df
    .melt(id_vars=["GeoFips", "GeoName"], var_name="Year", value_name="City GDP")
    .astype({"Year": int})
    .rename(columns={"GeoName": "City"})
)

# Attach the city GDP to every quarterly row of the same city and year.
merged = pd.merge(quarter_df, city_gdp_long, how="left", on=["City", "Year"])
merged = merged.drop(columns=["GeoFips"])

merged.to_excel("datasets/test_with_city_gdp.xlsx", index=False)

import pandas as pd
from sklearn.linear_model import LinearRegression

city_gdp_file = "datasets/5cities (1) (1).csv"
quarter_file = "datasets/final_combined_data.xlsx"
test_file = "datasets/test_with_city_gdp.xlsx"

city_gdp_df = pd.read_csv(city_gdp_file)
quarter_df = pd.read_excel(quarter_file)
# NOTE(review): test_df is loaded but not referenced again in this cell.
test_df = pd.read_excel(test_file)

# Wide -> long city GDP table keyed by City and integer Year.
city_gdp_long = (
    city_gdp_df
    .melt(id_vars=["GeoFips", "GeoName"], var_name="Year", value_name="City GDP")
    .astype({"Year": int})
    .rename(columns={"GeoName": "City"})
)

merged = pd.merge(quarter_df, city_gdp_long, how="left", on=["City", "Year"])

# NOTE(review): 'National GDP_Billion' must already exist in
# final_combined_data.xlsx; the cells visible in this notebook do not appear
# to write a column with that name - confirm where it comes from.
feature_columns = ["Total Population", "National GDP_Billion"]

# Fit on the rows that have an observed city GDP.
train = merged.dropna(subset=["City GDP"])
X_train = train[feature_columns]
y_train = train["City GDP"]

model = LinearRegression()
model.fit(X_train, y_train)

# Replace City GDP with the model's prediction for every row before 2001.
before_2001 = merged["Year"] < 2001
predict_data = merged[before_2001]
X_pred = predict_data[feature_columns]
merged.loc[before_2001, "City GDP"] = model.predict(X_pred)
merged = merged.drop(columns=["GeoFips"])

merged.to_excel(test_file, index=False)

print(merged.head(10))





      City  Median Household Income  Year  Quarter  Total Population  \
0  Chicago                   26,301  1990        1           2784500   
1  Chicago                   26,584  1990        2           2785000   
2  Chicago                   26,867  1990        3           2785500   
3  Chicago                   27,150  1990        4           2786000   
4  Chicago                   27,433  1991        1           2787000   
5  Chicago                   27,716  1991        2           2788000   
6  Chicago                   28,000  1991        3           2789000   
7  Chicago                   28,283  1991        4           2790000   
8  Chicago                   28,566  1992        1           2790500   
9  Chicago                   28,849  1992        2           2791000   

   National GDP_Billion    City GDP  
0                 5,873  94,145,077  
1                 5,960  97,685,275  
2                 6,015  99,950,725  
3                 6,005  99,627,413  
4                