In [57]:
import pandas as pd

In [58]:
# Relative locations of the raw input CSV files, one per dataset.
# NOTE(review): "fasfood" looks like a misspelling of "fastfood" — confirm
# against the actual file name on disk before changing it.
fastfood = "data/fasfood_data.csv"
food_at_home = "data/foodathome.csv"
food_away_from_home = "data/food_away_from_home.csv"
gas_prices = "data/gas_prices.csv"
grocery_sale = "data/grocery_sale_data.csv"
grocery_prices = "data/groceryprices.csv"
unemployment = "data/UNRATE.csv"

In [59]:
# Read every raw CSV into its own DataFrame; the target names and the source
# paths are listed in the same order, so each frame pairs with its path.
(
    fastfood_df,
    food_at_home_df,
    food_away_from_home_df,
    gas_prices_df,
    grocery_sale_df,
    grocery_prices_df,
    unemployment_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        fastfood,
        food_at_home,
        food_away_from_home,
        gas_prices,
        grocery_sale,
        grocery_prices,
        unemployment,
    )
)



In [66]:
def transform_df_to_yearly(df, date_col='observation_date'):
    """Average every value column of ``df`` per calendar year.

    The input frame is left untouched: the year is derived from ``date_col``
    into a separate Series instead of being written back as a new column, so
    the function can be re-run on the same frame with identical results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a date column plus the value columns to average.
    date_col : str, default 'observation_date'
        Name of the column parsed with ``pd.to_datetime`` to obtain the year.

    Returns
    -------
    pandas.DataFrame
        One row per year, with a leading ``'Year'`` column followed by the
        per-year mean of each remaining column (``date_col`` is excluded).

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``df``.
    """
    # Derive the grouping key without mutating the caller's frame.
    years = pd.to_datetime(df[date_col]).dt.year.rename('Year')

    # Drop the date column *before* aggregating so the result does not depend
    # on whether groupby.mean() keeps datetime columns. A pre-existing 'Year'
    # column is dropped too, since the derived key replaces it.
    values = df.drop(columns=[date_col, 'Year'], errors='ignore')

    return values.groupby(years).mean().reset_index()

In [67]:
# Collapse each raw frame to yearly averages; targets and sources are listed
# in the same order, and the frames are processed in that order.
(
    fastfood_yearly,
    food_at_home_yearly,
    food_away_from_home_yearly,
    gas_prices_yearly,
    grocery_sale_yearly,
    grocery_prices_yearly,
    unemployment_yearly,
) = (
    transform_df_to_yearly(raw_frame)
    for raw_frame in (
        fastfood_df,
        food_at_home_df,
        food_away_from_home_df,
        gas_prices_df,
        grocery_sale_df,
        grocery_prices_df,
        unemployment_df,
    )
)

In [69]:
# Registry of the yearly frames, keyed by a short dataset label.
yearly_dfs = dict(
    fastfood=fastfood_yearly,
    food_at_home=food_at_home_yearly,
    food_away_from_home=food_away_from_home_yearly,
    gas_prices=gas_prices_yearly,
    grocery_sale=grocery_sale_yearly,
    grocery_prices=grocery_prices_yearly,
    unemployment=unemployment_yearly,
)

# Report the first and last year covered by each dataset.
for name, df in yearly_dfs.items():
    print(f"{name} yearly data: First Year - {df['Year'].min()}, Last Year - {df['Year'].max()}")

fastfood yearly data: First Year - 1992, Last Year - 2025
food_at_home yearly data: First Year - 1952, Last Year - 2025
food_away_from_home yearly data: First Year - 1953, Last Year - 2025
gas_prices yearly data: First Year - 1967, Last Year - 2025
grocery_sale yearly data: First Year - 1992, Last Year - 2025
grocery_prices yearly data: First Year - 1952, Last Year - 2025
unemployment yearly data: First Year - 1948, Last Year - 2025


In [70]:
def get_2013to2023_data(df, start_year=2013, end_year=2023):
    """Return the rows of ``df`` whose ``'Year'`` lies in an inclusive window.

    With the default arguments this keeps 2013 through 2023, matching the
    function's name; other windows can be requested via the optional bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Year'`` column.
    start_year : int, default 2013
        First year to keep (inclusive).
    end_year : int, default 2023
        Last year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        The matching rows, renumbered with a fresh 0-based index.
    """
    # Series.between is inclusive on both ends by default.
    in_window = df['Year'].between(start_year, end_year)
    return df[in_window].reset_index(drop=True)

# Apply get_2013to2023_data to every yearly frame, keeping the same labels
# in the same order as yearly_dfs.
subset_yearly_dfs = dict(
    zip(yearly_dfs, map(get_2013to2023_data, yearly_dfs.values()))
)

# Confirm the year range of each trimmed frame.
for name, df in subset_yearly_dfs.items():
    print(f"{name} subset data: First Year - {df['Year'].min()}, Last Year - {df['Year'].max()}")


fastfood subset data: First Year - 2013, Last Year - 2023
food_at_home subset data: First Year - 2013, Last Year - 2023
food_away_from_home subset data: First Year - 2013, Last Year - 2023
gas_prices subset data: First Year - 2013, Last Year - 2023
grocery_sale subset data: First Year - 2013, Last Year - 2023
grocery_prices subset data: First Year - 2013, Last Year - 2023
unemployment subset data: First Year - 2013, Last Year - 2023


In [72]:
# Put all DataFrames side by side for comparison, aligned on Year.
# Each frame is indexed by Year *before* concatenating so pandas matches rows
# by year value rather than by row position: a dataset that lacks a year then
# shows NaN for that year instead of silently shifting its values onto the
# wrong row.
frames_by_year = [
    subset_yearly_dfs[key].set_index('Year')
    for key in (
        'fastfood',
        'food_at_home',
        'food_away_from_home',
        'gas_prices',
        'grocery_sale',
        'grocery_prices',
        'unemployment',
    )
]
combined_df = pd.concat(frames_by_year, axis=1).sort_index().rename_axis('Year')

# When the same column label appears in more than one frame, keep only its
# first occurrence.
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

In [74]:
# Inspect which column labels remain after duplicate labels were removed.
combined_df.columns

Index(['RSFSDP', 'CUSR0000SAF11', 'CUSR0000SEFV', 'CUSR0000SETB01',
       'MRTSSM4451USS', 'UNRATE'],
      dtype='object')

In [76]:
# Descriptive titles for the series codes used as column labels
column_titles = dict([
    ('RSFSDP', 'Fast Food Sales ($)'),
    ('CUSR0000SAF11', 'Food at Home CPI'),
    ('CUSR0000SEFV', 'Food Away from Home CPI'),
    ('CUSR0000SETB01', 'Gasoline CPI'),
    ('MRTSSM4451USS', 'Grocery Store Sales ($)'),
    ('UNRATE', 'Unemployment Rate (%)'),
])

# Build a relabelled copy for readability; combined_df keeps the raw codes
combined_df_renamed = combined_df.rename(column_titles, axis='columns')
combined_df_renamed

Unnamed: 0_level_0,Fast Food Sales ($),Food at Home CPI,Food Away from Home CPI,Gasoline CPI,Grocery Store Sales ($),Unemployment Rate (%)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013,44330.0,233.854833,243.0685,302.379833,47689.75,7.358333
2014,47038.333333,239.449333,248.981083,290.277583,49831.583333,6.158333
2015,50900.166667,242.250083,256.100667,211.447167,51064.083333,5.275
2016,53512.916667,239.071417,262.695333,187.532,51881.416667,4.875
2017,56552.75,238.594083,268.825833,212.119917,54201.333333,4.358333
2018,59779.5,239.659333,275.893083,240.45725,55724.083333,3.891667
2019,63149.333333,241.76675,284.409667,231.8675,57602.25,3.675
2020,53534.75,250.212833,293.944333,194.981417,62868.583333,8.1
2021,68658.083333,258.8825,307.296083,264.053833,65824.75,5.358333
2022,79700.083333,288.457583,330.822,346.826417,71169.166667,3.641667


In [78]:
# Load the additional dataset and index it by Year in one expression, so it
# can be aligned with the Year-indexed combined_df_renamed.
other_data = 'data/combined_df.csv'
other_data_df = pd.read_csv(other_data).set_index('Year')

In [79]:
# Place the two Year-indexed frames side by side; with axis=1, pd.concat
# aligns rows on the index (outer join by default).
full_df = pd.concat([combined_df_renamed, other_data_df], axis=1)

In [81]:
# Write the merged dataset to disk; the index is written as the first CSV
# column (to_csv default).
# NOTE(review): to_csv does not create missing directories — 'final_data/'
# must already exist.
full_df.to_csv('final_data/transformed_dataset.csv')