In [55]:
import sys
import os
import pandas as pd

# Make the project's 'src' directory (a sibling of the current working
# directory's parent) importable so the local data_collection module resolves.
src_path = os.path.join(os.getcwd(), '..', 'src')
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path every time.
if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local helper used below to download a series from FRED.
from data_collection import fetch_data_from_fred

# FRED API key. Prefer the FRED_API_KEY environment variable so the key does
# not have to live in the notebook; the literal is kept only as a fallback so
# existing runs keep working.
# NOTE(review): a key committed to source should be treated as leaked --
# rotate it and drop the fallback once the environment variable is in use.
api_key = os.environ.get('FRED_API_KEY') or 'e916710d165717e6348556cdce8111f3'

# Indicators to collect: human-readable name (used in output file names)
# mapped to the FRED series ID passed to fetch_data_from_fred.
series_ids = {
    'Unemployment_Rate': 'UNRATE',
    'GDP_Growth': 'A191RL1Q225SBEA',
    'CPI': 'CPIAUCSL',
    'Interest_Rate': 'FEDFUNDS',
    'Money_Supply': 'M2SL',
    'PPI': 'PPIACO',
    'Consumer_Confidence': 'UMCSENT'
}

# Directory the processed CSV files are written to.
output_dir = 'C:/Users/ghkjs/OneDrive/바탕 화면/EconomicIndicatorPrediction/data/processed'

# Pass 1: download every series once, remember it, and record the date range
# it covers. Keeping the frames avoids a second round of API calls in pass 2.
series_frames = {}
starting_dates = []
ending_dates = []

for name, series_id in series_ids.items():
    # Fetch the data
    data = fetch_data_from_fred(series_id, api_key)

    # Ensure the index is datetime
    data.index = pd.to_datetime(data.index)

    # Print the columns to check
    print(f"Data columns for {name}: {data.columns}")

    # Record the first and last date
    starting_dates.append(data.index.min())
    ending_dates.append(data.index.max())

    series_frames[name] = data

# The window common to all series runs from the latest first observation to
# the earliest last observation.
latest_start_date = max(starting_dates)
earliest_end_date = min(ending_dates)
print(f"Latest starting date among all series: {latest_start_date.date()}")
print(f"Earliest ending date among all series: {earliest_end_date.date()}")

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Pass 2: trim each series to the common window, resample to month-end,
# fill gaps, and save the result as a CSV file.
for name, series_id in series_ids.items():
    # Work on a copy so the cached frame from pass 1 is never mutated.
    data = series_frames[name].copy()

    if name == 'CPI':
        # Replace the CPI level with its 12-period percentage change. This is
        # done before trimming so observations preceding the window can serve
        # as the comparison base.
        data[series_id] = data[series_id].pct_change(periods=12)*100

    # Trim data to the date range
    in_window = (data.index >= latest_start_date) & (data.index <= earliest_end_date)
    data = data[in_window]

    # Resample to month-end frequency; months without observations become NaN
    # (presumably the case for lower-frequency series such as GDP growth).
    data_monthly = data.resample('ME').mean()

    # Display the dataset to check for missing values
    print("Before interpolation:")
    print(data_monthly.isnull().sum())

    # Linear interpolation fills interior gaps; the backward fill then covers
    # any leading NaNs that interpolation leaves untouched.
    data_interpolated = data_monthly.interpolate(method='linear')
    data_interpolated = data_interpolated.bfill()

    # Display the dataset again to confirm missing values are filled
    print("\nAfter interpolation:")
    print(data_interpolated.isnull().sum())

    # Save the file as csv
    data_interpolated_csv_file_path = os.path.join(output_dir, f'{name}_linear_interpolation.csv')
    data_interpolated.to_csv(data_interpolated_csv_file_path, index=True)
    print(f'Data saved to {data_interpolated_csv_file_path}')
    print(data_interpolated.head())
    print(data_interpolated.tail())




Data columns for Unemployment_Rate: Index(['UNRATE'], dtype='object')
Data columns for GDP_Growth: Index(['A191RL1Q225SBEA'], dtype='object')
Data columns for CPI: Index(['CPIAUCSL'], dtype='object')
Data columns for Interest_Rate: Index(['FEDFUNDS'], dtype='object')
Data columns for Money_Supply: Index(['M2SL'], dtype='object')
Data columns for PPI: Index(['PPIACO'], dtype='object')
Data columns for Consumer_Confidence: Index(['UMCSENT'], dtype='object')
Latest starting date among all series: 1959-01-01
Earliest ending date among all series: 2024-07-01
Before interpolation:
UNRATE    0
dtype: int64

After interpolation:
UNRATE    0
dtype: int64
Data saved to C:/Users/ghkjs/OneDrive/바탕 화면/EconomicIndicatorPrediction/data/processed\Unemployment_Rate_linear_interpolation.csv
            UNRATE
1959-01-31     6.0
1959-02-28     5.9
1959-03-31     5.6
1959-04-30     5.2
1959-05-31     5.1
            UNRATE
2024-03-31     3.8
2024-04-30     3.9
2024-05-31     4.0
2024-06-30     4.1
2024-07