In [9]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# 📂 Define the folder containing all CSV files
data_folder = "../external_data"

# ✅ Function to preprocess a single CSV file
def preprocess_file(file_path):
    try:
        # Read CSV, skipping first two rows to fix column names issue
        df = pd.read_csv(file_path, skiprows=2)

        # Check if 'Unnamed: 1' exists and rename it to 'Close'
        if 'Unnamed: 1' in df.columns:
            df.rename(columns={'Unnamed: 1': 'Close'}, inplace=True)

        df.rename(columns={df.columns[-1]: 'Volume'}, inplace=True)


        # Ensure 'Date' is parsed correctly
        df['Date'] = pd.to_datetime(df['Date'])
        
        df = df[['Date', 'Close', 'Volume']]

        # Rename columns: Add file name as prefix to avoid conflicts
        file_name = os.path.basename(file_path).replace("_historical_data.csv", "")
        df.rename(columns={'Close': f"{file_name}_Close"}, inplace=True)
        df.rename(columns={'Volume': f"{file_name}_Volume"}, inplace=True)
        
        if df.shape[0] < 4500:
            print(f"INSUFFICIENT DATA FOR {file_name}")
            return None

        print(f"✅ Processed: {file_name}")
        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

# ✅ Read all CSV files in the folder
all_dataframes = []
for file in os.listdir(data_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(data_folder, file)
        df = preprocess_file(file_path)
        if df is not None:
            all_dataframes.append(df)

# ✅ Merge all data on Date
if all_dataframes:
    merged_df = all_dataframes[0]
    for df in all_dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on="Date", how="outer")

    # ✅ Handle missing values (Forward Fill & Backward Fill)
    merged_df.fillna(method='ffill', inplace=True)  # Forward fill
    merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)

    merged_df = merged_df.sort_values(by='Date')

    # ✅ Save preprocessed data
    merged_df.to_csv("../ready_data/cleaned_external_data.csv", index=False)
    print("🎉 Preprocessing complete! Data saved to `cleaned_external_data.csv`")
else:
    print("⚠️ No valid CSV files found in the folder.")


✅ Processed: Copper
✅ Processed: Taiwan_Weighted_Index
✅ Processed: WTI_Crude_Oil
✅ Processed: LNG
✅ Processed: KOSPI
✅ Processed: S&P_500
✅ Processed: US_10Y_Treasury_Yield
✅ Processed: Hang_Seng_Index
✅ Processed: NASDAQ
✅ Processed: DJIA
INSUFFICIENT DATA FOR CNYVND
✅ Processed: USDVND
✅ Processed: EURVND
INSUFFICIENT DATA FOR Aluminum
✅ Processed: Gold
✅ Processed: FTSE_100
✅ Processed: Shanghai
INSUFFICIENT DATA FOR Brent_Crude_Oil
INSUFFICIENT DATA FOR JPYVND
🎉 Preprocessing complete! Data saved to `cleaned_external_data.csv`


  merged_df.fillna(method='ffill', inplace=True)  # Forward fill
  merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)


In [10]:
merged_df

Unnamed: 0,Date,Copper_Close,Copper_Volume,Taiwan_Weighted_Index_Close,Taiwan_Weighted_Index_Volume,WTI_Crude_Oil_Close,WTI_Crude_Oil_Volume,LNG_Close,LNG_Volume,KOSPI_Close,...,USDVND_Close,USDVND_Volume,EURVND_Close,EURVND_Volume,Gold_Close,Gold_Volume,FTSE_100_Close,FTSE_100_Volume,Shanghai_Close,Shanghai_Volume
6175,2000-07-28,4.8625,41206.0,8122.072266,0.0,66.550003,202246.0,4.166,46698.0,692.650024,...,25790.0,0.0,28696.0,0.0,3052.600098,3901.0,6335.700195,9.470970e+08,2012.792969,0.000000e+00
6176,2000-07-31,4.8625,41206.0,8114.882324,0.0,66.550003,202246.0,4.166,46698.0,705.969971,...,25790.0,0.0,28696.0,0.0,3052.600098,3901.0,6365.299805,7.172740e+08,2023.538940,0.000000e+00
6177,2000-08-01,4.8625,41206.0,7984.612305,0.0,66.550003,202246.0,4.166,46698.0,727.099976,...,25790.0,0.0,28696.0,0.0,3052.600098,3901.0,6379.399902,6.537430e+08,2028.151001,0.000000e+00
6178,2000-08-02,4.8625,41206.0,7916.812988,0.0,66.550003,202246.0,4.166,46698.0,728.330017,...,25790.0,0.0,28696.0,0.0,3052.600098,3901.0,6391.299805,1.028551e+09,2030.682983,0.000000e+00
6179,2000-08-03,4.8625,41206.0,7844.894043,0.0,66.550003,202246.0,4.166,46698.0,722.080017,...,25790.0,0.0,28696.0,0.0,3052.600098,3901.0,6317.100098,9.942390e+08,2036.301025,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6170,2025-03-28,5.1125,1395.0,21602.890625,3242700.0,69.360001,246650.0,4.065,176361.0,2557.979980,...,25560.0,0.0,27188.0,0.0,3086.500000,31206.0,8658.900391,7.475458e+08,3351.306885,4.123000e+05
6171,2025-03-31,5.0200,720.0,20695.900391,4034100.0,71.480003,313087.0,4.119,171857.0,2481.120117,...,25550.0,0.0,27149.0,0.0,3122.800049,3438.0,8582.799805,9.250750e+08,3335.746094,4.726000e+05
6172,2025-04-01,5.0190,838.0,21280.169922,2753300.0,71.199997,272832.0,3.951,156694.0,2521.389893,...,25565.0,0.0,27279.0,0.0,3118.899902,1721.0,8634.799805,6.824101e+08,3348.435059,4.226000e+05
6173,2025-04-02,5.0235,838.0,21298.220703,2300200.0,71.709999,272832.0,4.055,156694.0,2505.860107,...,25620.0,0.0,27281.0,0.0,3139.899902,1721.0,8608.500000,6.014178e+08,3350.126953,3.501000e+05
