In [5]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# 📂 Define the folder containing all CSV files
data_folder = "../external_data"

# ✅ Function to preprocess a single CSV file
def preprocess_file(file_path):
    try:
        # Read CSV, skipping first two rows to fix column names issue
        df = pd.read_csv(file_path, skiprows=2)

        # Check if 'Unnamed: 1' exists and rename it to 'Close'
        if 'Unnamed: 1' in df.columns:
            df.rename(columns={'Unnamed: 1': 'Close'}, inplace=True)

        df.rename(columns={df.columns[-1]: 'Volume'}, inplace=True)


        # Ensure 'Date' is parsed correctly
        df['Date'] = pd.to_datetime(df['Date'])
        
        df = df[['Date', 'Close', 'Volume']]

        # Rename columns: Add file name as prefix to avoid conflicts
        file_name = os.path.basename(file_path).replace("_historical_data.csv", "")
        df.rename(columns={'Close': f"{file_name}_Close"}, inplace=True)
        df.rename(columns={'Volume': f"{file_name}_Volume"}, inplace=True)
        
        if df.shape[0] < 4500:
            print(f"INSUFFICIENT DATA FOR {file_name}")
            return None

        print(f"✅ Processed: {file_name}")
        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

# ✅ Read all CSV files in the folder
all_dataframes = []
for file in os.listdir(data_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(data_folder, file)
        df = preprocess_file(file_path)
        if df is not None:
            all_dataframes.append(df)

# ✅ Merge all data on Date
if all_dataframes:
    merged_df = all_dataframes[0]
    for df in all_dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on="Date", how="outer")

    # ✅ Handle missing values (Forward Fill & Backward Fill)
    merged_df.fillna(method='ffill', inplace=True)  # Forward fill
    merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)

    merged_df = merged_df.sort_values(by='Date')

    # ✅ Save preprocessed data
    merged_df.to_csv("../ready_data/cleaned_external_data.csv", index=False)
    print("🎉 Preprocessing complete! Data saved to `cleaned_external_data.csv`")
else:
    print("⚠️ No valid CSV files found in the folder.")


✅ Processed: Copper
✅ Processed: Taiwan_Weighted_Index
✅ Processed: WTI_Crude_Oil
✅ Processed: LNG
✅ Processed: KOSPI
✅ Processed: S&P_500
✅ Processed: US_10Y_Treasury_Yield
✅ Processed: Hang_Seng_Index
✅ Processed: NASDAQ
✅ Processed: DJIA
INSUFFICIENT DATA FOR CNYVND
✅ Processed: USDVND
✅ Processed: EURVND
INSUFFICIENT DATA FOR Aluminum
✅ Processed: Gold
✅ Processed: FTSE_100
✅ Processed: Shanghai
INSUFFICIENT DATA FOR Brent_Crude_Oil
INSUFFICIENT DATA FOR JPYVND
🎉 Preprocessing complete! Data saved to `cleaned_external_data.csv`


  merged_df.fillna(method='ffill', inplace=True)  # Forward fill
  merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)


In [6]:
merged_df

Unnamed: 0,Date,Copper_Close,Copper_Volume,Taiwan_Weighted_Index_Close,Taiwan_Weighted_Index_Volume,WTI_Crude_Oil_Close,WTI_Crude_Oil_Volume,LNG_Close,LNG_Volume,KOSPI_Close,...,USDVND_Close,USDVND_Volume,EURVND_Close,EURVND_Volume,Gold_Close,Gold_Volume,FTSE_100_Close,FTSE_100_Volume,Shanghai_Close,Shanghai_Volume
6191,2000-07-28,4.8415,9242.0,8122.072266,0.0,62.349998,114179.0,3.120,1293.0,692.650024,...,25985.0,0.0,29472.0,0.0,3315.800049,140440.0,6335.700195,9.470970e+08,2012.792969,0.000000e+00
6192,2000-07-31,4.8415,9242.0,8114.882324,0.0,62.349998,114179.0,3.120,1293.0,705.969971,...,25985.0,0.0,29472.0,0.0,3315.800049,140440.0,6365.299805,7.172740e+08,2023.538940,0.000000e+00
6193,2000-08-01,4.8415,9242.0,7984.612305,0.0,62.349998,114179.0,3.120,1293.0,727.099976,...,25985.0,0.0,29472.0,0.0,3315.800049,140440.0,6379.399902,6.537430e+08,2028.151001,0.000000e+00
6194,2000-08-02,4.8415,9242.0,7916.812988,0.0,62.349998,114179.0,3.120,1293.0,728.330017,...,25985.0,0.0,29472.0,0.0,3315.800049,140440.0,6391.299805,1.028551e+09,2030.682983,0.000000e+00
6195,2000-08-03,4.8415,9242.0,7844.894043,0.0,62.349998,114179.0,3.120,1293.0,722.080017,...,25985.0,0.0,29472.0,0.0,3315.800049,140440.0,6317.100098,9.942390e+08,2036.301025,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6186,2025-04-22,4.8705,516.0,18793.429688,2526000.0,64.309998,297928.0,3.007,133976.0,2486.639893,...,25871.0,0.0,29009.0,0.0,3400.800049,785.0,8328.599609,1.042190e+09,3299.757080,4.032000e+05
6187,2025-04-23,4.8355,714.0,19639.140625,2694600.0,62.270000,397841.0,3.022,86140.0,2525.560059,...,25932.0,0.0,29456.0,0.0,3276.300049,331.0,8403.200195,1.043916e+09,3296.354980,4.116000e+05
6188,2025-04-24,4.8500,1626.0,19478.810547,2418200.0,62.790001,264908.0,2.930,66984.0,2522.330078,...,25965.0,0.0,29247.0,0.0,3332.000000,560.0,8407.400391,1.126606e+09,3297.288086,3.928000e+05
6189,2025-04-25,4.8355,1626.0,19872.730469,2674500.0,63.020000,264908.0,2.937,66984.0,2546.300049,...,26026.0,0.0,29249.0,0.0,3282.399902,560.0,8415.299805,8.027340e+08,3295.060059,4.110000e+05
