In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# 📂 Define the folder containing all CSV files
data_folder = "../vn_30_list"

# ✅ Function to preprocess a single CSV file
def preprocess_file(file_path):
    try:
        # Read CSV, skipping first two rows to fix column names issue
        df = pd.read_csv(file_path)

        # Ensure 'Date' is parsed correctly
        df['Date'] = pd.to_datetime(df['Date'])
        df['Close'] = df['Close'].astype(float)

        df = df[['Date', 'Close']]

        # Rename columns: Add file name as prefix to avoid conflicts
        file_name = os.path.basename(file_path).replace("_historical_data.csv", "")
        df.rename(columns={'Close': f"{file_name}_Close"}, inplace=True)

        print(f"✅ Processed: {file_name}")
        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

# ✅ Read all CSV files in the folder
all_dataframes = []
for file in os.listdir(data_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(data_folder, file)
        df = preprocess_file(file_path)
        if df is not None:
            all_dataframes.append(df)

# ✅ Merge all data on Date
if all_dataframes:
    merged_df = all_dataframes[0]
    for df in all_dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on="Date", how="inner")

    # ✅ Handle missing values (Forward Fill & Backward Fill)
    merged_df.fillna(method='ffill', inplace=True)  # Forward fill
    merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)

    merged_df = merged_df.sort_values(by='Date')

    # ✅ Save preprocessed data
    merged_df.to_csv("../ready_data/cleaned_vn30_list_data.csv", index=False)
    print("🎉 Preprocessing complete! Data saved to `cleaned_vn30_list_data.csv`")
else:
    print("⚠️ No valid CSV files found in the folder.")

✅ Processed: VRE
✅ Processed: VIB
✅ Processed: STB
✅ Processed: VIC
✅ Processed: VCB
✅ Processed: MSN
✅ Processed: MWG
✅ Processed: PLX
✅ Processed: TCB
✅ Processed: FPT
✅ Processed: VHM
✅ Processed: HDB
✅ Processed: HPG
✅ Processed: ACB
✅ Processed: MBB
✅ Processed: SAB
✅ Processed: NVL
✅ Processed: PNJ
✅ Processed: SHB
✅ Processed: VPB
✅ Processed: POW
✅ Processed: TPB
✅ Processed: GAS
✅ Processed: CTG
✅ Processed: VNM
✅ Processed: PDR
✅ Processed: SSI
✅ Processed: BVH
✅ Processed: VJC
✅ Processed: BID
🎉 Preprocessing complete! Data saved to `cleaned_vn30_list_data.csv`


  merged_df.fillna(method='ffill', inplace=True)  # Forward fill
  merged_df.fillna(method='bfill', inplace=True)  # Backward fill (if needed)
