In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the raw VN30 export
df = pd.read_csv("../vn_30/vn_30_historical_data.csv")  # Update path if needed

# Convert 'Date' to datetime (values are written as day/month/year)
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Convert price columns to float.
# A column is only read as text when at least one value carries a thousands
# separator (e.g. "1,283.18"); otherwise pandas has already parsed it as a
# number and the `.str` accessor would raise AttributeError. Strip commas only
# from text columns, then cast everything to float.
numeric_cols = ['Price', 'Open', 'High', 'Low']
for col in numeric_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].str.replace(',', '', regex=False)
    df[col] = df[col].astype(float)

# Rename columns to the names used by the rest of the notebook
df = df.rename(columns={
    'Price': 'VN_30_Close',
    'Vol.': 'Volume',
})

# Parse volume values like '557.83M', '107.03K', '1.2B' or a plain '12345'
def parse_volume(vol_str):
    """Convert an abbreviated volume value into a float number of units.

    Parameters
    ----------
    vol_str : str or number
        A volume such as ``'557.83M'``, ``'107.03K'``, ``'1.2B'`` or
        ``'12,345'``. Thousands separators, surrounding whitespace and the
        letter case of the suffix are ignored. Values that are already
        numeric are passed through as floats.

    Returns
    -------
    float
        The volume scaled by its suffix (K = 1e3, M = 1e6, B = 1e9), or
        ``np.nan`` when the value is missing or cannot be interpreted.
    """
    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    if isinstance(vol_str, str):
        text = vol_str.replace(',', '').strip().upper()
        if not text:
            return np.nan

        # A trailing letter selects the scale; no suffix means plain units.
        scale = multipliers.get(text[-1])
        if scale is None:
            scale = 1
        else:
            text = text[:-1]

        try:
            value = float(text) * scale
        except ValueError:
            # Placeholders such as '-' are treated as missing instead of
            # aborting the whole column conversion.
            return np.nan
        # float() accepts 'inf'/'nan' spellings; neither is a usable volume.
        return value if np.isfinite(value) else np.nan

    # Already-numeric cells (a column pandas parsed as numbers) keep their
    # value. bool is excluded because it is a subclass of int.
    if isinstance(vol_str, (int, float, np.integer, np.floating)) and not isinstance(vol_str, bool):
        return float(vol_str)

    return np.nan

# Turn the abbreviated volume strings into plain numbers
df['Volume'] = df['Volume'].apply(parse_volume)

# Remaining clean-up as a single pipeline:
#   - discard the 'Change %' column when it is present
#   - discard rows containing any NaN
#   - discard exact duplicate rows
#   - order chronologically and renumber the index from 0
df = (
    df.drop(columns=['Change %'], errors='ignore')
    .dropna()
    .drop_duplicates()
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# (Optional) Save cleaned data
# NOTE(review): the slice leaves the final (most recent) row out of the saved
# file — confirm this is intentional.
df.iloc[:-1].to_csv("../ready_data/cleaned_vn_30_data.csv", index=False)

print("✅ VN30 historical data cleaned and saved to 'cleaned_vn_30_data.csv'")


✅ VN30 historical data cleaned and saved to 'cleaned_vn_30_data.csv'


In [22]:
# Show the cleaned in-memory frame (the notebook renders a truncated preview).
# Note: this frame still contains the last row, which the save step above
# excludes from the CSV via its `[:-1]` slice.
df

Unnamed: 0,Date,VN_30_Close,Open,High,Low,Volume
0,2012-06-18,510.81,509.62,514.55,508.64,22570.0
1,2012-06-19,506.67,508.37,510.60,505.02,12690.0
2,2012-06-20,508.23,507.23,510.37,504.02,10660.0
3,2012-06-21,504.71,504.85,507.84,504.52,13240.0
4,2012-06-22,500.34,503.97,504.21,499.40,17350.0
...,...,...,...,...,...,...
3194,2025-04-03,1283.18,1342.46,1342.46,1282.99,810570000.0
3195,2025-04-04,1280.52,1269.06,1280.52,1208.42,867530000.0
3196,2025-04-08,1197.51,1241.58,1245.51,1193.47,557830000.0
3197,2025-04-09,1168.68,1182.91,1220.73,1133.90,667410000.0
