In [1]:
import pandas as pd

# Read the raw CSV and clean it in one chain:
#   1. drop rows that are exact duplicates of an earlier row
#   2. drop the 'Change' column
#   3. turn the '---' placeholder into a missing value, then drop every row
#      that contains any missing value
#   4. parse the month/day/year text in 'Date' into datetime values
# NOTE(review): duplicates are compared across all columns; if the file carries
# a unique row-counter column, step 1 removes nothing -- confirm this is intended.
df = (
    pd.read_csv('../vn_index/hose_historical_data.csv')
    .drop_duplicates()
    .drop(columns=['Change'])
    .replace('---', pd.NA)
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%Y'))
)


In [2]:
# Show the frame as it stands at this point; the rendered output elides the middle rows
df

Unnamed: 0,Index,Date,VN-INDEX,Total Volume,Total Value,Total Foreigner Buy Volume,Total Foreigner Buy Value,Total Foreigner Sell Volume,Total Foreigner Sell Value
0,0,2025-04-03,1229.84,1763077400,"39,630.15 bil",65698308,"2,039.66 bil",198638382,"5,734.34 bil"
1,1,2025-04-02,1317.83,849307834,"19,296.71 bil",66044032,"1,957.07 bil",81823582,"2,665.49 bil"
2,2,2025-04-01,1317.33,620033587,"15,025.64 bil",52789640,"1,876.93 bil",64539449,"2,316.52 bil"
3,3,2025-03-31,1306.86,867812434,"21,205.64 bil",53798688,"1,684.46 bil",89320381,"2,966.36 bil"
4,4,2025-03-28,1317.46,739652036,"17,084.16 bil",62460716,"1,650.07 bil",62678442,"2,056.68 bil"
...,...,...,...,...,...,...,...,...,...
4542,4542,2007-01-08,825.11,8911220,"1,088.81 bil",525000,49.90 bil,10300,1.55 bil
4543,4543,2007-01-05,818.51,8119454,972.26 bil,663600,66.71 bil,10800,470.54 mil
4544,4544,2007-01-04,788.82,5998810,652.08 bil,123300,6.17 bil,8000,1.28 bil
4545,4545,2007-01-03,757.71,4485494,400.70 bil,8000,454.25 mil,22000,2.55 bil


In [3]:
# Function to convert values to numeric, handling 'mil', 'bil', and commas
def convert_to_numeric(value):
    """Convert a raw table cell to a float.

    Strings may contain thousands separators (``,``), surrounding whitespace
    and a trailing ``mil`` / ``bil`` unit (any letter case, with or without a
    space before it), which scales the number by 1e6 / 1e9 respectively.
    Non-string values are handed to ``float`` as they are.

    Args:
        value: A string such as ``'1,088.81 bil'`` or ``'8000'``, a number,
            or a missing-value marker.

    Returns:
        The numeric value as a float. ``None``, ``pd.NA``, NaN and empty or
        whitespace-only strings yield ``float('nan')``.

    Raises:
        ValueError: If the text left after cleaning is not a valid number.
    """
    # float() raises TypeError on these markers, so map them to NaN explicitly.
    if value is None or value is pd.NA:
        return float('nan')

    if isinstance(value, str):
        text = value.replace(',', '').strip().lower()
        if not text:
            return float('nan')
        # Match the unit only at the end of the text, not anywhere inside it.
        for suffix, factor in (('bil', 1e9), ('mil', 1e6)):
            if text.endswith(suffix):
                # float() tolerates the space left between number and unit.
                return float(text[:-len(suffix)]) * factor
        return float(text)

    return float(value)


In [4]:
# Columns that are run through convert_to_numeric
columns_to_convert = [
    'Total Volume',
    'Total Value',
    'Total Foreigner Buy Volume',
    'Total Foreigner Buy Value',
    'Total Foreigner Sell Volume',
    'Total Foreigner Sell Value',
]

# Replace each listed column with its element-wise converted values
for col in columns_to_convert:
    df[col] = df[col].map(convert_to_numeric)

In [5]:
# Rename the index-level column, cast it to float and order rows by date.
# The rename comes first on purpose: renaming a label that is no longer present
# is silently ignored, so this cell can be re-executed without a KeyError on
# 'VN-INDEX' (the cast then simply targets the already-renamed column).
df = (
    df.rename(columns={'VN-INDEX': 'VN_Index_Close'})
    .astype({'VN_Index_Close': float})
    .sort_values('Date')
)

# Save the cleaned data to a new CSV file (without the row labels)
df.to_csv('../ready_data/cleaned_hose_historical_data.csv', index=False)

print("Data preprocessing completed and saved to 'cleaned_hose_historical_data.csv'")

Data preprocessing completed and saved to 'cleaned_hose_historical_data.csv'
