In [1]:
import pandas as pd

# Load the raw export and run the initial cleaning as a single pipeline:
#   1. drop exact duplicate rows,
#   2. drop the 'Change' column,
#   3. turn the '---' placeholder into a missing value and drop every row
#      that has any missing value,
#   4. parse 'Date' (month/day/year text) into datetimes.
df = (
    pd.read_csv('../vn_index/hose_historical_data.csv')
    .drop_duplicates()
    .drop(columns=['Change'])
    .replace('---', pd.NA)
    .dropna()
    # assign() works on a fresh copy, so the parsed dates are not written
    # onto a filtered frame.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%Y'))
)


In [2]:
# Preview the frame after the initial cleaning; the rendered output elides the middle rows.
df

Unnamed: 0,Index,Date,VN-INDEX,Total Volume,Total Value,Total Foreigner Buy Volume,Total Foreigner Buy Value,Total Foreigner Sell Volume,Total Foreigner Sell Value
0,0,2025-03-20,1318.08,521709978,"10,325.15 bil",42918037,928.85 bil,55366790,"1,707.19 bil"
1,1,2025-03-19,1324.63,949333600,"23,455.44 bil",70411951,"2,322.35 bil",88499930,"3,727.96 bil"
2,2,2025-03-18,1330.97,903370731,"19,697.88 bil",62962966,"1,861.51 bil",60959886,"2,294.32 bil"
3,3,2025-03-17,1336.26,971316343,"20,674.91 bil",65441999,"1,941.25 bil",48772000,"1,737.13 bil"
4,4,2025-03-14,1326.15,1023002500,"23,043.57 bil",109020574,"2,724.43 bil",76904163,"2,950.41 bil"
...,...,...,...,...,...,...,...,...,...
4532,4532,2007-01-08,825.11,8911220,"1,088.81 bil",525000,49.90 bil,10300,1.55 bil
4533,4533,2007-01-05,818.51,8119454,972.26 bil,663600,66.71 bil,10800,470.54 mil
4534,4534,2007-01-04,788.82,5998810,652.08 bil,123300,6.17 bil,8000,1.28 bil
4535,4535,2007-01-03,757.71,4485494,400.70 bil,8000,454.25 mil,22000,2.55 bil


In [3]:
def convert_to_numeric(value):
    """Convert a raw cell value to a float.

    Strings have their thousands separators (commas) removed first. If the
    text then contains ' bil' the number is multiplied by 1e9; otherwise, if
    it contains ' mil', it is multiplied by 1e6. Any other string, and any
    non-string value, is passed straight to ``float``.

    Parameters
    ----------
    value : str or number
        Raw value, e.g. ``"1,088.81 bil"``, ``"470.54 mil"`` or ``8911220``.

    Returns
    -------
    float
        The numeric value with the unit suffix expanded.

    Raises
    ------
    ValueError
        If the text left after stripping commas and the suffix is not a
        valid float literal.
    """
    # Non-string input needs no text clean-up.
    if not isinstance(value, str):
        return float(value)

    cleaned = value.replace(',', '')  # strip thousands separators

    # ' bil' is checked before ' mil', matching the original precedence.
    for suffix, multiplier in ((' bil', 1e9), (' mil', 1e6)):
        if suffix in cleaned:
            return float(cleaned.replace(suffix, '')) * multiplier

    return float(cleaned)


In [4]:
# Columns whose raw values carry thousands separators and/or 'mil'/'bil'
# suffixes and therefore go through convert_to_numeric.
columns_to_convert = [
    'Total Volume',
    'Total Value',
    'Total Foreigner Buy Volume',
    'Total Foreigner Buy Value',
    'Total Foreigner Sell Volume',
    'Total Foreigner Sell Value',
]

# Convert every listed column element-wise in one assignment.
df[columns_to_convert] = df[columns_to_convert].apply(
    lambda column: column.apply(convert_to_numeric)
)

In [5]:
# Final shaping: cast the index level to float, give it a snake_case name,
# and order the rows by 'Date' ascending.
df = (
    df.astype({'VN-INDEX': float})
    .rename(columns={'VN-INDEX': 'VN_Index_Close'})
    .sort_values('Date')
)

# Write the cleaned frame to disk without the positional index.
df.to_csv('../ready_data/cleaned_hose_historical_data.csv', index=False)

print("Data preprocessing completed and saved to 'cleaned_hose_historical_data.csv'")

Data preprocessing completed and saved to 'cleaned_hose_historical_data.csv'
