In [3]:
import pandas as pd
import os

# Root folder of the dataset; the metadata CSV and the "stocks"/"ETFs"
# sub-folders are all resolved relative to it.
DATASET_PATH = "dataset"

# Read the symbol metadata table into a DataFrame.
meta_file = os.path.join(DATASET_PATH, "symbols_valid_meta.csv")
meta_df = pd.read_csv(filepath_or_buffer=meta_file)

# Function to clean stock data
def clean_stock_data(file_path):
    """Load one price CSV and coerce its columns to usable dtypes.

    The "Date" column is parsed to datetime and the price/volume columns
    are converted to numbers. Values that cannot be parsed become
    NaT/NaN instead of raising. Rows whose date could not be parsed are
    dropped; rows with unparseable numeric values are kept (as NaN).

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the
            CSV file to load. The file must contain a "Date" column.

    Returns:
        A DataFrame with the cleaned contents of the file.

    Raises:
        KeyError: If the file has no "Date" column.
    """
    df = pd.read_csv(file_path)

    # Convert Date column to datetime; unparseable entries become NaT
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Ensure numerical columns are in the correct format. Only the columns
    # actually present are converted, so a file that lacks one of them
    # (e.g. no "Adj Close") is still cleaned instead of raising KeyError.
    expected_numeric = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
    numeric_cols = [col for col in expected_numeric if col in df.columns]
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing Date
    df = df.dropna(subset=["Date"])

    return df

# Process all stock and ETF files
stocks_path = os.path.join(DATASET_PATH, "stocks")
etfs_path = os.path.join(DATASET_PATH, "ETFs")

# Maps each CSV file name (extension included) to its cleaned DataFrame.
# NOTE: the key is the bare file name, so a file name that appears in both
# folders keeps only the version read last (the ETFs folder is read second).
cleaned_data = {}

for folder in [stocks_path, etfs_path]:
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            file_path = os.path.join(folder, file)
            df_cleaned = clean_stock_data(file_path)
            cleaned_data[file] = df_cleaned

# Example: Display cleaned data for a sample stock.
# The sample is the first key inserted; which file that is depends on the
# order returned by os.listdir. When no CSV was found, report it instead
# of failing with an IndexError on an empty key list.
ticker_sample = next(iter(cleaned_data), None)
if ticker_sample is None:
    print(f"No CSV files found in {stocks_path} or {etfs_path}")
else:
    print(cleaned_data[ticker_sample].head())
    print(cleaned_data[ticker_sample].describe())

        Date       Open       High        Low      Close  Adj Close  Volume
0 2016-01-08  19.162428  19.162428  19.114038  19.114038  11.411459   10300
1 2016-01-11  19.133394  19.230173  19.017258  19.017258  11.353675   49600
2 2016-01-12  19.113070  19.113070  19.103392  19.103392  11.405104    2300
3 2016-01-13  19.103392  19.103392  19.103392  19.103392  11.405104       0
4 2016-01-14  19.065647  19.385021  18.968868  19.104361  11.405683   26100
              Open         High          Low        Close    Adj Close  \
count  1065.000000  1065.000000  1065.000000  1065.000000  1065.000000   
mean     18.249711    18.371465    18.103308    18.236492    14.059059   
std       1.741026     1.721080     1.769183     1.737220     1.713009   
min       9.260000    10.550000     8.650000     9.570000     9.262057   
25%      16.770000    16.863323    16.666666    16.759096    12.676388   
50%      18.544971    18.688206    18.377581    18.533358    14.464704   
75%      19.404377    19.4