In [1]:
import os
import pandas as pd

In [2]:
def check_and_clean_csv(file_path, expected_interval='15min'):
    print(f"\nChecking {file_path}")
    try:
        df = pd.read_csv(file_path, parse_dates=['timestamp'])
    except Exception as e:
        print(f"Failed to load the file: {e}")
        return

    required_columns = {"timestamp", "open", "high", "low", "close", "volume", "turnover"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Missing required columns: {missing_columns}")
        return

    if df.empty:
        print("The file is empty.")
        return

    # הסרת כפילויות לפי timestamp
    dup_ts = df["timestamp"].duplicated()
    if dup_ts.any():
        print(f"Removed {dup_ts.sum()} duplicate timestamps.")
        df = df[~dup_ts]

    # הסרת שורות עם ערכים חסרים
    null_rows = df[df.isnull().any(axis=1)]
    if not null_rows.empty:
        print(f"Removed {len(null_rows)} rows with missing values.")
        df = df.dropna()

    # מיון לפי timestamp אם לא ממויין
    if not df["timestamp"].is_monotonic_increasing:
        print("Timestamps were not sorted. Sorted them.")
        df = df.sort_values(by="timestamp")

    # הסרת כפילויות לפי symbol + timestamp
    if "symbol" in df.columns:
        dup_pairs = df.duplicated(subset=["symbol", "timestamp"])
        if dup_pairs.any():
            print(f"Removed {dup_pairs.sum()} duplicate symbol-timestamp pairs.")
            df = df[~dup_pairs]

    # השלמת טווחי זמן חסרים
    df.set_index("timestamp", inplace=True)
    full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq=expected_interval)
    df = df.reindex(full_range)
    missing_times = df[df.isnull().any(axis=1)].index
    if not missing_times.empty:
        print(f"Filled {len(missing_times)} missing timestamps with NaN.")
    df.reset_index(inplace=True)
    df.rename(columns={"index": "timestamp"}, inplace=True)

    # שמירה מחדש
    df.to_csv(file_path, index=False)
    print("Cleaned file saved.")


In [3]:
def process_directory(directory):
    # Scan all CSV files in the given directory and check them
    all_files = [f for f in os.listdir(directory) if f.endswith(".csv")]
    for file in all_files:
        file_path = os.path.join(directory, file)
        valid = check_and_clean_csv(file_path)
        if not valid:
            print(f"{file} has issues")



In [4]:
directory = "data"  # Change this to your actual directory
process_directory(directory)


Checking data\BTCUSDT_15m_full.csv
Cleaned file saved.
BTCUSDT_15m_full.csv has issues

Checking data\LTCUSDT_15m_full.csv
Cleaned file saved.
LTCUSDT_15m_full.csv has issues

Checking data\LUNA2USDT_15m_full.csv
Cleaned file saved.
LUNA2USDT_15m_full.csv has issues

Checking data\MAGICUSDT_15m_full.csv
Cleaned file saved.
MAGICUSDT_15m_full.csv has issues

Checking data\MANAUSDT_15m_full.csv
Cleaned file saved.
MANAUSDT_15m_full.csv has issues

Checking data\MASKUSDT_15m_full.csv
Cleaned file saved.
MASKUSDT_15m_full.csv has issues

Checking data\MAVUSDT_15m_full.csv
Cleaned file saved.
MAVUSDT_15m_full.csv has issues

Checking data\MBLUSDT_15m_full.csv
Cleaned file saved.
MBLUSDT_15m_full.csv has issues

Checking data\MDTUSDT_15m_full.csv
Cleaned file saved.
MDTUSDT_15m_full.csv has issues

Checking data\MINAUSDT_15m_full.csv
Cleaned file saved.
MINAUSDT_15m_full.csv has issues

Checking data\MKRUSDT_15m_full.csv
Cleaned file saved.
MKRUSDT_15m_full.csv has issues

Checking data\MNT