In [1]:
import os
import pandas as pd

In [27]:
def combine_csv_files(folder_path):
    """
    Combine all CSV files in a folder in time order while handling overlaps and duplicates.

    Every ``*.csv`` file in ``folder_path`` is read (its 'time' column parsed as dates and
    only its first five columns kept), the files are ordered by the earliest timestamp
    they contain, and consecutive files are required to overlap in time. Rows that share
    a timestamp must agree in every kept column; one copy of each is retained.

    The result is written to ``<folder_path>/<folder name>_combined.csv``. Files whose
    name ends in ``_combined.csv`` are never treated as inputs, so re-running the function
    does not feed an earlier output back into itself.

    Parameters
    ----------
    folder_path : str
        Folder holding the CSV exports. A trailing path separator is allowed.

    Returns
    -------
    pandas.DataFrame or None
        The combined rows sorted by 'time' with a fresh 0..n-1 index, or None when the
        folder contains no usable CSV file.

    Raises
    ------
    ValueError
        If a file starts after the latest timestamp seen so far (a gap), or if rows
        sharing a timestamp differ in any other column.
    """
    # abspath strips a trailing separator (and resolves "."), so basename is the real
    # folder name instead of the empty string.
    stock_name = os.path.basename(os.path.abspath(folder_path))
    output_file = os.path.join(folder_path, f"{stock_name}_combined.csv")

    # Sorted by name only to make the read order deterministic; the processing order is
    # decided from the data below. Previously written outputs are excluded.
    csv_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.csv') and not f.endswith('_combined.csv')
    )

    if not csv_files:
        print("No CSV files found in the folder.")
        return

    # Read every file once, remembering its earliest timestamp for ordering.
    frames = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        data = pd.read_csv(file_path, parse_dates=['time'])
        data = data.iloc[:, :5]
        if data['time'].notna().any():
            frames.append((data['time'].min(), file, data))
        else:
            # A header-only export has nothing to contribute and no position in time.
            print(f"Skipping file with no timestamps: {file_path}")

    if not frames:
        print("No rows found in any CSV file.")
        return

    # File-system timestamps change on copy/unzip/checkout, so order the files by the
    # data they contain (file name breaks ties).
    frames.sort(key=lambda item: (item[0], item[1]))

    latest_time = None
    previous_file = None
    for first_time, file, data in frames:
        print(f"Processing file: {os.path.join(folder_path, file)}")

        # Each file must start at or before the latest timestamp already covered.
        if latest_time is not None and latest_time < first_time:
            raise ValueError(f"Gap detected between files: {previous_file} and {file}")

        last_time = data['time'].max()
        if latest_time is None or last_time > latest_time:
            latest_time = last_time
        previous_file = file

    # Single concat instead of growing the frame inside the loop.
    combined_data = pd.concat([data for _, _, data in frames])

    # Rows sharing a timestamp must be identical in every other column.
    duplicates = combined_data.duplicated(subset=['time'], keep=False)
    duplicate_rows = combined_data[duplicates]
    if not duplicate_rows.empty:
        # dropna=False: a missing value next to a real one counts as a mismatch.
        distinct_counts = duplicate_rows.groupby('time').nunique(dropna=False).max(axis=1)
        mismatched_times = distinct_counts[distinct_counts > 1].index
        if len(mismatched_times) > 0:
            mismatched_rows = duplicate_rows[duplicate_rows['time'].isin(mismatched_times)]
            raise ValueError(f"Mismatched duplicate rows found: {mismatched_rows}")

    combined_data = (
        combined_data
        .drop_duplicates(subset=['time'], keep='first')
        .sort_values('time', kind='mergesort')  # stable; no-op when already ordered
        .reset_index(drop=True)
    )

    # Sanity check on the invariant the steps above are meant to establish.
    if not combined_data['time'].is_unique:
        raise ValueError("Non-unique timestamps found after removing duplicates.")

    # Save the combined file
    combined_data.to_csv(output_file, index=False)
    print(f"Combined file saved to: {output_file}")

    return combined_data

In [40]:
# Folders to combine; each one holds the CSV exports for a single instrument.
folder_paths = ["./NSEBANK Front Month Future/"]
for folder_path in folder_paths:
    combined_data = combine_csv_files(folder_path)
    if combined_data is None:
        # combine_csv_files returns None when the folder has nothing to combine.
        continue
    # normpath drops the trailing slash so this copy is written beside the folder
    # ("<folder>_combined.csv") rather than as "_combined.csv" inside it, where the
    # next run would pick it up as an input file.
    combined_data.to_csv(os.path.normpath(folder_path) + '_combined.csv', index=False)

Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5.csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (1).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (2).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (3).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (4).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (5).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (6).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (7).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (8).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (9).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (10).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (11).csv
Processing file: ./NSEBANK Front Month Future/NSE_BANKNIFTY1!, 5 (12).csv
Processing file: ./NSEBANK Front Month Future/NSE_BA

In [None]:
# Disabled helpers: this whole cell is a single triple-quoted string literal, so neither
# function below is defined when the cell runs; the cell only evaluates to that string.
# Inside the string:
#   - check_trading_hours(data, trading_start, trading_end) adds a 'time_only' column to
#     `data`, prints the rows whose time of day is before trading_start or after
#     trading_end, and returns False if any exist, True otherwise.
#   - check_trading_dates(data) adds a 'date_only' column to `data`, compares the dates
#     present with a business-day (freq='B') range from the first to the last date,
#     prints any that are missing, and returns False if there are any, True otherwise.
# NOTE(review): the text ends with a stray "'" after the final `return True`; remove it
# if this code is ever pasted back into a live cell.
# NOTE(review): the 09:30:00-16:00:00 defaults may not match the session hours of the
# data combined in this notebook - confirm before re-enabling.
'''
def check_trading_hours(data, trading_start='09:30:00', trading_end='16:00:00'):
    """
    Check if all rows in the data fall within the trading hours.
    """
    data['time_only'] = data['time'].dt.time
    start_time = pd.to_datetime(trading_start).time()
    end_time = pd.to_datetime(trading_end).time()

    outside_hours = data[(data['time_only'] < start_time) | (data['time_only'] > end_time)]
    if not outside_hours.empty:
        print("Rows found outside trading hours:")
        print(outside_hours)
        return False
    return True


def check_trading_dates(data):
    """
    Check if there are any missing dates in the data.
    """
    data['date_only'] = data['time'].dt.date
    all_dates = pd.date_range(
        start=data['date_only'].min(),
        end=data['date_only'].max(),
        freq='B'  # Business days
    )

    missing_dates = set(all_dates.date) - set(data['date_only'])
    if missing_dates:
        print("Missing trading dates:")
        print(sorted(missing_dates))
        return False
    return True'
'''