In [1]:
import pandas as pd
import glob
import os
import re

def get_nq_contracts(start_year=2015, end_year=2025):
    """Build the quarterly NQ contract windows for a range of years.

    One contract is generated per quarter month (H=March, M=June,
    U=September, Z=December) for every year from ``start_year`` through
    ``end_year`` inclusive.

    Window rules, all in the ``America/New_York`` time zone:

    * A contract ends at 16:59:00 on the Friday one week before the third
      Friday of its month.
    * The first contract starts at midnight (UTC-05:00) on January 1 of
      ``start_year``.
    * Every later contract starts at 18:00:00 two days after the previous
      contract's end.
    * The last contract's end is overridden to 23:59:59 (UTC-05:00) on
      December 31 of ``end_year + 1`` so it stays open-ended.

    Args:
        start_year: First year to generate contracts for.
        end_year: Last year (inclusive) to generate contracts for.

    Returns:
        A list of dicts ordered chronologically, each with the keys
        ``'code'`` (e.g. ``'NQH2021'``), ``'end'`` and ``'start'``
        (tz-aware ``pd.Timestamp`` values). The list is empty when
        ``start_year > end_year``.
    """
    months = [('H', 3), ('M', 6), ('U', 9), ('Z', 12)]
    contracts = []
    for year in range(start_year, end_year + 1):
        for code, month in months:
            # The third Friday is the first Friday on or after the 15th;
            # weekday() == 4 is Friday.
            mid_month = pd.Timestamp(year=year, month=month, day=15)
            days_to_friday = (4 - mid_month.weekday()) % 7
            third_friday = mid_month + pd.Timedelta(days=days_to_friday)
            contract_end = (third_friday - pd.Timedelta(days=7)).replace(hour=16, minute=59, second=0)
            contracts.append({
                'code': f"NQ{code}{year}",
                'end': contract_end.tz_localize('America/New_York')
            })

    # An empty year range yields no contracts; return early instead of
    # failing on the contracts[0] / contracts[-1] accesses below.
    if not contracts:
        return contracts

    contracts[0]['start'] = pd.Timestamp(f"{start_year}-01-01T00:00:00-05:00").tz_convert('America/New_York')
    for previous, current in zip(contracts, contracts[1:]):
        # Start two days after the previous end, then pin the wall-clock
        # time to 18:00:00.
        current['start'] = (previous['end'] + pd.Timedelta(days=2)).replace(hour=18, minute=0, second=0)

    # Overwritten only after all starts are derived, so no start depends
    # on this extended end.
    contracts[-1]['end'] = pd.Timestamp(f"{end_year+1}-12-31T23:59:59-05:00").tz_convert('America/New_York')
    return contracts

# Contract windows used to trim each raw file to its own contract's span.
contracts = get_nq_contracts(start_year=2015, end_year=2025)

# Input folder with the raw per-contract CSVs, plus the two output folders
# (per-file filtered data and per-year combined data).
raw_folder = r'D:\Youtube\Data\Raw Data'
post_parse_folder = r'D:\Youtube\Data\Post Parse'
yearly_folder = r'D:\Youtube\Data\Yearly Data'

# Create both output folders up front; existing folders are left alone.
for output_dir in (post_parse_folder, yearly_folder):
    os.makedirs(output_dir, exist_ok=True)

# Index the contract windows by code once so each file needs only a dict
# lookup, and compile the filename pattern outside the loop.
contract_by_code = {c['code']: c for c in contracts}
contract_pattern = re.compile(r'(NQ[HMUZ]\d{4})', re.IGNORECASE)

# For every raw CSV: read the contract code from the file name, convert the
# 'time' column to New York time, keep only rows inside that contract's
# window, tag the rows with the contract code, and save the result under a
# name built from the first and last remaining dates.
for file in glob.glob(os.path.join(raw_folder, '*.csv')):
    basename = os.path.basename(file)
    match = contract_pattern.search(basename)
    contract_from_file = match.group(1).upper() if match else None

    df = pd.read_csv(file)
    df['time'] = pd.to_datetime(df['time'], utc=True)
    df['time'] = df['time'].dt.tz_convert('America/New_York')

    # Files whose name carries no known contract code are passed through
    # unfiltered (contract_info is None).
    contract_info = contract_by_code.get(contract_from_file)
    if contract_info is not None:
        mask = (df['time'] >= contract_info['start']) & (df['time'] <= contract_info['end'])
        # Copy the filtered rows so the column assignment below writes to an
        # independent frame instead of a slice (avoids SettingWithCopyWarning).
        df = df.loc[mask].copy()
    df['contract'] = contract_from_file

    # Nothing left after filtering: skip writing an empty file.
    if df.empty:
        continue

    min_date = df['time'].min().strftime('%Y-%m-%d')
    max_date = df['time'].max().strftime('%Y-%m-%d')
    outname = f"{min_date} to {max_date}.csv"
    df.to_csv(os.path.join(post_parse_folder, outname), index=False)
    print(f"Saved: {outname}")

# Merge every post-parse CSV into one time-sorted frame, then write one CSV
# per calendar year (New York time) into the yearly folder.
all_files = glob.glob(os.path.join(post_parse_folder, '*.csv'))
if not all_files:
    print("No post-parse files found to combine.")
else:
    frames = [pd.read_csv(path) for path in all_files]
    df_master = pd.concat(frames, ignore_index=True)

    # Parse as UTC first, then convert to New York time.
    utc_times = pd.to_datetime(df_master['time'], utc=True)
    df_master['time'] = utc_times.dt.tz_convert('America/New_York')
    df_master = df_master.sort_values('time')

    # Year of each row, computed once and reused for every yearly slice.
    row_years = df_master['time'].dt.year
    for year in row_years.unique():
        df_year = df_master[row_years == year]
        out_path = os.path.join(yearly_folder, f'{year}_NQ.csv')
        df_year.to_csv(out_path, index=False)
        print(f"Yearly file saved: {out_path}")

Saved: 2020-12-13 to 2021-01-08.csv
Saved: 2021-02-21 to 2021-03-12.csv
Saved: 2021-01-10 to 2021-01-29.csv
Saved: 2021-01-31 to 2021-02-19.csv
Saved: 2022-01-16 to 2022-02-04.csv
Saved: 2021-12-26 to 2022-01-14.csv
Saved: 2022-02-06 to 2022-02-25.csv
Saved: 2022-02-27 to 2022-03-11.csv
Saved: 2021-12-12 to 2021-12-23.csv
Saved: 2022-12-11 to 2023-01-06.csv
Saved: 2023-01-29 to 2023-02-17.csv
Saved: 2023-02-19 to 2023-03-10.csv
Saved: 2023-01-08 to 2023-01-27.csv
Saved: 2024-01-07 to 2024-01-26.csv
Saved: 2024-01-28 to 2024-02-16.csv
Saved: 2023-12-10 to 2024-01-05.csv
Saved: 2024-02-18 to 2024-03-08.csv
Saved: 2024-12-15 to 2025-01-10.csv
Saved: 2025-01-12 to 2025-01-31.csv
Saved: 2025-02-02 to 2025-02-21.csv
Saved: 2025-02-23 to 2025-03-14.csv
Saved: 2021-05-02 to 2021-05-21.csv
Saved: 2021-03-21 to 2021-04-09.csv
Saved: 2021-03-14 to 2021-03-19.csv
Saved: 2021-05-23 to 2021-06-11.csv
Saved: 2021-04-11 to 2021-04-30.csv
Saved: 2022-05-01 to 2022-05-20.csv
Saved: 2022-04-03 to 2022-04