In [1]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from tqdm import tqdm

In [2]:
# Directory that is listed for the source '.csv' files to split.
input_dir = '/data/stock_data/csv/5-2/'
# Root directory for the results; one sub-directory per year is created below it.
output_dir = '/data/stock_csv_data_by_year/'

In [3]:
# Names (not paths) of every '.csv' entry in input_dir. os.listdir makes no
# ordering guarantee, so the names are sorted to keep the processing order
# identical from one run to the next.
csv_files = sorted(file for file in os.listdir(input_dir) if file.endswith('.csv'))

In [4]:
def process_file(csv_file, progress_bar, src_dir=None, dst_dir=None):
    """Split one CSV file into one CSV file per year.

    The rows are grouped by the first four characters of the string form of
    the ``time`` column. Each group is written to
    ``<dst_dir>/<year>/<prefix>_<year>.csv``, where ``<prefix>`` is the part of
    ``csv_file`` before its first underscore. An output file that already
    exists is left untouched.

    Args:
        csv_file: File name (not a path) of the CSV inside the source directory.
        progress_bar: Object exposing ``set_postfix(dict)`` and ``update(int)``;
            it is advanced by one when this call ends, whether or not it failed.
        src_dir: Directory containing ``csv_file``. Defaults to the module-level
            ``input_dir``.
        dst_dir: Root directory for the per-year output. Defaults to the
            module-level ``output_dir``.

    Raises:
        KeyError: If the CSV has no ``time`` column.
        Exception: Anything raised while reading or writing is propagated to
            the caller.
    """
    if src_dir is None:
        src_dir = input_dir
    if dst_dir is None:
        dst_dir = output_dir

    progress_bar.set_postfix({'csv_file': csv_file})

    try:
        df = pd.read_csv(os.path.join(src_dir, csv_file))
        file_prefix = csv_file.split('_')[0]

        # Derive the year key once and partition the frame in a single pass.
        # Grouping on the exact key (instead of a per-year startswith scan)
        # guarantees that every row lands in exactly one output file.
        year_keys = df['time'].apply(lambda x: str(x)[:4])

        # sort=False keeps the years in order of first appearance.
        for year, year_df in df.groupby(year_keys, sort=False):
            year_dir = os.path.join(dst_dir, year)
            os.makedirs(year_dir, exist_ok=True)

            new_file_path = os.path.join(year_dir, f"{file_prefix}_{year}.csv")
            if os.path.exists(new_file_path):
                continue

            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a truncated file that the
            # exists-check above would treat as finished on the next run.
            tmp_file_path = new_file_path + '.tmp'
            year_df.to_csv(tmp_file_path, index=False)
            os.replace(tmp_file_path, new_file_path)
    finally:
        # Count the file even when it failed, so the bar reaches its total.
        progress_bar.update(1)

In [5]:
# Number of worker threads that read and write CSV files concurrently.
MAX_WORKERS = 30

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One shared progress bar; every process_file call advances it.
    with tqdm(total=len(csv_files)) as progress_bar:
        # Map each future back to its file name so a failure can be attributed.
        future_to_file = {
            executor.submit(process_file, csv_file, progress_bar): csv_file
            for csv_file in csv_files
        }

        # Consume the futures in completion order so that a failure is
        # reported when it happens, not when its turn in submission order
        # comes up.
        for future in as_completed(future_to_file):
            csv_file = future_to_file[future]
            try:
                future.result()  # re-raises whatever the worker raised
            except Exception as exc:
                # tqdm.write prints above the bar instead of corrupting it.
                tqdm.write(f'{csv_file} generated an exception: {exc}')
   

100%|██████████| 5346/5346 [2:21:53<00:00,  1.59s/it, csv_file=sz.301591_19900101-20231231.csv]   
