In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import gc 

# Notebook-wide display and plotting defaults.
pd.set_option('display.max_columns', None)  # never truncate columns when displaying wide frames

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# deprecated the old names (removed in 3.8). Use whichever name this install provides
# so the cell runs without a deprecation warning or an OSError.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]

# Make the project-local `preprocessing` module importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')
from preprocessing import MIMICPreprocessor

  plt.style.use('seaborn')


In [3]:
# Configuration: raw CSVs are read from DATA_PATH; derived artifacts are written
# to PROCESSED_PATH.
DATA_PATH = Path('../data')
PROCESSED_PATH = DATA_PATH / 'processed'
# parents=True so a fresh checkout (no ../data directory yet) does not fail here.
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)
RANDOM_SEED = 42

preprocessor = MIMICPreprocessor(DATA_PATH)

In [4]:
# Step 1: load and process the PATIENTS table (the smaller table).
print("Step 1: Processing PATIENTS table...")
try:
    # Read only the demographic columns, with explicit dtypes for the id,
    # gender and expire-flag columns.
    patients_read_options = dict(
        usecols=['SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG'],
        dtype={'SUBJECT_ID': 'int32', 'GENDER': 'category', 'EXPIRE_FLAG': 'int8'},
    )
    patients_df = pd.read_csv(DATA_PATH / "PATIENTS.csv", **patients_read_options)
    patients_df = preprocessor.process_demographics(patients_df)
    print(f"Processed {len(patients_df)} patients")

    # Persist the processed patient data, then trigger a garbage-collection pass.
    patients_df.to_pickle(PROCESSED_PATH / 'processed_patients.pkl')
    gc.collect()
except Exception as err:
    # Report which step failed, then let the original exception propagate.
    print(f"Error processing PATIENTS: {err!s}")
    raise

2024-11-07 23:15:23,773 - INFO - Processing demographics data...


Step 1: Processing PATIENTS table...


2024-11-07 23:15:24,384 - INFO - 
Demographic Processing Summary:
2024-11-07 23:15:24,384 - INFO - Total patients: 46520
2024-11-07 23:15:24,385 - INFO - Patients with accurate age (≤89): 28386
2024-11-07 23:15:24,385 - INFO - Patients with age >89 (marked as 90+): 18134


Processed 46520 patients


In [5]:
# Step 2: process the NOTEEVENTS table chunk by chunk.
print("\nStep 2: Processing NOTEEVENTS table in chunks...")
try:
    chunk_size = 50000  # rows per chunk; adjust to the available memory
    notes_chunks = []

    # Columns kept from each cleaned chunk.
    kept_note_columns = ['SUBJECT_ID', 'HADM_ID', 'cleaned_text', 'sections']

    notes_reader = pd.read_csv(
        DATA_PATH / "NOTEEVENTS.csv",
        usecols=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY', 'TEXT'],
        chunksize=chunk_size,
    )
    for chunk in tqdm(notes_reader):
        processed_chunk = preprocessor.clean_notes(chunk)
        notes_chunks.append(processed_chunk[kept_note_columns])
        # Drop the raw chunk before the next one is read.
        del chunk
        gc.collect()

    # Stitch the per-chunk results together and release the list of pieces.
    notes_df = pd.concat(notes_chunks, ignore_index=True)
    del notes_chunks
    gc.collect()

    # Persist the processed clinical notes.
    notes_df.to_pickle(PROCESSED_PATH / 'processed_notes.pkl')
    print(f"Processed {len(notes_df)} clinical notes")
except Exception as err:
    # Report which step failed, then let the original exception propagate.
    print(f"Error processing NOTEEVENTS: {err!s}")
    raise



Step 2: Processing NOTEEVENTS table in chunks...


0it [00:00, ?it/s]2024-11-07 23:15:31,882 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extracting sections: 0it [00:00, ?it/s][A
1it [00:03,  3.74s/it]2024-11-07 23:15:33,451 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extracting sections: 0it [00:00, ?it/s][A
2it [00:05,  2.46s/it]2024-11-07 23:15:33,757 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extracting sections: 0it [00:00, ?it/s][A
3it [00:05,  1.48s/it]2024-11-07 23:15:33,926 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extracting sections: 0it [00:00, ?it/s][A
4it [00:05,  1.04it/s]2024-11-07 23:15:34,089 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extracting sections: 0it [00:00, ?it/s][A
5it [00:05,  1.49it/s]2024-11-07 23:15:34,255 - INFO - Starting clinical notes cleaning...

Cleaning text: 0it [00:00, ?it/s][A

Extractin

Cleaning text:   0%|                                  | 0/17609 [00:00<?, ?it/s][A
Cleaning text:   2%|▍                     | 394/17609 [00:00<00:04, 3933.98it/s][A
Cleaning text:   4%|▉                     | 791/17609 [00:00<00:04, 3948.63it/s][A
Cleaning text:   7%|█▍                   | 1186/17609 [00:00<00:04, 3918.29it/s][A
Cleaning text:   9%|█▉                   | 1578/17609 [00:00<00:04, 3858.99it/s][A
Cleaning text:  11%|██▎                  | 1983/17609 [00:00<00:03, 3926.10it/s][A
Cleaning text:  13%|██▊                  | 2377/17609 [00:00<00:03, 3926.46it/s][A
Cleaning text:  16%|███▎                 | 2770/17609 [00:00<00:03, 3901.74it/s][A
Cleaning text:  18%|███▊                 | 3176/17609 [00:00<00:03, 3949.85it/s][A
Cleaning text:  20%|████▎                | 3590/17609 [00:00<00:03, 4006.35it/s][A
Cleaning text:  23%|████▊                | 3998/17609 [00:01<00:03, 4028.18it/s][A
Cleaning text:  25%|█████▎               | 4403/17609 [00:01<00:03, 4031.69i

Extracting sections:  83%|███████████▋  | 14682/17609 [00:04<00:01, 2923.43it/s][A
Extracting sections:  85%|███████████▉  | 14975/17609 [00:05<00:00, 2917.04it/s][A
Extracting sections:  87%|████████████▏ | 15267/17609 [00:05<00:00, 2899.55it/s][A
Extracting sections:  88%|████████████▎ | 15558/17609 [00:05<00:00, 2865.84it/s][A
Extracting sections:  90%|████████████▌ | 15845/17609 [00:05<00:00, 2815.28it/s][A
Extracting sections:  92%|████████████▊ | 16127/17609 [00:05<00:00, 2785.70it/s][A
Extracting sections:  93%|█████████████ | 16406/17609 [00:05<00:00, 2750.11it/s][A
Extracting sections:  95%|█████████████▎| 16689/17609 [00:05<00:00, 2770.29it/s][A
Extracting sections:  96%|█████████████▍| 16967/17609 [00:05<00:00, 2757.24it/s][A
Extracting sections:  98%|█████████████▋| 17252/17609 [00:05<00:00, 2782.78it/s][A
Extracting sections: 100%|██████████████| 17609/17609 [00:05<00:00, 2952.35it/s][A
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Extracting sections:  66%|█████████▏    | 10650/16169 [00:03<00:01, 3134.11it/s][A
Extracting sections:  68%|█████████▌    | 10972/16169 [00:03<00:01, 3158.50it/s][A
Extracting sections:  70%|█████████▊    | 11288/16169 [00:03<00:01, 3077.18it/s][A
Extracting sections:  72%|██████████    | 11597/16169 [00:03<00:01, 3023.62it/s][A
Extracting sections:  74%|██████████▎   | 11908/16169 [00:03<00:01, 3045.78it/s][A
Extracting sections:  76%|██████████▌   | 12213/16169 [00:03<00:01, 3004.64it/s][A
Extracting sections:  78%|██████████▊   | 12538/16169 [00:04<00:01, 3075.46it/s][A
Extracting sections:  80%|███████████▏  | 12882/16169 [00:04<00:01, 3180.16it/s][A
Extracting sections:  82%|███████████▍  | 13201/16169 [00:04<00:00, 3114.20it/s][A
Extracting sections:  84%|███████████▋  | 13513/16169 [00:04<00:00, 3111.51it/s][A
Extracting sections:  86%|███████████▉  | 13825/16169 [00:04<00:00, 3078.43it/s][A
Extracting sections:  87%|████████████▏ | 14134/16169 [00:04<00:00, 3046.35i

Extracting sections:  51%|███████▋       | 8142/15894 [00:02<00:02, 2844.24it/s][A
Extracting sections:  53%|███████▉       | 8427/15894 [00:02<00:02, 2843.85it/s][A
Extracting sections:  55%|████████▏      | 8739/15894 [00:02<00:02, 2925.22it/s][A
Extracting sections:  57%|████████▌      | 9057/15894 [00:02<00:02, 2999.61it/s][A
Extracting sections:  59%|████████▊      | 9364/15894 [00:03<00:02, 3019.37it/s][A
Extracting sections:  61%|█████████▏     | 9670/15894 [00:03<00:02, 3030.82it/s][A
Extracting sections:  63%|█████████▍     | 9974/15894 [00:03<00:01, 2963.77it/s][A
Extracting sections:  65%|█████████     | 10271/15894 [00:03<00:01, 2919.27it/s][A
Extracting sections:  66%|█████████▎    | 10564/15894 [00:03<00:01, 2864.07it/s][A
Extracting sections:  68%|█████████▌    | 10851/15894 [00:03<00:01, 2810.84it/s][A
Extracting sections:  70%|█████████▊    | 11133/15894 [00:03<00:01, 2787.89it/s][A
Extracting sections:  72%|██████████    | 11412/15894 [00:03<00:01, 2760.76i

Extracting sections:  37%|█████▌         | 5739/15553 [00:01<00:03, 2988.98it/s][A
Extracting sections:  39%|█████▊         | 6042/15553 [00:02<00:03, 3000.37it/s][A
Extracting sections:  41%|██████▏        | 6354/15553 [00:02<00:03, 3035.39it/s][A
Extracting sections:  43%|██████▍        | 6658/15553 [00:02<00:02, 2996.36it/s][A
Extracting sections:  45%|██████▋        | 6975/15553 [00:02<00:02, 3046.08it/s][A
Extracting sections:  47%|███████        | 7280/15553 [00:02<00:02, 3013.99it/s][A
Extracting sections:  49%|███████▎       | 7582/15553 [00:02<00:02, 3013.69it/s][A
Extracting sections:  51%|███████▌       | 7888/15553 [00:02<00:02, 3023.17it/s][A
Extracting sections:  53%|███████▉       | 8191/15553 [00:02<00:02, 3016.95it/s][A
Extracting sections:  55%|████████▏      | 8498/15553 [00:02<00:02, 3030.28it/s][A
Extracting sections:  57%|████████▍      | 8802/15553 [00:02<00:02, 3017.48it/s][A
Extracting sections:  59%|████████▊      | 9111/15553 [00:03<00:02, 3037.92i

Extracting sections:  23%|███▍           | 3656/16244 [00:01<00:04, 3033.10it/s][A
Extracting sections:  24%|███▋           | 3960/16244 [00:01<00:04, 2999.07it/s][A
Extracting sections:  26%|███▉           | 4261/16244 [00:01<00:04, 2953.13it/s][A
Extracting sections:  28%|████▏          | 4576/16244 [00:01<00:03, 3010.69it/s][A
Extracting sections:  30%|████▌          | 4900/16244 [00:01<00:03, 3077.95it/s][A
Extracting sections:  32%|████▊          | 5210/16244 [00:01<00:03, 3082.03it/s][A
Extracting sections:  34%|█████          | 5521/16244 [00:01<00:03, 3089.05it/s][A
Extracting sections:  36%|█████▍         | 5840/16244 [00:01<00:03, 3116.41it/s][A
Extracting sections:  38%|█████▋         | 6169/16244 [00:02<00:03, 3167.96it/s][A
Extracting sections:  40%|█████▉         | 6486/16244 [00:02<00:03, 3138.79it/s][A
Extracting sections:  42%|██████▎        | 6823/16244 [00:02<00:02, 3205.65it/s][A
Extracting sections:  44%|██████▌        | 7159/16244 [00:02<00:02, 3250.48i

Extracting sections:   8%|█▏             | 1194/15384 [00:00<00:04, 2873.16it/s][A
Extracting sections:  10%|█▍             | 1482/15384 [00:00<00:04, 2852.01it/s][A
Extracting sections:  11%|█▋             | 1768/15384 [00:00<00:04, 2841.91it/s][A
Extracting sections:  13%|██             | 2053/15384 [00:00<00:04, 2844.28it/s][A
Extracting sections:  15%|██▎            | 2340/15384 [00:00<00:04, 2850.26it/s][A
Extracting sections:  17%|██▌            | 2629/15384 [00:00<00:04, 2862.10it/s][A
Extracting sections:  19%|██▊            | 2920/15384 [00:01<00:04, 2876.08it/s][A
Extracting sections:  21%|███▏           | 3208/15384 [00:01<00:04, 2873.59it/s][A
Extracting sections:  23%|███▍           | 3509/15384 [00:01<00:04, 2914.84it/s][A
Extracting sections:  25%|███▋           | 3805/15384 [00:01<00:03, 2927.29it/s][A
Extracting sections:  27%|████           | 4105/15384 [00:01<00:03, 2948.69it/s][A
Extracting sections:  29%|████▎          | 4400/15384 [00:01<00:03, 2923.74i


Extracting sections:   0%|                            | 0/16426 [00:00<?, ?it/s][A
Extracting sections:   2%|▎               | 321/16426 [00:00<00:05, 3192.50it/s][A
Extracting sections:   4%|▋               | 648/16426 [00:00<00:04, 3233.55it/s][A
Extracting sections:   6%|▉               | 972/16426 [00:00<00:05, 3086.80it/s][A
Extracting sections:   8%|█▏             | 1282/16426 [00:00<00:04, 3082.14it/s][A
Extracting sections:  10%|█▍             | 1591/16426 [00:00<00:04, 3078.75it/s][A
Extracting sections:  12%|█▋             | 1913/16426 [00:00<00:04, 3125.30it/s][A
Extracting sections:  14%|██             | 2230/16426 [00:00<00:04, 3138.96it/s][A
Extracting sections:  15%|██▎            | 2545/16426 [00:00<00:04, 3111.09it/s][A
Extracting sections:  17%|██▌            | 2868/16426 [00:00<00:04, 3143.67it/s][A
Extracting sections:  19%|██▉            | 3183/16426 [00:01<00:04, 3123.69it/s][A
Extracting sections:  21%|███▏           | 3496/16426 [00:01<00:04, 3112.64

Extracting sections:  11%|█▋             | 1200/10606 [00:00<00:03, 2899.28it/s][A
Extracting sections:  14%|██             | 1491/10606 [00:00<00:03, 2832.00it/s][A
Extracting sections:  17%|██▌            | 1786/10606 [00:00<00:03, 2868.58it/s][A
Extracting sections:  20%|██▉            | 2080/10606 [00:00<00:02, 2890.02it/s][A
Extracting sections:  22%|███▎           | 2378/10606 [00:00<00:02, 2916.46it/s][A
Extracting sections:  25%|███▊           | 2678/10606 [00:00<00:02, 2940.07it/s][A
Extracting sections:  28%|████▏          | 2974/10606 [00:01<00:02, 2945.25it/s][A
Extracting sections:  31%|████▌          | 3269/10606 [00:01<00:02, 2941.15it/s][A
Extracting sections:  34%|█████          | 3565/10606 [00:01<00:02, 2945.44it/s][A
Extracting sections:  36%|█████▍         | 3861/10606 [00:01<00:02, 2948.72it/s][A
Extracting sections:  39%|█████▉         | 4164/10606 [00:01<00:02, 2972.31it/s][A
Extracting sections:  42%|██████▎        | 4466/10606 [00:01<00:02, 2984.79i


Extracting sections:   0%|                             | 0/1722 [00:00<?, ?it/s][A
Extracting sections:  21%|███▌             | 363/1722 [00:00<00:00, 3622.39it/s][A
Extracting sections:  42%|███████▏         | 726/1722 [00:00<00:00, 3519.59it/s][A
Extracting sections:  63%|██████████      | 1079/1722 [00:00<00:00, 3503.75it/s][A
Extracting sections: 100%|████████████████| 1722/1722 [00:00<00:00, 3348.74it/s][A
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes_df['sections'] = notes_df['cleaned_text'].progress_apply(extract_sections)
41it [01:53,  1.47it/s]2024-11-07 23:17:22,056 - INFO - Starting clinical notes cleaning...

Cleaning text:   0%|                                   | 0/2789 [00:00<?, ?it/s][A
Cleaning text:  15%|███▎                   | 408/2789 [0

Processed 141624 clinical notes


In [None]:
# Step 3: process the PRESCRIPTIONS table chunk by chunk.
print("\nStep 3: Processing PRESCRIPTIONS table in chunks...")
try:
    chunk_size = 100000  # rows per chunk; adjust to the available memory
    prescription_chunks = []

    prescriptions_reader = pd.read_csv(
        DATA_PATH / "PRESCRIPTIONS.csv",
        usecols=[
            'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG', 'ROUTE',
            'DOSE_VAL_RX', 'DOSE_UNIT_RX',
        ],
        dtype={
            'SUBJECT_ID': 'int32',
            'HADM_ID': 'int32',
            'DRUG': 'category',
            'ROUTE': 'category',
            'DOSE_UNIT_RX': 'category',
        },
        chunksize=chunk_size,
    )
    for chunk in tqdm(prescriptions_reader):
        processed_chunk = preprocessor.process_prescriptions(chunk)
        prescription_chunks.append(processed_chunk)
        # Drop the raw chunk before the next one is read.
        del chunk
        gc.collect()

    # Stitch the per-chunk results together and release the list of pieces.
    prescriptions_df = pd.concat(prescription_chunks, ignore_index=True)
    del prescription_chunks
    gc.collect()

    # Persist the processed prescription data.
    prescriptions_df.to_pickle(PROCESSED_PATH / 'processed_prescriptions.pkl')
    print(f"Processed {len(prescriptions_df)} prescriptions")
except Exception as err:
    # Report which step failed, then let the original exception propagate.
    print(f"Error processing PRESCRIPTIONS: {err!s}")
    raise


Step 3: Processing PRESCRIPTIONS table in chunks...


0it [00:00, ?it/s]2024-11-07 23:17:24,295 - INFO - Processing prescription data...
2024-11-07 23:17:25,865 - INFO - Processed 100000 prescriptions
2024-11-07 23:17:25,865 - INFO - Available columns after prescription processing:
2024-11-07 23:17:25,866 - INFO - ['SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG', 'DOSE_VAL_RX', 'DOSE_UNIT_RX', 'ROUTE', 'DURATION', 'DRUG_NAME_GENERIC', 'drug_id']
1it [00:01,  1.73s/it]2024-11-07 23:17:26,019 - INFO - Processing prescription data...
2024-11-07 23:17:27,602 - INFO - Processed 100000 prescriptions
2024-11-07 23:17:27,603 - INFO - Available columns after prescription processing:
2024-11-07 23:17:27,603 - INFO - ['SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG', 'DOSE_VAL_RX', 'DOSE_UNIT_RX', 'ROUTE', 'DURATION', 'DRUG_NAME_GENERIC', 'drug_id']
2it [00:03,  1.73s/it]2024-11-07 23:17:27,754 - INFO - Processing prescription data...
2024-11-07 23:17:29,343 - INFO - Processed 100000 prescriptions
2024-11-07 23:17:29,344 - INFO - Avai

2024-11-07 23:18:00,624 - INFO - Processed 100000 prescriptions
2024-11-07 23:18:00,625 - INFO - Available columns after prescription processing:
2024-11-07 23:18:00,625 - INFO - ['SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG', 'DOSE_VAL_RX', 'DOSE_UNIT_RX', 'ROUTE', 'DURATION', 'DRUG_NAME_GENERIC', 'drug_id']
21it [00:36,  1.74s/it]2024-11-07 23:18:00,785 - INFO - Processing prescription data...
2024-11-07 23:18:02,379 - INFO - Processed 100000 prescriptions
2024-11-07 23:18:02,380 - INFO - Available columns after prescription processing:
2024-11-07 23:18:02,380 - INFO - ['SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG', 'DOSE_VAL_RX', 'DOSE_UNIT_RX', 'ROUTE', 'DURATION', 'DRUG_NAME_GENERIC', 'drug_id']
22it [00:38,  1.74s/it]2024-11-07 23:18:02,535 - INFO - Processing prescription data...
2024-11-07 23:18:04,120 - INFO - Processed 100000 prescriptions
2024-11-07 23:18:04,121 - INFO - Available columns after prescription processing:
2024-11-07 23:18:04,121 - INFO - ['S

In [6]:
# Step 4: load the DIAGNOSES_ICD table and save it unchanged as a pickle.
print("\nStep 4: Processing DIAGNOSES_ICD table...")
try:
    diagnoses_dtypes = {
        'SUBJECT_ID': 'int32',
        'HADM_ID': 'int32',
        'ICD9_CODE': 'category',
    }
    diagnoses_df = pd.read_csv(
        DATA_PATH / "DIAGNOSES_ICD.csv",
        usecols=list(diagnoses_dtypes),
        dtype=diagnoses_dtypes,
    )
    diagnoses_df.to_pickle(PROCESSED_PATH / 'processed_diagnoses.pkl')
    print(f"Processed {len(diagnoses_df)} diagnoses")
except Exception as err:
    # Report which step failed, then let the original exception propagate.
    print(f"Error processing DIAGNOSES_ICD: {err!s}")
    raise


Step 4: Processing DIAGNOSES_ICD table...
Processed 651047 diagnoses


In [7]:
# Step 5: analysis and visualisation (optional; run as needed).
def analyze_processed_data():
    """Summarise the processed tables saved under ``PROCESSED_PATH``.

    Loads the processed patients, prescriptions and notes pickles and passes
    them (or a random sample of them) to the plotting/summary helpers.
    Sampling is seeded with ``RANDOM_SEED`` so repeated runs analyse the same rows.

    NOTE(review): ``plot_demographics``, ``plot_prescriptions_summary`` and
    ``analyze_notes_summary`` are not defined or imported in the visible part
    of this notebook -- confirm where they come from before calling this.
    """
    print("\nAnalyzing processed data...")

    # Demographics: the patients table is plotted in full.
    patients_df = pd.read_pickle(PROCESSED_PATH / 'processed_patients.pkl')
    plot_demographics(patients_df)

    # Prescriptions: analyse at most 100000 sampled rows to limit memory use.
    prescriptions_df = pd.read_pickle(PROCESSED_PATH / 'processed_prescriptions.pkl')
    prescription_sample = prescriptions_df.sample(
        n=min(100000, len(prescriptions_df)),
        random_state=RANDOM_SEED,
    )
    plot_prescriptions_summary(prescription_sample)
    # Release both the sample and the full frame before loading the notes.
    del prescription_sample, prescriptions_df
    gc.collect()

    # Clinical notes: analyse at most 10000 sampled rows.
    notes_df = pd.read_pickle(PROCESSED_PATH / 'processed_notes.pkl')
    notes_sample = notes_df.sample(
        n=min(10000, len(notes_df)),
        random_state=RANDOM_SEED,
    )
    analyze_notes_summary(notes_sample)

In [8]:
# Step 6: merge the processed tables chunk by chunk (run only if needed).
def merge_data_in_chunks():
    """Merge processed notes with prescriptions, diagnoses and patients.

    Loads the four pickles written under ``PROCESSED_PATH``, splits the notes
    into ``len(notes_df) // chunk_size + 1`` consecutive row groups, passes each
    group to ``preprocessor.merge_patient_data`` together with the full
    prescriptions, diagnoses and patients frames, and concatenates the results.

    Returns:
        The result of ``pd.concat(..., ignore_index=True)`` over every merged chunk.
    """
    print("\nMerging data in chunks...")

    chunk_size = 50000  # target number of note rows per merge call; adjustable
    merged_chunks = []

    notes_df = pd.read_pickle(PROCESSED_PATH / 'processed_notes.pkl')
    patients_df = pd.read_pickle(PROCESSED_PATH / 'processed_patients.pkl')
    diagnoses_df = pd.read_pickle(PROCESSED_PATH / 'processed_diagnoses.pkl')
    # Loaded once: the same full prescriptions table is used for every chunk, so
    # re-reading the pickle inside the loop only repeated the disk I/O.
    # NOTE(review): assumes merge_patient_data does not modify the frames it is
    # given in place -- confirm against the preprocessing module.
    prescriptions_df = pd.read_pickle(PROCESSED_PATH / 'processed_prescriptions.pkl')

    # Split row *positions* rather than the DataFrame itself: np.array_split on a
    # DataFrame relies on the deprecated DataFrame.swapaxes. The resulting chunk
    # boundaries are the same as np.array_split(notes_df, n_chunks) would give.
    n_chunks = len(notes_df) // chunk_size + 1
    for positions in tqdm(np.array_split(np.arange(len(notes_df)), n_chunks)):
        chunk = notes_df.iloc[positions]

        merged_chunk = preprocessor.merge_patient_data(
            chunk,
            prescriptions_df,
            diagnoses_df,
            patients_df
        )
        merged_chunks.append(merged_chunk)
        del chunk
        gc.collect()

    merged_df = pd.concat(merged_chunks, ignore_index=True)
    del merged_chunks
    gc.collect()

    return merged_df

In [1]:
# Build the merged dataset, then derive the train/test frames from it.
try:
    merged_df = merge_data_in_chunks()
    split_frames = preprocessor.create_train_test_split(merged_df)
    train_df, test_df = split_frames
except Exception as err:
    # Report the failure, then let the original exception propagate.
    print(f"Error during analysis: {err!s}")
    raise

Error during analysis: name 'merge_data_in_chunks' is not defined


NameError: name 'merge_data_in_chunks' is not defined

In [None]:
       # 分块保存
        chunk_size = 100000
        print("Saving training data in chunks...")
        for i, chunk in enumerate(np.array_split(train_df, len(train_df) // chunk_size + 1)):
            chunk.to_csv(PROCESSED_PATH / f'train_data_chunk_{i}.csv', index=False)
            
        print("Saving test data in chunks...")
        for i, chunk in enumerate(np.array_split(test_df, len(test_df) // chunk_size + 1)):
            chunk.to_csv(PROCESSED_PATH / f'test_data_chunk_{i}.csv', index=False)
            
        print("\nProcessing completed successfully!")
        print(f"Training set size: {len(train_df)}")
        print(f"Test set size: {len(test_df)}")
        print(f"Data saved to: {PROCESSED_PATH}")
        
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        raise