In [1]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

from data.data_loader import FactorDataLoader
import pandas as pd
import numpy as np

print("Data loader imported successfully!")

Data loader imported successfully!


In [2]:
# 데이터 파일들이 존재하는지 확인
import os
from pathlib import Path

base_path = Path("../data/")
factor_file = base_path / "factors.csv"
industry_file = base_path / "industry49.csv"

print(f"Base path exists: {base_path.exists()}")
print(f"Factor file exists: {factor_file.exists()}")
print(f"Industry file exists: {industry_file.exists()}")

# 디렉토리 구조 확인
if base_path.exists():
    files = list(base_path.glob("*"))
    print(f"\nFiles in data directory:")
    for file in files:
        print(f"  - {file.name}")

Base path exists: True
Factor file exists: True
Industry file exists: True

Files in data directory:
  - dataset.ipynb
  - data_loader.py
  - factors.csv
  - industry49.csv
  - __pycache__


In [3]:
# 데이터 로더 인스턴스 생성 및 기본 데이터 로드 테스트
try:
    loader = FactorDataLoader()
    
    # Factor 데이터 로드
    factor_data = loader.load_factor_data()
    print(f"Factor data shape: {factor_data.shape}")
    print(f"Factor data columns: {list(factor_data.columns)}")
    print(f"Factor data date range: {factor_data.index.min()} to {factor_data.index.max()}")
    print(f"Factor data sample:")
    print(factor_data.head())
    
except Exception as e:
    print(f"Error loading factor data: {e}")
    print(f"Error type: {type(e)}")

Error loading factor data: No factor files found in c:\Users\Kong\code\study_ccpo\data\factors.csv
Error type: <class 'FileNotFoundError'>


In [4]:
# 수정된 로더로 다시 테스트
# 모듈 리로드
import importlib
import sys
sys.modules.pop('data.data_loader', None)
from data.data_loader import FactorDataLoader

try:
    loader = FactorDataLoader()
    
    # Factor 데이터 로드
    factor_data = loader.load_factor_data()
    print(f"Factor data shape: {factor_data.shape}")
    print(f"Factor data columns: {list(factor_data.columns)}")
    print(f"Factor data date range: {factor_data.index.min()} to {factor_data.index.max()}")
    print(f"\nFirst few rows:")
    print(factor_data.head())
    
except Exception as e:
    print(f"Error loading factor data: {e}")
    print(f"Error type: {type(e)}")

Loaded factor data: (5033, 9)
Factor data shape: (5033, 9)
Factor data columns: ['Date', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF', 'Mom', 'ST_Rev']
Factor data date range: 0 to 5032

First few rows:
         Date  Mkt-RF   SMB   HML   RMW   CMA    RF   Mom  ST_Rev
0  2005-01-03   -0.97 -0.63 -0.05  0.35 -0.01  0.01 -0.62    0.42
1  2005-01-04   -1.30 -0.51  0.44  0.83 -0.48  0.01  0.24    0.15
2  2005-01-05   -0.51 -1.14  0.02  0.04 -0.13  0.01  0.00   -0.14
3  2005-01-06    0.34 -0.03  0.13  0.52 -0.12  0.01  0.03   -0.31
4  2005-01-07   -0.22 -0.84 -0.10 -0.15 -0.02  0.01 -0.17   -0.01


In [5]:
# Industry 데이터도 로드해보고 전체 파이프라인 테스트
try:
    # Industry 데이터 로드
    industry_data = loader.load_industry_data()
    print(f"Industry data shape: {industry_data.shape}")
    print(f"Industry data columns: {list(industry_data.columns)[:10]}...")  # 처음 10개만
    print(f"Industry data date range: {industry_data.index.min()} to {industry_data.index.max()}")
    print(f"\nFirst few rows:")
    print(industry_data.head())
    
except Exception as e:
    print(f"Error loading industry data: {e}")
    print(f"Error type: {type(e)}")

Loaded industry data: (5033, 50)
Industry data shape: (5033, 50)
Industry data columns: ['Date', 'Agric', 'Food', 'Soda', 'Beer', 'Smoke', 'Toys', 'Fun', 'Books', 'Hshld']...
Industry data date range: 0 to 5032

First few rows:
         Date  Agric  Food  Soda  Beer  Smoke  Toys   Fun  Books  Hshld  ...  \
0  2005-01-03  -0.53 -1.12 -0.04 -0.32  -0.73 -2.73 -0.72  -1.13  -0.55  ...   
1  2005-01-04  -1.87 -0.32 -0.96 -1.01   0.56 -0.76 -1.38  -0.38  -1.12  ...   
2  2005-01-05  -1.08 -0.35 -1.07 -0.69  -0.18 -1.40 -0.44  -0.50  -0.02  ...   
3  2005-01-06  -0.06  0.22  0.83  0.27  -0.58  0.44  0.92   0.70   0.70  ...   
4  2005-01-07  -0.54  0.33 -0.39  0.27   1.34 -1.43 -0.60  -0.16   0.70  ...   

   Boxes  Trans  Whlsl  Rtail  Meals  Banks  Insur  RlEst   Fin  Other  
0  -0.84  -1.21  -1.57   0.08  -0.94  -0.53  -0.55  -0.80 -0.60  -0.26  
1  -1.89  -2.17  -1.84  -1.13  -0.46  -1.04  -0.45  -1.93 -1.50  -1.34  
2  -2.07  -0.63  -0.89   0.00  -0.96  -0.37   0.19  -2.75 -0.54  -0.55  

In [6]:
# 전처리 및 주간 데이터 생성 테스트
try:
    print("=== 전처리 테스트 ===")
    factor_processed, industry_processed = loader.preprocess_data()
    
    print(f"Processed factor data shape: {factor_processed.shape}")
    print(f"Processed industry data shape: {industry_processed.shape}")
    print(f"Factor data index type: {type(factor_processed.index)}")
    print(f"Industry data index type: {type(industry_processed.index)}")
    
    print(f"\nFactor data sample after preprocessing:")
    print(factor_processed.head())
    
except Exception as e:
    print(f"Error in preprocessing: {e}")
    import traceback
    traceback.print_exc()

=== 전처리 테스트 ===


  self.load_factor_data()
  if self.industry_data is None:


Error in preprocessing: could not convert string to float: '2005-01-03'


Traceback (most recent call last):
  File "C:\Users\Kong\AppData\Local\Temp\ipykernel_28080\2315521886.py", line 4, in <module>
    factor_processed, industry_processed = loader.preprocess_data()
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 92, in preprocess_data
    self.factor_data[factor_cols] = scaler.fit_transform(self.factor_data[factor_cols])
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, **fit_params).transform(X)
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **k

In [7]:
# 수정된 로더로 다시 테스트
import importlib
import sys
sys.modules.pop('data.data_loader', None)
from data.data_loader import FactorDataLoader

try:
    print("=== 새로운 로더로 테스트 ===")
    loader = FactorDataLoader()
    
    # 전처리
    factor_processed, industry_processed = loader.preprocess_data()
    
    print(f"Processed factor data shape: {factor_processed.shape}")
    print(f"Processed industry data shape: {industry_processed.shape}")
    print(f"Factor data index: {factor_processed.index[:5]}")
    print(f"Industry data index: {industry_processed.index[:5]}")
    
    print(f"\nFactor data sample:")
    print(factor_processed.head())
    
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

=== 새로운 로더로 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Processed factor data shape: (5033, 8)
Processed industry data shape: (5033, 49)
Factor data index: DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07'],
              dtype='datetime64[ns]', name='Date', freq=None)
Industry data index: DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07'],
              dtype='datetime64[ns]', name='Date', freq=None)

Factor data sample:
              Mkt-RF       SMB       HML       RMW       CMA        RF  \
Date                                                                     
2005-01-03 -0.827257 -0.984733 -0.057949  0.740493 -0.026012  0.477285   
2005-01-04 -1.097280 -0.796925  0.553803  1.803303 -1.227799  0.477285   
2005-01-05 -0.450861 -1.782914  0.029444  0.054094 -0.332851  0.477285   
2005-01-06  0.244653 -0.045696  0.166776  1.116905 -0.307281  0.477285   
2005-

In [8]:
# 주간 데이터 생성 테스트
try:
    print("=== 주간 데이터 생성 테스트 ===")
    
    # 주간 데이터 생성 (기간 시작일 팩터 사용)
    X_weekly, y_weekly = loader.create_frequency_data(frequency='weekly', factor_timing='start')
    
    print(f"Weekly X shape: {X_weekly.shape}")
    print(f"Weekly y shape: {y_weekly.shape}")
    print(f"X index range: {X_weekly.index.min()} to {X_weekly.index.max()}")
    print(f"y index range: {y_weekly.index.min()} to {y_weekly.index.max()}")
    
    print(f"\nWeekly X sample:")
    print(X_weekly.head())
    
    print(f"\nWeekly y sample:")
    print(y_weekly.head())
    
except Exception as e:
    print(f"Error creating weekly data: {e}")
    import traceback
    traceback.print_exc()

=== 주간 데이터 생성 테스트 ===
Error creating weekly data: "['week'] not found in axis"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_factors = factor_data.groupby('week').first()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else:  # factor_timing == 'end'
Traceback (most recent call last):
  File "C:\Users\Kong\AppData\Local\Temp\ipykernel_28080\36501718.py", line 6, in <module>
    X_weekly, y_weekly = loader.create_frequency_data(frequency='weekly', factor_timing='start')
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 125, in create_frequency_data
    return self._create_weekly_data(factor_data_common,

In [9]:
# 수정된 주간 데이터 생성 테스트
import importlib
import sys
sys.modules.pop('data.data_loader', None)
from data.data_loader import FactorDataLoader

try:
    print("=== 수정된 주간 데이터 생성 테스트 ===")
    loader = FactorDataLoader()
    
    # 주간 데이터 생성 (기간 시작일 팩터 사용)
    X_weekly, y_weekly = loader.create_frequency_data(frequency='weekly', factor_timing='start')
    
    print(f"Weekly X shape: {X_weekly.shape}")
    print(f"Weekly y shape: {y_weekly.shape}")
    print(f"X index range: {X_weekly.index.min()} to {X_weekly.index.max()}")
    print(f"y index range: {y_weekly.index.min()} to {y_weekly.index.max()}")
    
    print(f"\nWeekly X sample (first 3 rows):")
    print(X_weekly.head(3))
    
    print(f"\nWeekly y sample (first 3 rows, first 5 columns):")
    print(y_weekly.iloc[:3, :5])
    
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

=== 수정된 주간 데이터 생성 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Created weekly data - Factors: (1044, 8), Returns: (1044, 49)
Weekly X shape: (1044, 8)
Weekly y shape: (1044, 49)
X index range: 2005-01-04/2005-01-10 to 2024-12-31/2025-01-06
y index range: 2005-01-04/2005-01-10 to 2024-12-31/2025-01-06

Weekly X sample (first 3 rows):
                         Mkt-RF       SMB       HML       RMW       CMA  \
week                                                                      
2005-01-04/2005-01-10 -1.097280 -0.796925  0.553803  1.803303 -1.227799   
2005-01-11/2005-01-17 -0.589964 -0.452612  0.466410  1.914013 -0.435131   
2005-01-18/2005-01-24  0.760152  0.298618 -0.282674 -0.211608 -0.307281   

                             RF       Mom    ST_Rev  
week                                                 
2005-01-04/2005-01-10  0.477285  0.219417  0.131214  
2005-01-11/2005-01-17  0.477285  0.285704 -0.110648  
2005-01-18/2005-01-24  0.477285  0.011089  0.12

In [10]:
# 전체 파이프라인 테스트 - DataLoader 생성
try:
    print("=== 전체 파이프라인 테스트 ===")
    
    # 날짜 기반 분할로 DataLoader 생성
    train_loader, val_loader, test_loader = loader.create_dataloaders(
        frequency='weekly',
        factor_timing='start',  # 주 시작일 팩터 사용
        train_end_date='2020-12-31',  # 2021년 이전까지 훈련
        val_end_date='2021-12-31',    # 2022년 이전까지 검증
        batch_size=32
    )
    
    print(f"Created DataLoaders successfully!")
    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")
    
    # 데이터 확인
    for batch_X, batch_y in train_loader:
        print(f"\nBatch X shape: {batch_X.shape}")
        print(f"Batch y shape: {batch_y.shape}")
        print(f"X sample: {batch_X[0]}")
        print(f"y sample: {batch_y[0, :5]}")  # 첫 5개 산업만
        break
        
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

=== 전체 파이프라인 테스트 ===
Created weekly data - Factors: (1044, 8), Returns: (1044, 49)
Error: Invalid comparison between dtype=period[W-MON] and Timestamp


Traceback (most recent call last):
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\pandas\core\arrays\datetimelike.py", line 1006, in _cmp_method
    other = self._validate_comparison_value(other)
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\pandas\core\arrays\datetimelike.py", line 565, in _validate_comparison_value
    raise InvalidComparison(other)
pandas.errors.InvalidComparison: 2020-12-31 00:00:00

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Kong\AppData\Local\Temp\ipykernel_28080\4195899470.py", line 6, in <module>
    train_loader, val_loader, test_loader = loader.create_dataloaders(
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 330, in create_dataloaders
    X_train, X_val, X_test, y_train, y_val, y_test = self.train_test_split(
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 263, in train_test_split
    X_train = X

In [11]:
# 최종 전체 파이프라인 테스트
import importlib
import sys
sys.modules.pop('data.data_loader', None)
from data.data_loader import FactorDataLoader

try:
    print("=== 최종 전체 파이프라인 테스트 ===")
    loader = FactorDataLoader()
    
    # 날짜 기반 분할로 DataLoader 생성
    train_loader, val_loader, test_loader = loader.create_dataloaders(
        frequency='weekly',
        factor_timing='start',  # 주 시작일 팩터 사용
        train_end_date='2020-12-31',  # 2021년 이전까지 훈련
        val_end_date='2021-12-31',    # 2022년 이전까지 검증
        batch_size=32
    )
    
    print(f"✅ DataLoaders 생성 성공!")
    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")
    
    # 데이터 확인
    for batch_X, batch_y in train_loader:
        print(f"\n✅ 데이터 배치 확인:")
        print(f"Batch X shape: {batch_X.shape}")
        print(f"Batch y shape: {batch_y.shape}")
        print(f"X data type: {batch_X.dtype}")
        print(f"y data type: {batch_y.dtype}")
        break
    
    print(f"\n✅ 모든 테스트 통과!")
        
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

=== 최종 전체 파이프라인 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Created weekly data - Factors: (1044, 8), Returns: (1044, 49)
❌ Error: Invalid comparison between dtype=datetime64[ns] and NoneType


Traceback (most recent call last):
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\pandas\core\arrays\datetimelike.py", line 1006, in _cmp_method
    other = self._validate_comparison_value(other)
  File "c:\Users\Kong\anaconda3\envs\ccpo\lib\site-packages\pandas\core\arrays\datetimelike.py", line 565, in _validate_comparison_value
    raise InvalidComparison(other)
pandas.errors.InvalidComparison: None

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Kong\AppData\Local\Temp\ipykernel_28080\2016116540.py", line 12, in <module>
    train_loader, val_loader, test_loader = loader.create_dataloaders(
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 338, in create_dataloaders
    X_train, X_val, X_test, y_train, y_val, y_test = self.train_test_split(
  File "c:\Users\Kong\code\study_ccpo\examples\..\data\data_loader.py", line 282, in train_test_split
    X_test = X[(X.index > val

In [12]:
# 최종 수정된 전체 파이프라인 테스트
import importlib
import sys
sys.modules.pop('data.data_loader', None)
from data.data_loader import FactorDataLoader

try:
    print("=== 최종 수정된 전체 파이프라인 테스트 ===")
    loader = FactorDataLoader()
    
    # 날짜 기반 분할로 DataLoader 생성
    train_loader, val_loader, test_loader = loader.create_dataloaders(
        frequency='weekly',
        factor_timing='start',  # 주 시작일 팩터 사용
        train_end_date='2020-12-31',  # 2021년 이전까지 훈련
        val_end_date='2021-12-31',    # 2022년 이전까지 검증
        batch_size=32
    )
    
    print(f"✅ DataLoaders 생성 성공!")
    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")
    
    # 각 데이터셋의 실제 데이터 개수 확인
    total_train = len(train_loader.dataset)
    total_val = len(val_loader.dataset)
    total_test = len(test_loader.dataset)
    
    print(f"\n📊 데이터 개수:")
    print(f"Train samples: {total_train}")
    print(f"Val samples: {total_val}")
    print(f"Test samples: {total_test}")
    print(f"Total: {total_train + total_val + total_test}")
    
    # 데이터 확인
    for batch_X, batch_y in train_loader:
        print(f"\n✅ 데이터 배치 확인:")
        print(f"Batch X shape: {batch_X.shape}")
        print(f"Batch y shape: {batch_y.shape}")
        print(f"X data type: {batch_X.dtype}")
        print(f"y data type: {batch_y.dtype}")
        break
    
    print(f"\n🎉 모든 테스트 통과! 로더가 완벽하게 동작합니다!")
        
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

=== 최종 수정된 전체 파이프라인 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Created weekly data - Factors: (1044, 8), Returns: (1044, 49)
Data split - Train: 835 (2005-01-04 00:00:00 ~ 2020-12-29 00:00:00)
           - Val: 52 (2021-01-05 00:00:00 ~ 2021-12-28 00:00:00)
           - Test: 157 (2022-01-04 00:00:00 ~ 2024-12-31 00:00:00)
✅ DataLoaders 생성 성공!
Train batches: 27
Val batches: 2
Test batches: 5

📊 데이터 개수:
Train samples: 835
Val samples: 52
Test samples: 157
Total: 1044

✅ 데이터 배치 확인:
Batch X shape: torch.Size([32, 8])
Batch y shape: torch.Size([32, 49])
X data type: torch.float32
y data type: torch.float32

🎉 모든 테스트 통과! 로더가 완벽하게 동작합니다!


In [13]:
# Daily와 Monthly 옵션 테스트
try:
    print("=== Daily 데이터 테스트 ===")
    daily_loader = FactorDataLoader()
    
    # Daily 데이터로 테스트
    X_daily, y_daily = daily_loader.create_frequency_data(frequency='daily', factor_timing='start')
    print(f"Daily X shape: {X_daily.shape}, y shape: {y_daily.shape}")
    
    print("\n=== Monthly 데이터 테스트 ===")
    monthly_loader = FactorDataLoader()
    
    # Monthly 데이터로 테스트  
    X_monthly, y_monthly = monthly_loader.create_frequency_data(frequency='monthly', factor_timing='start')
    print(f"Monthly X shape: {X_monthly.shape}, y shape: {y_monthly.shape}")
    
    print("\n✅ 모든 빈도 옵션 (daily, weekly, monthly) 정상 동작!")
    
    # Factor timing 옵션 테스트
    print("\n=== Factor timing 'end' 옵션 테스트 ===")
    X_end, y_end = loader.create_frequency_data(frequency='weekly', factor_timing='end')
    print(f"Weekly (end timing) X shape: {X_end.shape}, y shape: {y_end.shape}")
    
    print("\n🎯 모든 옵션 테스트 완료!")
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

=== Daily 데이터 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Created daily data - Factors: (5027, 8), Returns: (5027, 49)
Daily X shape: (5027, 8), y shape: (5027, 49)

=== Monthly 데이터 테스트 ===
Loaded factor data: (5033, 8)
Loaded industry data: (5033, 49)
Created monthly data - Factors: (239, 8), Returns: (239, 49)
Monthly X shape: (239, 8), y shape: (239, 49)

✅ 모든 빈도 옵션 (daily, weekly, monthly) 정상 동작!

=== Factor timing 'end' 옵션 테스트 ===
Created weekly data - Factors: (1044, 8), Returns: (1044, 49)
Weekly (end timing) X shape: (1044, 8), y shape: (1044, 49)

🎯 모든 옵션 테스트 완료!
