In [471]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
import math
import csv
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
os.chdir('/zfs/projects/darc/wolee_edehaan_suzienoh-exploratory-ml')

In [3]:
# Declare global variables
global con_list
global dum_list
global deps
global header

In [4]:
# List of continuous variables
con_list = ['absacc', 'acc', 'aeavol', 'age', 'agr', 'baspread', 'beta', 
            'betasq', 'bm', 'bm_ia', 'cash', 'cashdebt', 'cashpr','cfp', 
            'cfp_ia', 'chatoia', 'chcsho', 'chempia', 'chfeps', 'chinv', 
            'chmom', 'chnanalyst', 'chpmia', 'chtx', 'cinvest', 'currat', 
            'depr', 'disp', 'dolvol', 'dy', 'ear', 'egr', 'ep', 'fgr5yr', 
            'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'idiovol', 'ill', 
            'indmom', 'invest', 'lev', 'lgr', 'maxret', 'mom12m', 'mom1m', 
            'mom36m', 'mom6m', 'ms', 'mve', 'mve_ia', 'nanalyst', 'nincr', 
            'operprof', 'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 
            'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt', 'pchsale_pchrect', 
            'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'pricedelay', 'ps', 
            'quick', 'rd_mve', 'rd_sale', 'realestate', 'retvol', 'roaq', 
            'roavol', 'roeq', 'roic', 'rsup', 'salecash', 'saleinv', 
            'salerec', 'secured', 'sfe', 'sgr', 'sp', 'std_dolvol', 
            'std_turn', 'stdacc', 'stdcf', 'sue', 'tang', 'tb', 'turn', 
            'zerotrade']


# List of dummy variables
dum_list = ['convind', 'divi', 'divo', 'ipo', 'rd', 'securedind', 'sin'] # Categorical variable binary 

# List of dependent variable
deps = con_list + dum_list +['date']

# Headers
header = ['permno','pyear']

In [5]:
def load_and_preprocess_data(file_path, period):
    
    """
    Loads and preprocesses the input data.

    Args:
    file_path (str): The path to the CSV file to be loaded.

    Returns:
    DataFrame: Preprocessed pandas DataFrame.
    """
    
    # Load data
    df = pd.read_csv(file_path)
    df.columns = [e.lower() for e in df.columns]
    
    df['date'] = df['date'].copy()
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
    # df['date'] = df['date'].dt.strftime('%m-%d-%Y')

    # Extract year
    df['pyear'] = df['date'].dt.year
    # Remove months if quarterly, otherwise, monthly, keep all months
    if period == 'quarter':
        df = df[df['date'].dt.month.isin([1,4,7,10])]

    # df.sort_values(['permno','date'], inplace=True)
    df.sort_values(['date', 'permno'], inplace=True)
    df['date'] = df['date'].dt.strftime('%Y-%m')
    del df['fpedats']
    
    print(df[['date', 'permno']].head())
    print('-' * 50)
        
    return df

In [6]:
class CustomWinsorizer(BaseEstimator, TransformerMixin):
    
    """
    A custom transformer for Winsorizing numeric data.

    Attributes:
    lower_percentile (int): The lower percentile for clipping data.
    upper_percentile (int): The upper percentile for clipping data.
    """
    
    def __init__(self, lower_percentile, upper_percentile):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        self.lower_bound_ = np.percentile(X, self.lower_percentile)
        self.upper_bound_ = np.percentile(X, self.upper_percentile)
        return self

    def transform(self, X):
        X_clipped = np.clip(X, self.lower_bound_, self.upper_bound_)
        
        return X_clipped

In [7]:
class timePeriodMeanTransformer(BaseEstimator, TransformerMixin):
    
    """
    A custom transformer for imputing missing data based on time period means.

    Attributes:
    date_column (str): The column name representing dates.
    numeric_columns (list): List of numeric column names for which means are calculated.
    period (str): The time period for grouping data, either 'quarter' or 'month'.
    """
    
    def __init__(self, date_column, numeric_columns, period='quarter'):
        self.date_column = date_column
        self.numeric_columns = numeric_columns
        self.period = period

    def fit(self, X, y=None):
        X[self.date_column] = pd.to_datetime(X[self.date_column])
        if self.period == 'quarter':
            X['Period'] = X[self.date_column].dt.quarter
        elif self.period == 'month':
            X['Period'] = X[self.date_column].dt.month
        else:
            raise ValueError("period must be 'quarter' or 'month'")
       
       # Calculate and store the means of each numeric column for each time period
        self.period_means_ = X.groupby('Period')[self.numeric_columns].mean()
        return self

    def transform(self, X):
        X[self.date_column] = pd.to_datetime(X[self.date_column])
        if self.period == 'quarter':
            X['Period'] = X[self.date_column].dt.quarter
        elif self.period == 'month':
            X['Period'] = X[self.date_column].dt.month
        
        for col in self.numeric_columns:
            X[col] = X.apply(lambda row: row[col] if not pd.isna(row[col]) 
                             else self.period_means_.loc[row['Period'], col], axis=1)
        # return X.drop(['Period'], axis=1)
        return X

In [238]:
def build_pipeline(con_list, dum_list, lower_percentile, upper_percentile, period):
    
    """
    Builds a preprocessing pipeline for both numeric and categorical data.

    Args:
    con_list (list): List of continuous variable names.
    dum_list (list): List of dummy (categorical) variable names.
    lower_percentile (float): Lower percentile for winsorization.
    upper_percentile (float): Upper percentile for winsorization.
    period (string): Period for getting mean values (month vs quarter)

    Returns:
    Pipeline: A composed preprocessing pipeline.
    """
    
    numeric_pipeline = Pipeline([
        # ('fill_na', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
        ('winsorizer', CustomWinsorizer(lower_percentile=lower_percentile, upper_percentile=upper_percentile)),
        ('scaler', StandardScaler()),
        ('impute_con', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0))
    ])

    categorical_pipeline = Pipeline([
        ('impute_cat', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ])

    preprocessing = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, con_list),
            ('cat', categorical_pipeline, dum_list)
        ], remainder='passthrough')

    pipeline = Pipeline([
        ('Time_period_mean_imputation', timePeriodMeanTransformer('date', con_list, period)),
        ('Preprocessing', preprocessing),
    ])
    
    return pipeline

In [119]:
infile_path = 'Info Processing and Mutual Funds/masterv14.csv'
period = 'month'

In [120]:
if period == 'quarter':
    target = 'retq'
elif period == 'month':
    target = 'ret'
else:
    raise ValueError("period must be 'quarter' or 'month'")

In [121]:
# Load and preprocess data
print('\nLoading and preprocessing data...\n')
df = load_and_preprocess_data(infile_path, period)


Loading and preprocessing data...

      date  permno
0  1980-01   10006
1  1980-01   10057
2  1980-01   10103
3  1980-01   10137
4  1980-01   10145
--------------------------------------------------


In [122]:
# Drop null values in the target column and get years 2020 or prior
df1 = df.dropna(subset=[target])
df1 = df1[df1['pyear'] <= 2020]
df1.reset_index(drop=True, inplace=True)

In [123]:
df1

Unnamed: 0,permno,gvkey,adatadate,fyear,sic2,spi,mve_f,bm,ep,cashpr,...,std_dolvol,std_turn,ill,zerotrade,beta,betasq,rsq1,pricedelay,idiovol,pyear
0,10006,1010,12/31/1978,1978,37,0.0000,269.308500,1.180962,0.153022,-32.218678,...,0.881844,0.635898,2.565667e-08,1.115306e-07,1.060420,1.124491,0.343408,0.029859,0.025576,1980
1,10057,1098,09/30/1978,1978,36,0.0000,97.372000,0.956692,0.135131,-4.408581,...,1.368363,2.546787,2.719812e-07,6.199128e-08,1.526013,2.328716,0.307905,0.092667,0.037473,1980
2,10103,1012,10/31/1978,1978,33,,1.697500,3.362003,0.338144,-17.143817,...,,,,,1.759493,3.095816,0.096753,0.221851,0.087020,1980
3,10137,1279,12/31/1978,1978,49,,537.524500,1.330341,0.153238,-87.819837,...,0.553246,0.740017,1.765620e-08,9.726790e-08,0.492885,0.242936,0.189693,0.125777,0.017540,1980
4,10145,1300,12/31/1978,1978,99,-0.0031,805.633282,1.579284,0.149248,-22.050470,...,0.427617,0.657563,2.898901e-09,6.190654e-08,1.139163,1.297691,0.279437,0.024228,0.031201,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2165510,93422,154357,12/31/2019,2019,13,-0.0090,1069.650000,2.487356,-0.090964,-14.117559,...,0.797001,12.233361,7.505129e-09,5.571619e-09,2.691027,7.241625,0.265207,0.257939,0.132692,2020
2165511,93423,10567,12/31/2019,2019,79,0.0004,3817.839740,-0.187572,0.046902,19.464647,...,0.519033,17.649093,4.462048e-10,3.803709e-09,1.921529,3.692274,0.485215,0.068369,0.061119,2020
2165512,93426,185138,12/31/2019,2019,36,-0.0108,459.782000,0.524944,0.048258,1.095352,...,0.473367,2.144264,2.296462e-08,3.236729e-08,1.302016,1.695247,0.472220,0.037482,0.043174,2020
2165513,93434,184259,06/30/2019,2019,1,-0.1349,87.853920,1.138777,-0.105914,-13.505851,...,0.935967,0.897075,3.435272e-07,1.037670e-07,0.389842,0.151977,0.021429,-0.694649,0.073887,2020


In [239]:
print('Training in progress...\n')
# Build a training pipeline
pipeline = build_pipeline(con_list, dum_list, 5, 95, period)

Training in progress...



In [240]:
pipeline

In [149]:
sample = df1.loc[df1['permno']==10103]

In [150]:
sample[['permno', 'date']]

Unnamed: 0,permno,date
2,10103,1980-01
3462,10103,1980-02
6932,10103,1980-03
10441,10103,1980-04
13941,10103,1980-05
...,...,...
479711,10103,1989-07
484311,10103,1989-08
488942,10103,1989-09
493659,10103,1989-10


In [151]:
# Set year range of the sample
years = list(sample['pyear'].drop_duplicates().sort_values())

In [152]:
years

[1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989]

In [153]:
year = 1986

In [154]:
train_data = sample.loc[(sample['pyear']<=year)]
test_data = sample.loc[(sample['pyear']==year+1)]

In [155]:
# Training and testing data
training_years = sorted(x_train.pyear.unique())
print(f'Training Years: {training_years}\n')
print(f'Testing Year: {test_data.pyear.unique()}')

Training Years: [1980, 1981, 1982, 1983, 1984, 1985, 1986]

Testing Year: [1987]


In [512]:
class SequenceDataset(Dataset):
    def __init__(self, train_data, test_data, features, target, pipeline, sequence_length=5):
        self.train_data = train_data
        self.test_data = test_data
        self.features = features
        self.target = target
        self.pipeline = pipeline
        self.sequence_length = sequence_length
        self.x_train, self.y_train, self.x_test, self.y_test = self._transform_data()
        
    def _transform_data(self):
        """
        Transform the data using provided pipeline
        """
        x_train = self.train_data.loc[:, self.features]
        y_train = self.train_data.loc[:, self.target]
        
        # Fit the pipeline to the training data
        self.pipeline.fit(x_train)
        x_train = self.pipeline.transform(x_train)
        x_train = x_train[:, :-2]
        
        x_test = self.test_data.loc[:, self.features] 
        y_test = self.test_data.loc[:, self.target]
        
        # Fit the pipeline to the testing data  
        x_test = self.pipeline.transform(x_test)
        x_test = x_test[:, :-2]
        
        # Transform data into numpy array as type float32
        x_train = x_train.astype(np.float32)
        y_train = y_train.to_numpy(np.float32)
        x_test = x_test.astype(np.float32)
        y_test = y_test.to_numpy(np.float32)
        
        # # Transform them to tensor floats
        x_train = torch.tensor(x_train).float()
        y_train = torch.tensor(y_train).float()
        x_test = torch.tensor(x_test).float()
        y_test = torch.tensor(y_test).float()
        
        print(f'x_train shape: {x_train.shape}')
        print(f'y_train shape: {y_train.shape}\n')
        print(f'x_test shape: {x_test.shape}')
        print(f'y_test shape: {y_test.shape}\n')
        
        return x_train, y_train, x_test, y_test
        
    def __len__(self):
        return max(self.x_train.shape[0], self.x_test.shape[0])

    def __getitem__(self, i):
        """
        Return the corresponding ith data for all x_train, y_train, x_test, y_test
        """
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x_train = self.x_train[i_start:(i + 1), :]
            x_test = self.x_test[i_start:(i+1), :]
        else:
            # repeat the first row as many times to make up the gap
            train_padding = self.x_train[0].repeat(self.sequence_length - i - 1, 1)
            test_padding = self.x_test[0].repeat(self.sequence_length - i - 1, 1)
            # concatenate the padding with the available rows
            x_train = self.x_train[0:(i + 1), :]
            x_train = torch.cat((train_padding, x_train), 0)
            x_test = self.x_test[0:(i+1), :]
            x_test = torch.cat((test_padding, x_test), 0)
            
        # Since train and test have different length, return None when index i exceeds max length
        if i >= self.y_train.shape[0]:
            x_train = None
            y_train = None
            pass
        else:
            y_train = self.y_train[i]
            print(f'x_train ith item shape: {x_train.shape}')
            print(f'y_train: {y_train}')
            
        if i >= self.y_test.shape[0]:
            x_test = None
            y_test = None
            pass
        else:
            y_test = self.y_test[i]
            print(f'x_test ith item shape: {x_test.shape}')
            print(f'y_test: {y_test}')

        return x_train, y_train, x_test, y_test

In [517]:
dataset = SequenceDataset(train_data, test_data, deps, target, pipeline, 3)

x_train shape: torch.Size([84, 102])
y_train shape: torch.Size([84])

x_test shape: torch.Size([12, 102])
y_test shape: torch.Size([12])



  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [518]:
dataset[11]

x_train ith item shape: torch.Size([3, 102])
y_train: 0.0
x_test ith item shape: torch.Size([3, 102])
y_test: 0.03333299979567528


(tensor([[ 0.7499,  0.6793,  0.0000, -1.2978, -0.8048,  0.7191,  1.2253,  1.3455,
           1.3428,  1.1902,  0.0000, -1.4420, -1.4628, -1.1093,  0.3620,  0.0111,
           1.0113, -2.2560,  0.0000,  0.0262, -0.4906,  0.0000, -0.7407,  0.0000,
           0.0000, -0.3796,  1.4237,  0.0000, -0.5749,  0.0000,  0.0000, -1.4784,
          -2.2394,  0.0000, -0.3324, -0.0843, -1.6917,  1.1095, -2.0936,  0.6739,
          -0.4314,  0.2874, -1.9973,  0.8968, -0.1511, -0.8526, -1.1743, -1.6113,
           0.1533, -0.7896,  0.0000, -2.1048,  0.7597,  0.0000,  0.0000, -0.8128,
           1.0980, -1.3131,  1.5445,  2.1129, -0.4312,  1.3736, -0.1862, -1.6223,
          -0.7674, -0.3479,  0.1157, -0.8245, -0.7717, -0.7289, -0.1765, -0.1333,
          -0.5189,  0.6398,  0.0000,  0.0000,  0.0000, -0.4560,  0.0000,  1.3541,
          -1.2003, -1.1821,  0.0848,  0.0000, -0.4491,  1.1873, -0.0420, -0.4636,
           0.0000,  0.0000,  0.0000, -1.7919,  0.8007,  0.2662,  0.6818,  0.0000,
           0.000

In [519]:
data_loader = DataLoader(dataset, batch_size=13)

In [520]:
next(iter(data_loader))

x_train ith item shape: torch.Size([3, 102])
y_train: 0.08333300054073334
x_test ith item shape: torch.Size([3, 102])
y_test: 0.06666699796915054
x_train ith item shape: torch.Size([3, 102])
y_train: -0.23076899349689484
x_test ith item shape: torch.Size([3, 102])
y_test: 0.0625
x_train ith item shape: torch.Size([3, 102])
y_train: 0.0
x_test ith item shape: torch.Size([3, 102])
y_test: 0.3823530077934265
x_train ith item shape: torch.Size([3, 102])
y_train: 0.10000000149011612
x_test ith item shape: torch.Size([3, 102])
y_test: -0.10638300329446793
x_train ith item shape: torch.Size([3, 102])
y_train: 0.0
x_test ith item shape: torch.Size([3, 102])
y_test: -0.04761900007724762
x_train ith item shape: torch.Size([3, 102])
y_train: -0.09090899676084518
x_test ith item shape: torch.Size([3, 102])
y_test: -0.02500000037252903
x_train ith item shape: torch.Size([3, 102])
y_train: -0.20000000298023224
x_test ith item shape: torch.Size([3, 102])
y_test: -0.025640999898314476
x_train ith item

TypeError: expected Tensor as element 12 in argument 0, but got NoneType

## QA

In [521]:
temp = SequenceDataset(train_data, test_data, deps, target, pipeline, 5)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


x_train shape: torch.Size([84, 102])
y_train shape: torch.Size([84])

x_test shape: torch.Size([12, 102])
y_test shape: torch.Size([12])



In [522]:
temp[5]

x_train ith item shape: torch.Size([5, 102])
y_train: -0.09090899676084518
x_test ith item shape: torch.Size([5, 102])
y_test: -0.02500000037252903


(tensor([[-0.1187, -0.0967,  0.0000, -1.7844,  0.1356,  1.3458,  2.2846,  3.2693,
           0.3142, -0.2635,  0.0000, -0.9551,  0.1306, -3.1040,  0.7440, -0.0222,
          -0.0753, -0.0151,  0.0000,  0.0097,  0.0219,  0.0000, -0.2875,  0.0000,
           0.0000, -2.4279, -2.1761,  0.0000, -0.7733,  0.0000,  0.0000, -0.0250,
           1.0126,  0.0000, -0.0252,  0.1686,  0.1891,  1.2705,  0.0123,  0.4018,
          -0.6628,  0.6060,  0.0885, -1.4486,  0.1700,  1.5489,  0.0231,  0.4091,
          -0.1774, -0.0247,  0.0000, -0.5897,  2.0314,  0.0000,  0.0000, -0.1237,
           0.1028,  0.1138, -0.1206, -0.0877, -0.1544, -0.0644, -0.0066, -0.0984,
          -0.0579,  0.0137, -0.0554, -0.2955, -0.1715, -2.1826,  0.3529,  0.2666,
           1.0377,  1.1406,  0.0000,  0.0000,  0.0000,  0.4648,  0.0000, -0.4367,
          -1.0676,  1.1820, -0.1696,  0.0000,  0.0319, -1.0462,  0.7483, -0.0803,
           0.0000,  0.0000,  0.0000, -0.8208,  0.2741, -0.4960,  1.3358,  0.0000,
           0.000