In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [30]:
data_file_path = './test_period_top15_11Jun24_1546.csv'

In [31]:
df = pd.read_csv(data_file_path)
df.head()

  df = pd.read_csv(data_file_path)


Unnamed: 0,PERMNO,date,NAMEENDT,SHRCD,EXCHCD,SICCD,NCUSIP,TICKER,COMNAM,SHRCLS,...,CFACPR,CFACSHR,OPENPRC,NUMTRD,RETX,vwretd,vwretx,ewretd,ewretx,sprtrn
0,76932,2016-01-04,,12,1,4612,29250N10,ENB,ENBRIDGE INC,,...,1.0,1.0,32.62,,-0.008737,-0.01489,-0.015053,-0.012346,-0.012431,-0.015304
1,76932,2016-01-05,,12,1,4612,29250N10,ENB,ENBRIDGE INC,,...,1.0,1.0,32.74,,-0.000304,0.001397,0.001394,0.000358,0.000335,0.002012
2,76932,2016-01-06,,12,1,4612,29250N10,ENB,ENBRIDGE INC,,...,1.0,1.0,32.08,,-0.037093,-0.013799,-0.014067,-0.013461,-0.013512,-0.013115
3,76932,2016-01-07,,12,1,4612,29250N10,ENB,ENBRIDGE INC,,...,1.0,1.0,31.06,,-0.021156,-0.023909,-0.023933,-0.024817,-0.024835,-0.0237
4,76932,2016-01-08,,12,1,4612,29250N10,ENB,ENBRIDGE INC,,...,1.0,1.0,31.05,,0.022903,-0.010935,-0.01094,-0.01028,-0.010301,-0.010838


In [32]:
## Check the trade status: Expect Active
df['TRDSTAT'].value_counts()

TRDSTAT
A    50173
Name: count, dtype: int64

In [33]:
## Check number of unique PERMCO
len(df['PERMCO'].unique())

25

In [34]:
## Check number of unique CUSIP
len(df['CUSIP'].unique())

26

In [35]:
## Identify duplicated PERMCO
tmp = df[['PERMCO', 'CUSIP']].drop_duplicates().groupby('PERMCO').count().reset_index()
tmp[tmp['CUSIP'] == 2]

Unnamed: 0,PERMCO,CUSIP
1,11937,2


In [36]:
## There are Share class A and B in the data, therefore class A will be select due to its higher market cap
tmp = df[df['PERMCO'] == 47003][['CUSIP', 'TICKER', 'SHRCLS', 'PRC', 'SHROUT']].drop_duplicates()
tmp['market_cap'] = tmp['PRC'] * tmp['SHROUT']
tmp.groupby('SHRCLS')['market_cap'].mean()

Series([], Name: market_cap, dtype: float64)

In [37]:
## Filter out Class B of PERMCO=47003
df = df[(df['PERMCO']!=47003) | 
        ((df['PERMCO']==47003) & (df['SHRCLS']=='A'))]

In [38]:
## Filter out negative prices
df = df[df['PRC'] >= 0]

In [39]:
## Clean data before calculation
df = df.dropna(subset=['TICKER', 'RET'])
df = df[df['RET'] != 'C']
df['RET'] = df['RET'].astype(float)

In [40]:
df.columns

Index(['PERMNO', 'date', 'NAMEENDT', 'SHRCD', 'EXCHCD', 'SICCD', 'NCUSIP',
       'TICKER', 'COMNAM', 'SHRCLS', 'TSYMBOL', 'NAICS', 'PRIMEXCH', 'TRDSTAT',
       'SECSTAT', 'PERMCO', 'ISSUNO', 'HEXCD', 'HSICCD', 'CUSIP', 'DCLRDT',
       'DLAMT', 'DLPDT', 'DLSTCD', 'NEXTDT', 'PAYDT', 'RCRDDT', 'SHRFLG',
       'HSICMG', 'HSICIG', 'DISTCD', 'DIVAMT', 'FACPR', 'FACSHR', 'ACPERM',
       'ACCOMP', 'SHRENDDT', 'NWPERM', 'DLRETX', 'DLPRC', 'DLRET', 'TRTSCD',
       'NMSIND', 'MMCNT', 'NSDINX', 'BIDLO', 'ASKHI', 'PRC', 'VOL', 'RET',
       'BID', 'ASK', 'SHROUT', 'CFACPR', 'CFACSHR', 'OPENPRC', 'NUMTRD',
       'RETX', 'vwretd', 'vwretx', 'ewretd', 'ewretx', 'sprtrn'],
      dtype='object')

# Calculate Adjusted Price

In [41]:
df['adjusted_prc'] = df['PRC'] / df['CFACPR']

# Calculate Weekly Return

In [42]:
## calulate the following friday
df['date'] = pd.to_datetime(df['date'])
df['fri_of_week'] = df['date'].dt.to_period('W-FRI').apply(lambda r: r.end_time).dt.date

## Before doing this step, first manually ensure the first monday does in in the prev year
df = df[(df['fri_of_week'].astype(str)>='2016-01-05') 
            & (df['fri_of_week'].astype(str)<='2023-12-31')]

In [44]:
## Calculate weekly return
df3 = df.copy()
df3['log_RET_plus1'] = np.log(df3['RET'] + 1)
df4 = df3.groupby(['PERMCO','fri_of_week']).agg({'log_RET_plus1': 'sum', 
                                                                     'TICKER': lambda x: list(set(x))[0], 
                                                                     'COMNAM': lambda x: list(set(x))[0]})
df4 = df4.reset_index().rename(columns={'log_RET_plus1': 'sum_log_RET_plus1'})
df4['weekly_return'] = (np.exp(df4['sum_log_RET_plus1']) - 1) * 100
df4.head()

Unnamed: 0,PERMCO,fri_of_week,sum_log_RET_plus1,TICKER,COMNAM,weekly_return
0,2381,2016-01-08,-0.045616,ENB,ENBRIDGE INC,-4.459144
1,2381,2016-01-15,-0.045155,ENB,ENBRIDGE INC,-4.415097
2,2381,2016-01-22,0.098574,ENB,ENBRIDGE INC,10.359645
3,2381,2016-01-29,0.033804,ENB,ENBRIDGE INC,3.438143
4,2381,2016-02-05,-0.020734,ENB,ENBRIDGE INC,-2.052012


In [45]:
df4.shape

(10393, 6)

In [46]:
## Check how many weeks each stock has
df4.groupby('PERMCO')['fri_of_week'].count().reset_index().groupby('fri_of_week').count().reset_index()

Unnamed: 0,fri_of_week,PERMCO
0,385,1
1,417,24


In [47]:
## Check how many weeks each stock has
df4.groupby('PERMCO')['fri_of_week'].nunique().reset_index().groupby('fri_of_week').count().reset_index()

Unnamed: 0,fri_of_week,PERMCO
0,385,1
1,417,24


In [48]:
df4_non_missing = df4.copy()

In [49]:
## Check the oldest and newest friday of week in the data
print(df4_non_missing['fri_of_week'].min(), df4_non_missing['fri_of_week'].max())

2016-01-08 2023-12-29


In [50]:
## Drop duplicated (PERMCO, fri-of-week) in which keeping the first row
df4_non_missing = df4_non_missing.drop_duplicates(subset=['PERMCO', 'fri_of_week'])

In [51]:
## Add a column of the return sign
df4_non_missing['weekly_return_sign'] = np.sign(df4_non_missing['weekly_return'])
df4_non_missing['weekly_return_sign'] = df4_non_missing['weekly_return_sign'].apply(lambda x: 'pos' if x > 0 else 'neg')
df4_non_missing.head()

Unnamed: 0,PERMCO,fri_of_week,sum_log_RET_plus1,TICKER,COMNAM,weekly_return,weekly_return_sign
0,2381,2016-01-08,-0.045616,ENB,ENBRIDGE INC,-4.459144,neg
1,2381,2016-01-15,-0.045155,ENB,ENBRIDGE INC,-4.415097,neg
2,2381,2016-01-22,0.098574,ENB,ENBRIDGE INC,10.359645,pos
3,2381,2016-01-29,0.033804,ENB,ENBRIDGE INC,3.438143,pos
4,2381,2016-02-05,-0.020734,ENB,ENBRIDGE INC,-2.052012,neg


In [52]:
## Add a column of the return sign
df4_non_missing['weekly_return_sign'] = np.sign(df4_non_missing['weekly_return'])
df4_non_missing['weekly_return_sign'] = df4_non_missing['weekly_return_sign'].apply(lambda x: 1 if x > 0 else 0)
df4_non_missing.head()

Unnamed: 0,PERMCO,fri_of_week,sum_log_RET_plus1,TICKER,COMNAM,weekly_return,weekly_return_sign
0,2381,2016-01-08,-0.045616,ENB,ENBRIDGE INC,-4.459144,0
1,2381,2016-01-15,-0.045155,ENB,ENBRIDGE INC,-4.415097,0
2,2381,2016-01-22,0.098574,ENB,ENBRIDGE INC,10.359645,1
3,2381,2016-01-29,0.033804,ENB,ENBRIDGE INC,3.438143,1
4,2381,2016-02-05,-0.020734,ENB,ENBRIDGE INC,-2.052012,0


In [53]:
df4_non_missing.groupby('PERMCO')['TICKER'].count().reset_index().groupby('TICKER').count()

Unnamed: 0_level_0,PERMCO
TICKER,Unnamed: 1_level_1
385,1
417,24


In [54]:
## Sort to ensure proper order
df4_non_missing.sort_values(by=['PERMCO', 'fri_of_week'], inplace=True)

## Calculate the moving return for each ticker
df4_non_missing['previous_weekly_return'] = df4_non_missing.groupby(['PERMCO'])['weekly_return'].shift(1)
df4_non_missing['moving_return'] = df4_non_missing['weekly_return'] - df4_non_missing['previous_weekly_return']

## Add a column of the moving return direction
df4_non_missing['moving_return_dir'] = np.sign(df4_non_missing['moving_return'])
df4_non_missing['moving_return_dir'] = df4_non_missing['moving_return_dir'].apply(lambda x: 1 if x > 0 else 0)

In [55]:
df4_non_missing.groupby('PERMCO')['TICKER'].count().reset_index().groupby('TICKER').count()

Unnamed: 0_level_0,PERMCO
TICKER,Unnamed: 1_level_1
385,1
417,24


In [57]:
## Calculate adjusted price at the end of the week
df5 = df.copy()

## Select eow row
df5 = df5.sort_values(by=['PERMCO', 'date'], ascending=True).groupby(['PERMCO', 'fri_of_week']).last().reset_index()

df5_non_missing = df5.copy()
df5_non_missing = df5_non_missing[['PERMCO', 'fri_of_week', 'adjusted_prc']]
df5_non_missing = df5_non_missing.rename(columns={'adjusted_prc': 'eow_adjusted_prc'})

In [58]:
df5_non_missing.groupby('PERMCO')['fri_of_week'].count().reset_index().groupby('fri_of_week').count()

Unnamed: 0_level_0,PERMCO
fri_of_week,Unnamed: 1_level_1
385,1
417,24


In [59]:
df6 = pd.merge(df4_non_missing, df5_non_missing, on=['PERMCO', 'fri_of_week'], how='inner')

In [60]:
## Sort to ensure proper order
df6.sort_values(by=['PERMCO', 'fri_of_week'], inplace=True)

## Calculate the moving return for each ticker
df6['previous_eom_adjusted_prc'] = df6.groupby(['PERMCO'])['eow_adjusted_prc'].shift(1)
df6['moving_prc'] = df6['eow_adjusted_prc'] - df6['previous_eom_adjusted_prc']

## Add a column of the moving return direction
df6['moving_prc_dir'] = np.sign(df6['moving_prc'])
df6['moving_prc_dir'] = df6['moving_prc_dir'].apply(lambda x: 1 if x > 0 else 0)

In [61]:
df6.groupby('PERMCO')['TICKER'].count().reset_index().groupby('TICKER').count()

Unnamed: 0_level_0,PERMCO
TICKER,Unnamed: 1_level_1
385,1
417,24


In [62]:
## Remove data of missing return or moving prc
df7 = df6[~df6['moving_return'].isna() & ~df6['moving_prc'].isna()]

In [63]:
## Check number of unique PERMCO
df7['PERMCO'].nunique()

25

In [64]:
df7.head()

Unnamed: 0,PERMCO,fri_of_week,sum_log_RET_plus1,TICKER,COMNAM,weekly_return,weekly_return_sign,previous_weekly_return,moving_return,moving_return_dir,eow_adjusted_prc,previous_eom_adjusted_prc,moving_prc,moving_prc_dir
1,2381,2016-01-15,-0.045155,ENB,ENBRIDGE INC,-4.415097,0,-4.459144,0.044046,1,30.31,31.71,-1.4,0
2,2381,2016-01-22,0.098574,ENB,ENBRIDGE INC,10.359645,1,-4.415097,14.774742,1,33.45,30.31,3.14,1
3,2381,2016-01-29,0.033804,ENB,ENBRIDGE INC,3.438143,1,10.359645,-6.921502,0,34.6,33.45,1.15,1
4,2381,2016-02-05,-0.020734,ENB,ENBRIDGE INC,-2.052012,0,3.438143,-5.490155,0,33.89,34.6,-0.71,0
5,2381,2016-02-12,-0.048848,ENB,ENBRIDGE INC,-4.767402,0,-2.052012,-2.71539,0,31.88,33.89,-2.01,0


In [65]:
df7.groupby('PERMCO')['TICKER'].count().reset_index().groupby('TICKER').count()

Unnamed: 0_level_0,PERMCO
TICKER,Unnamed: 1_level_1
384,1
416,24


In [66]:
## Save preprocessed data to csbv
df7.to_csv('./test_period_v6_preprocessed_top25_14Jun24.csv', index=False)

In [67]:
## Check available columns for future use
df7.columns

Index(['PERMCO', 'fri_of_week', 'sum_log_RET_plus1', 'TICKER', 'COMNAM',
       'weekly_return', 'weekly_return_sign', 'previous_weekly_return',
       'moving_return', 'moving_return_dir', 'eow_adjusted_prc',
       'previous_eom_adjusted_prc', 'moving_prc', 'moving_prc_dir'],
      dtype='object')

In [68]:
## Check the number of weeks of each stock
df7.groupby('PERMCO')['fri_of_week'].count().reset_index().groupby('fri_of_week').count()

Unnamed: 0_level_0,PERMCO
fri_of_week,Unnamed: 1_level_1
384,1
416,24


In [69]:
## Check the info of each stock
df7[['TICKER', 'COMNAM', 'PERMCO']].drop_duplicates().sort_values(by=['PERMCO'])

Unnamed: 0,TICKER,COMNAM,PERMCO
1,ENB,ENBRIDGE INC,2381
418,CHK,CHESAPEAKE ENERGY CORP,11937
803,HES,HESS CORP,20064
1220,BHI,BAKER HUGHES INC,20253
1297,BHI,BAKER HUGHES INC NEW,20253
1298,BHGE,BAKER HUGHES INC NEW,20253
1416,BKR,BAKER HUGHES INC NEW,20253
1417,BKR,BAKER HUGHES CO,20253
1637,BP,B P PLC,20333
2054,CVX,CHEVRON CORP NEW,20440
