In [1]:
import pandas as pd
import numpy as np
from functions import *

%load_ext autoreload
%autoreload 2

In [2]:
# Trade data: two header rows form the column MultiIndex, first column is the row index.
trade = pd.read_csv('data/trade.csv', index_col=0, header=[0, 1])

# The remaining tables are read with pandas' default CSV settings.
stocks, dk_industry, financials, beta = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/stocks.csv',
        'data/dk_industry.csv',
        'data/financials.csv',
        'data/beta.csv',
    )
)

# Optional quick look at the raw inputs:
# display(trade.head())
# display(stocks.head())
# display(dk_industry.head())

In [3]:
# Timestamp offsets, expressed in months (both are passed to pd.DateOffset(months=...)).
quarterly_offset = 1 # extra months added on top of the fixed 2-month shift applied to the industry timestamps (total shift = 2 + quarterly_offset)
annual_offset = 6 # months added to the financials timestamps before they are merged onto the trade data

In [4]:
# The row index holds the dates; parse it to datetime before reshaping.
trade.index = pd.to_datetime(trade.index)

# NOTE(review): the cell output suggests pandas emits a warning for this
# `stack` call — confirm whether `future_stack=True` suits the installed version.
trade = (
    trade
      # move the outer column level into the rows -> long format
      .stack(level=0)
      .reset_index()
      # give the stacked level a proper column name
      .rename(columns={'level_1': 'ticker'})
      # cleaning: exact duplicates, then rows with any missing value
      # (e.g. a ticker whose first day has trade values but no ask or bid)
      .drop_duplicates()
      .dropna()
      # deterministic ordering and a fresh 0..n-1 index
      .sort_values(['ticker', 'timestamp'])
      .reset_index(drop=True)
)

# display(trade)


  trade = trade.stack(level=0).reset_index()


In [5]:
# Attach share count and NACE code to every trade row; the left join keeps all trade rows.
stock_attributes = stocks[['ticker', 'shares', 'NACE']]
df = trade.merge(stock_attributes, on='ticker', how='left')
# display(df)

In [6]:
dk_industry['timestamp'] = pd.to_datetime(dk_industry['timestamp'])

industry = (
    # map every ticker to the industry rows that share its NACE code
    stocks[['ticker', 'NACE']]
      .merge(dk_industry, on='NACE', how='left')
      # shift the timestamp forward by (2 + quarterly_offset) months, then
      # normalise it through a monthly period round-trip
      .assign(
        timestamp=lambda ind: (ind['timestamp'] + pd.DateOffset(months=2+quarterly_offset))
                                .dt.to_period('M')
                                .dt.to_timestamp('M')
      )
      # NACE was only needed as the join key
      .drop(columns=['NACE'])
)

# display(industry)

In [7]:
df = df.merge(industry, on=['timestamp', 'ticker'], how='left')

# Forward-fill the industry values within each ticker.
# The first two columns of `industry` are skipped; everything after them is filled.
industry_value_cols = list(industry.columns[2:])
df[industry_value_cols] = df.groupby('ticker')[industry_value_cols].ffill()

# display(df)


In [8]:
financials = (
    financials
      # parse the dates and keep an untouched copy of the original date as `timestamp1`
      .assign(
        timestamp  = lambda fin: pd.to_datetime(fin['timestamp']),
        timestamp1 = lambda fin: fin['timestamp'],
      )
      # shift forward by `annual_offset` months and normalise through a
      # monthly period round-trip
      .assign(
        timestamp = lambda fin: (fin['timestamp'] + pd.DateOffset(months=annual_offset))
                                  .dt.to_period('M')
                                  .dt.to_timestamp('M')
      )
      # order the rows so that shift(-1) yields the ticker's next timestamp
      .sort_values(['ticker', 'timestamp'])
      .assign(
        next_fye   = lambda fin: fin.groupby('ticker')['timestamp'].shift(-1),
        plus_12m   = lambda fin: fin['timestamp'] + pd.DateOffset(months=12),
        # period_end is the ticker's next timestamp when that is closer than
        # 12 months away, otherwise timestamp + 12 months (also used when
        # there is no next row, because the comparison with NaT is False)
        period_end = lambda fin: pd.to_datetime(np.where(
                          (fin['next_fye'] - fin['timestamp']).abs()
                            <
                          (fin['plus_12m'] - fin['timestamp']).abs(),
                          fin['next_fye'],
                          fin['plus_12m']
                        ))
      )
      # expand_monthly comes from the local `functions` module; presumably it
      # returns the list of monthly timestamps covered by each row — TODO confirm
      .assign(timestamp = lambda fin: fin.apply(expand_monthly, axis=1))
      # one output row per element returned by expand_monthly
      .explode('timestamp')
      .drop(columns=['next_fye', 'plus_12m', 'period_end'])
      .reset_index(drop=True)
)

# display(financials)

In [9]:
# Attach the expanded financials to each (timestamp, ticker) row; the left join keeps rows without a match.
df = df.merge(financials, on=['timestamp', 'ticker'], how='left')

# display(df)

### Create variables

In [10]:
# Target: the return of the NEXT observation of the same ticker.
# The look-ahead shift is done inside each ticker group, so the last row of one
# ticker can never pick up a value belonging to another ticker. A plain
# `df['target'].shift(-1)` is only correct while the rows happen to be
# contiguous per ticker and sorted by date.
df['target'] = df.groupby('ticker')['adjclose'].transform(
    lambda prices: prices.pct_change(periods=1).shift(-1)
)

# momentum: trailing return over 1, 3, 6 and 12 rows, per ticker
momentum_periods = {'mom1m': 1, 'mom3m': 3, 'mom6m': 6, 'mom12m': 12}

# compute percentage change over each period for each ticker separately.
# (the lambda is executed immediately by transform, so capturing `period` is safe)
for feature_name, period in momentum_periods.items():
    df[feature_name] = df.groupby('ticker')['adjclose'].transform(lambda x: x.pct_change(periods=period))

display(df)

Unnamed: 0,timestamp,ticker,ask,bid,adjclose,high,low,open,volume,turnover,...,opinc,cashflow,debt,currentassets,timestamp1,target,mom1m,mom3m,mom6m,mom12m
0,2001-01-31,AAB.CO,4183.493029,4026.612041,4026.612041,4183.493029,2614.683143,2771.564132,6.915369e+02,2.255122e+06,...,,,,,NaT,-0.064935,,,,
1,2001-02-28,AAB.CO,3765.143726,3660.556401,3765.143726,4235.786692,3399.088086,4131.199367,5.061034e+02,2.047656e+06,...,,,,,NaT,-0.069444,-0.064935,,,
2,2001-03-31,AAB.CO,3660.556401,3529.822244,3503.675412,3765.143726,3451.381749,3765.143726,1.172800e+02,4.385660e+05,...,,,,,NaT,-0.089552,-0.069444,,,
3,2001-04-30,AAB.CO,3346.794423,3189.913435,3189.913435,4183.493029,3189.913435,4183.493029,1.263059e+02,4.467180e+05,...,,,,,NaT,-0.098361,-0.089552,-0.207792,,
4,2001-05-31,AAB.CO,3137.619772,3006.885615,2876.151458,3399.088086,2876.151458,3137.619772,1.653355e+02,5.048560e+05,...,,,,,NaT,0.018182,-0.098361,-0.236111,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49761,2024-11-30,ZELA.CO,730.000000,729.000000,730.000000,893.000000,672.500000,789.500000,4.650485e+06,3.587543e+09,...,-707826000.0,-678878000.0,119230000.0,1.779788e+09,2023-12-31,-0.019863,-0.076534,-0.176537,0.141517,1.172619
49762,2024-12-31,ZELA.CO,716.500000,715.500000,715.500000,819.000000,587.000000,728.000000,4.138727e+06,2.995576e+09,...,-707826000.0,-678878000.0,119230000.0,1.779788e+09,2023-12-31,0.027952,-0.019863,-0.119926,-0.198319,0.917203
49763,2025-01-31,ZELA.CO,738.000000,737.000000,735.500000,808.000000,682.500000,718.500000,3.538390e+06,2.572094e+09,...,-707826000.0,-678878000.0,119230000.0,1.779788e+09,2023-12-31,-0.098572,0.027952,-0.069576,-0.209140,0.557603
49764,2025-02-28,ZELA.CO,665.000000,664.000000,663.000000,788.000000,657.000000,714.500000,3.876841e+06,2.801496e+09,...,-707826000.0,-678878000.0,119230000.0,1.779788e+09,2023-12-31,-0.187783,-0.098572,-0.091781,-0.252115,-0.001506


In [11]:
# Keep only rows where both the 12-row momentum and the target are available.
df = df.dropna(subset=['mom12m', 'target'])

# Rows that still contain at least one NaN, reduced to the identifying and
# financial columns, de-duplicated and ordered by ticker / original report date.
nan_report_cols = ['ticker', 'timestamp', 'timestamp1', 'netinc', 'assets', 'revenue', 'rnd', 'cash',
                   'grossprofit', 'opinc', 'cashflow', 'debt', 'currentassets']
has_any_nan = df.isna().any(axis=1)
nan_tickers = (
    df.loc[has_any_nan, nan_report_cols]
      .drop_duplicates()
      .sort_values(['ticker', 'timestamp1'])
      .reset_index(drop=True)
)
# nan_tickers['year'] = nan_tickers['timestamp1'].dt.year
display(nan_tickers)
display(nan_tickers[nan_tickers['ticker'] == 'MOLS.CO^J16'])

# count by ticker (non-null entries per column)
nan_tickers_count = nan_tickers.groupby('ticker').count().reset_index()
print(nan_tickers_count)

Unnamed: 0,ticker,timestamp,timestamp1,netinc,assets,revenue,rnd,cash,grossprofit,opinc,cashflow,debt,currentassets
0,ALBCb.CO^F02,2002-03-31,NaT,,,,,,,,,,
1,ALBCb.CO^F02,2002-04-30,NaT,,,,,,,,,,
2,ESI.CO^I02,2002-06-30,NaT,,,,,,,,,,
3,ESI.CO^I02,2002-07-31,NaT,,,,,,,,,,
4,FPLIM.CO^C14,2007-09-30,2006-06-30,118344000.0,952790000.0,139347000.0,0.0,,139347000.0,120718000.0,8385000.0,3233000.0,9583000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,UPAL.CO^H10,2006-06-30,NaT,,,,,,,,,,
186,UPAL.CO^H10,2006-07-31,NaT,,,,,,,,,,
187,UPAL.CO^H10,2006-08-31,NaT,,,,,,,,,,
188,VEND.CO^C02,2002-01-31,NaT,,,,,,,,,,


Unnamed: 0,ticker,timestamp,timestamp1,netinc,assets,revenue,rnd,cash,grossprofit,opinc,cashflow,debt,currentassets


           ticker  timestamp  timestamp1  netinc  assets  revenue  rnd  cash  \
0    ALBCb.CO^F02          2           0       0       0        0    0     0   
1      ESI.CO^I02          2           0       0       0        0    0     0   
2    FPLIM.CO^C14          3           3       3       3        3    3     0   
3     FUEI.CO^E13          1           0       0       0        0    0     0   
4    FUEII.CO^E13          1           1       0       0        1    1     0   
5    FUNKI.CO^J04          9           9       9       9        9    9     0   
6           GJ.CO          9           0       0       0        0    0     0   
7    IDATA.CO^H02          1           0       0       0        0    0     0   
8     IFAC.CO^D03         15           0       0       0        0    0     0   
9    IPFCa.CO^G02          5           0       0       0        0    0     0   
10   IPFCb.CO^G02          5           0       0       0        0    0     0   
11     KAP.CO^D10          4           0

In [12]:
# Keep only rows where both the 12-row momentum and the target are available.
df = df.dropna(subset=['mom12m', 'target'])

# Same NaN report as before but without the monthly `timestamp`, so each
# (ticker, original report date) combination appears once.
rows_with_nan = df.loc[
    df.isna().any(axis=1),
    ['ticker', 'timestamp1', 'netinc', 'assets', 'revenue', 'rnd', 'cash',
     'grossprofit', 'opinc', 'cashflow', 'debt', 'currentassets'],
]
nan_tickers = (
    rows_with_nan
      .drop_duplicates()
      .sort_values(['ticker', 'timestamp1'])
      .reset_index(drop=True)
)
# nan_tickers['year'] = nan_tickers['timestamp1'].dt.year
display(nan_tickers)

Unnamed: 0,ticker,timestamp1,netinc,assets,revenue,rnd,cash,grossprofit,opinc,cashflow,debt,currentassets
0,ALBCb.CO^F02,NaT,,,,,,,,,,
1,ESI.CO^I02,NaT,,,,,,,,,,
2,FPLIM.CO^C14,2006-06-30,118344000.0,952790000.0,139347000.0,0.0,,139347000.0,120718000.0,8385000.0,3233000.0,9583000.0
3,FUEI.CO^E13,NaT,,,,,,,,,,
4,FUEII.CO^E13,2007-12-31,,,0.0,0.0,,0.0,0.0,,,0.0
5,FUNKI.CO^J04,2001-04-30,-16111000.0,156747000.0,175359000.0,0.0,,84135000.0,-20627000.0,-6114000.0,70304000.0,84775000.0
6,GJ.CO,NaT,,,,,,,,,,
7,IDATA.CO^H02,NaT,,,,,,,,,,
8,IFAC.CO^D03,NaT,,,,,,,,,,
9,IPFCa.CO^G02,NaT,,,,,,,,,,


In [13]:
# display(financials[(financials['ticker'] == 'INVb.CO^F05') & (financials['timestamp1'].dt.year == 2002)])
# display(financials[(financials['ticker'] == 'INVb.CO^F05') & (financials['timestamp1'].dt.year == 2017)])
# display(financials[(financials['ticker'] == 'INVb.CO^F05')].head(60))