In [11]:
import pandas as pd
import numpy as np


In [12]:
df = pd.read_csv('../../data/processed/data.csv')

max number of rows that can occur: 99

In [13]:
num_unique_tickers = df['ticker'].nunique()
print(f"Number of unique tickers: {num_unique_tickers}")

print('\n')

unique_tickers = df['ticker'].unique()
print(unique_tickers)

Number of unique tickers: 303


['11B' '3RG' 'ABE' 'ABS' 'ACG' 'ACP' 'ACT' 'ADV' 'AGO' 'AGT' 'ALG' 'ALL'
 'AMB' 'AMC' 'AML' 'ANR' 'APE' 'APL' 'APN' 'APR' 'APT' 'ARR' 'ART' 'ASB'
 'ASE' 'ASM' 'ATC' 'ATD' 'ATG' 'ATP' 'ATR' 'ATT' 'AWM' 'B24' 'BAH' 'BBT'
 'BCM' 'BCS' 'BCX' 'BDX' 'BDZ' 'BFT' 'BIO' 'BIP' 'BMC' 'BMX' 'BOW' 'BRA'
 'BRG' 'BRS' 'CAR' 'CCC' 'CDL' 'CDR' 'CEZ' 'CFI' 'CIE' 'CIG' 'CLN' 'CMP'
 'CMR' 'CNT' 'COG' 'CPL' 'CPR' 'CPS' 'CRI' 'CRJ' 'CRM' 'CTS' 'CTX' 'DAD'
 'DAT' 'DBC' 'DCR' 'DEK' 'DEL' 'DIG' 'DNP' 'DOM' 'DPL' 'DTR' 'EAH' 'EAT'
 'EDI' 'EEX' 'EFK' 'ELT' 'ELZ' 'EMC' 'ENA' 'ENE' 'ENG' 'ENI' 'ENP' 'ENT'
 'ERB' 'ERG' 'ETL' 'EUR' 'FEE' 'FER' 'FRO' 'FSG' 'FTE' 'GIF' 'GIG' 'GOB'
 'GOP' 'GPP' 'GRN' 'HDR' 'HEL' 'HLD' 'HRP' 'HRS' 'HUG' 'ICE' 'IFI' 'IFR'
 'IMC' 'IMS' 'INK' 'INL' 'IPO' 'IRL' 'ITB' 'IZB' 'IZO' 'IZS' 'JSW' 'JWW'
 'KCH' 'KCI' 'KDM' 'KER' 'KGH' 'KGL' 'KGN' 'KMP' 'KOM' 'KPD' 'KPL' 'KRK'
 'KTY' 'KVT' 'LAB' 'LBT' 'LBW' 'LEN' 'LES' 'LPP' 'LRK' 'LRQ' 'LSI' 'LTX'
 'LWB' 'MAB' 'MAK' 

In [14]:
ticker_counts = df['ticker'].value_counts()

print("\nSummary statistics:")
print(f"Maximum entries for a ticker: {ticker_counts.max()}")
print(f"Minimum entries for a ticker: {ticker_counts.min()}")
print(f"Average entries per ticker: {ticker_counts.mean():.2f}")
print(f"Median entries per ticker: {ticker_counts.median():.2f}")


Summary statistics:
Maximum entries for a ticker: 99
Minimum entries for a ticker: 1
Average entries per ticker: 51.14
Median entries per ticker: 50.00


In [15]:
# Drop tickers that are insufficient

tickers_to_drop = ['AGT', 'ANR', 'ASB', 'BBT', 'BCS', 'BCX', 'CRI', 'CRJ', 'CTS', 'CTX', 'DAD',
                  'DNP', 'GIF', 'GOP', 'GPP', 'HLD', 'HUG', 'ICE', 'IMC', 'KDM', 'KER', 'MLK',
                  'MLS', 'MOC', 'NNG', 'NTU', 'OND', 'PCF', 'PTG', 'PUR', 'SFG', 'SHO', 'SIM',
                  'SLV', 'SLZ', 'SPH', 'SPR', 'STH', 'SVRS', 'TEN', 'TMR', 'TXM', 'VRC']


df = df[~df['ticker'].isin(tickers_to_drop)]


df['end_of_period'] = df['end_of_period'].astype('datetime64[ns]')
cutoff_date = pd.to_datetime('2022-07-01')
df = df[df['end_of_period'] <= cutoff_date]

In [16]:
def fill_missing_quarters(df):
   new_rows = []

   for ticker in df['ticker'].unique():
       ticker_data = df[df['ticker'] == ticker]

       ticker_data = ticker_data.sort_values('end_of_period')

       first_date = ticker_data['end_of_period'].min()
       last_date = ticker_data['end_of_period'].max()

       day_of_month = ticker_data['end_of_period'].dt.day.mode()[0]

       all_quarters = []
       current_date = first_date

       while current_date <= last_date:
           all_quarters.append(current_date)
           year = current_date.year + (current_date.month + 3) // 12
           month = (current_date.month + 3 - 1) % 12 + 1
           current_date = pd.Timestamp(year=year, month=month, day=day_of_month)

       existing_dates = set(ticker_data['end_of_period'])
       all_quarters_set = set(all_quarters)
       missing_dates = all_quarters_set - existing_dates

       for missing_date in missing_dates:
           new_row = {'ticker': ticker, 'end_of_period': missing_date}
           new_rows.append(new_row)

   if new_rows:
       missing_df = pd.DataFrame(new_rows)

       for col in df.columns:
           if col not in ['ticker', 'end_of_period']:
               missing_df[col] = np.nan

       result_df = pd.concat([df, missing_df], ignore_index=True)
       result_df = result_df.sort_values(['ticker', 'end_of_period'])

       return result_df

   return df


df = fill_missing_quarters(df)

In [17]:
df_stooq = pd.read_csv('../../data/processed/stooq_data.csv')

In [18]:
df_stooq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 11 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   TICKER   1594620 non-null  object 
 1   PER      1594620 non-null  object 
 2   DATE     1594620 non-null  object 
 3   TIME     1594620 non-null  int64  
 4   OPEN     1594620 non-null  float64
 5   HIGH     1594620 non-null  float64
 6   LOW      1594620 non-null  float64
 7   CLOSE    1594620 non-null  float64
 8   VOL      1594620 non-null  float64
 9   OPENINT  1594620 non-null  int64  
 10  target   1594620 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 133.8+ MB


In [19]:
import pandas as pd


def update_null_targets(df, df_stooq, tolerance_days=7):
    result_df = df.copy()

    if result_df['end_of_period'].dtype != 'datetime64[ns]':
        result_df['end_of_period'] = pd.to_datetime(result_df['end_of_period'])

    df_stooq_prep = df_stooq.copy()
    df_stooq_prep['DATE'] = pd.to_datetime(df_stooq_prep['DATE'])
    df_stooq_prep = df_stooq_prep.rename(columns={
        'TICKER': 'ticker',
        'DATE': 'end_of_period',
    })

    null_target_rows = result_df[result_df['target'].isna()].copy()

    if len(null_target_rows) == 0:
        return result_df

    merged_groups = []

    for ticker, group in null_target_rows.groupby('ticker'):
        group = group.sort_values('end_of_period')

        stooq_group = df_stooq_prep[df_stooq_prep['ticker'] == ticker].sort_values('end_of_period')

        if stooq_group.empty:
            merged_groups.append(group)
            continue

        stooq_group = stooq_group[['ticker', 'end_of_period', 'target']]

        try:
            merged = pd.merge_asof(
                group,
                stooq_group,
                on='end_of_period',
                by='ticker',
                direction='nearest',
                tolerance=pd.Timedelta(days=tolerance_days),
                suffixes=('', '_stooq')
            )

            if 'target_stooq' in merged.columns:
                merged['target'] = merged['target_stooq'].combine_first(merged['target'])
                merged = merged.drop('target_stooq', axis=1)

            merged_groups.append(merged)

        except Exception as e:
            merged_groups.append(group)

    updated_rows = pd.concat(merged_groups, ignore_index=True) if merged_groups else pd.DataFrame()

    if not updated_rows.empty:
        non_null_rows = result_df[~result_df['target'].isna()].copy()
        result_df = pd.concat([non_null_rows, updated_rows], ignore_index=True)
        result_df = result_df.sort_values(['ticker', 'end_of_period'])

    return result_df

In [20]:
df = update_null_targets(df, df_stooq, tolerance_days=7)

In [21]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor


datetime_cols = ['end_of_period']
object_cols = ['file_name', 'ticker', 'sector']
numeric_cols = [col for col in df.columns if col not in datetime_cols + object_cols]
numeric_data = df[numeric_cols].copy()

# Initialize the MICE imputer
# Using RandomForestRegressor as the estimator often gives good results
mice_imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=2
)

# Fit and transform the data
imputed_numeric_data = mice_imputer.fit_transform(numeric_data)
imputed_df = pd.DataFrame(imputed_numeric_data, columns=numeric_cols)

# Add back the non-numeric columns
for col in datetime_cols + object_cols:
    imputed_df[col] = df[col].values

# Verify the imputation results
print("Missing values before imputation:")
print(df[numeric_cols].isna().sum())

print("\nMissing values after imputation:")
print(imputed_df[numeric_cols].isna().sum())

[IterativeImputer] Completing matrix with shape (15528, 17)
[IterativeImputer] Ending imputation round 1/10, elapsed time 497.93
[IterativeImputer] Change: 4750006.83743915, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 2/10, elapsed time 1040.11
[IterativeImputer] Change: 2148887.1051999996, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 3/10, elapsed time 1509.08
[IterativeImputer] Change: 1685603.5993400002, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 4/10, elapsed time 1982.00
[IterativeImputer] Change: 1638688.275, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 5/10, elapsed time 2482.08
[IterativeImputer] Change: 1926642.8484300002, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 6/10, elapsed time 3233.37
[IterativeImputer] Change: 1428875.15543, scaled tolerance: 1396695.0 
[IterativeImputer] Ending imputation round 7/10, elapsed time 3928.32
[Iterativ



Missing values before imputation:
total_assets                              412
non_current_assets                        412
current_assets                            412
property_plant_equipment                  412
intangible_assets                         412
inventories                               412
trade_receivables                         412
cash_and_cash_equivalents                 412
equity_shareholders_of_the_parent         412
share_capital                             412
retained_earning_accumulated_losses       412
non_current_liabilities                   412
current_liabilities                       412
non_current_loans_and_borrowings          412
financial_liabilities_loans_borrowings    412
total_shares                              412
target                                     29
dtype: int64

Missing values after imputation:
total_assets                              0
non_current_assets                        0
current_assets                            0
prope

In [22]:
imputed_df.to_csv('../../data/filled_data.csv', index=False)