# DATA PREPARATION PART 1

In [31]:
import pandas as pd
import pyarrow as pa

## General Company Info Cleaning

In [32]:
# Load the general company info table from the processed-data folder.
df_general = pd.read_csv('../../data/processed/general_company_info.csv')

In [33]:
# Raw CSV header -> snake_case column name used throughout this notebook.
rename = dict(
    Name="company_name",
    TICKER="ticker",
    Sector="sector",
    File_Name="file_name",
)
# Every renamed column is stored as an Arrow-backed string.
dtypes = {column: pd.ArrowDtype(pa.string()) for column in rename.values()}

# Rename first, then cast: the dtype mapping is keyed by the new names.
df_general = df_general.rename(columns=rename).astype(dtypes)

In [34]:
# Check dtypes and non-null counts after the cast.
df_general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   company_name  349 non-null    string[pyarrow]
 1   ticker        349 non-null    string[pyarrow]
 2   sector        349 non-null    string[pyarrow]
 3   file_name     349 non-null    string[pyarrow]
dtypes: string[pyarrow](4)
memory usage: 21.2 KB


In [35]:
# Preview the first three rows of the cleaned table.
df_general.head(3)

Unnamed: 0,company_name,ticker,sector,file_name
0,11 bit studios SA,11B,gry,11BIT.xlsx
1,3LP SA,3LP_IPO,handel,3LPSA.xlsx
2,3R Games SA,3RG,gry,3RGAMES.xlsx


## Stooq Data Cleaning

In [36]:
# Load the Stooq data from the processed-data folder.
df_market_value = pd.read_csv('../../data/processed/stooq_data.csv')

In [37]:
# Count missing values per column.
df_market_value.isnull().sum()

TICKER     0
PER        0
DATE       0
TIME       0
OPEN       0
HIGH       0
LOW        0
CLOSE      0
VOL        0
OPENINT    0
target     0
dtype: int64

In [38]:
# Inspect the raw dtypes and memory usage before any conversion.
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 11 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   TICKER   1594620 non-null  object 
 1   PER      1594620 non-null  object 
 2   DATE     1594620 non-null  object 
 3   TIME     1594620 non-null  int64  
 4   OPEN     1594620 non-null  float64
 5   HIGH     1594620 non-null  float64
 6   LOW      1594620 non-null  float64
 7   CLOSE    1594620 non-null  float64
 8   VOL      1594620 non-null  float64
 9   OPENINT  1594620 non-null  int64  
 10  target   1594620 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 133.8+ MB


In [39]:
# Raw Stooq header -> column name shared with the other tables.
rename = dict(TICKER="ticker", DATE="end_of_period")

In [40]:
# Apply the header mapping to the column axis.
df_market_value = df_market_value.rename(rename, axis="columns")

In [41]:
# Keep only the company key, the date and the target value; the other Stooq columns are discarded.
kept_columns = ['ticker', 'end_of_period', 'target']
df_market_value = df_market_value.loc[:, kept_columns]

In [42]:
# Target dtypes for the three retained market-data columns.
dtypes = dict(
    ticker=pd.ArrowDtype(pa.string()),
    end_of_period="datetime64[s]",
    target="float32[pyarrow]",
)

In [43]:
# Cast each column to the dtype listed in `dtypes`.
df_market_value = df_market_value.astype(dtypes)

In [44]:
# Confirm the new dtypes and the reduced memory usage.
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype          
---  ------         --------------    -----          
 0   ticker         1594620 non-null  string[pyarrow]
 1   end_of_period  1594620 non-null  datetime64[s]  
 2   target         1594620 non-null  float[pyarrow] 
dtypes: datetime64[s](1), float[pyarrow](1), string[pyarrow](1)
memory usage: 28.9 MB


In [45]:
# Preview the first three rows of the cleaned market data.
df_market_value.head(3)

Unnamed: 0,ticker,end_of_period,target
0,06N,1997-06-12,85.848999
1,06N,1997-06-13,90.764
2,06N,1997-06-16,86.667999


## Detailed Company Info Cleaning

In [46]:
# Load the detailed company info table from the processed-data folder.
df_detailed = pd.read_csv('../../data/processed/details_company_info.csv')

In [47]:
# Raw header -> notebook column name for the detailed company table.
rename = dict(
    date="end_of_period",
    filename="file_name",
    assets="total_assets",
)

# Target dtypes: one timestamp column, sixteen float32 Arrow columns,
# and an Arrow-backed string for the source file name.
dtypes = {
    "end_of_period": "datetime64[s]",
    **dict.fromkeys(
        [
            "total_assets",
            "non_current_assets",
            "current_assets",
            "property_plant_equipment",
            "intangible_assets",
            "inventories",
            "trade_receivables",
            "cash_and_cash_equivalents",
            "equity_shareholders_of_the_parent",
            "share_capital",
            "retained_earning_accumulated_losses",
            "non_current_liabilities",
            "current_liabilities",
            "non_current_loans_and_borrowings",
            "financial_liabilities_loans_borrowings",
            "total_shares",
        ],
        "float32[pyarrow]",
    ),
    "file_name": pd.ArrowDtype(pa.string()),
}

In [48]:
# Rename the raw columns, then cast them; the dtype mapping is keyed by the new names.
df_detailed = (
    df_detailed
    .rename(columns=rename)
    .astype(dtypes)
)

## Merging into one dataframe

In [49]:
# Inspect dtypes and non-null counts of the detailed table before merging.
df_detailed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19906 entries, 0 to 19905
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype          
---  ------                                  --------------  -----          
 0   end_of_period                           19906 non-null  datetime64[s]  
 1   total_assets                            18995 non-null  float[pyarrow] 
 2   non_current_assets                      18859 non-null  float[pyarrow] 
 3   current_assets                          18929 non-null  float[pyarrow] 
 4   property_plant_equipment                18215 non-null  float[pyarrow] 
 5   intangible_assets                       17381 non-null  float[pyarrow] 
 6   inventories                             16529 non-null  float[pyarrow] 
 7   trade_receivables                       18741 non-null  float[pyarrow] 
 8   cash_and_cash_equivalents               18696 non-null  float[pyarrow] 
 9   equity_shareholders_of_the_parent      

In [50]:
# Re-display the market-data schema for comparison before merging.
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype          
---  ------         --------------    -----          
 0   ticker         1594620 non-null  string[pyarrow]
 1   end_of_period  1594620 non-null  datetime64[s]  
 2   target         1594620 non-null  float[pyarrow] 
dtypes: datetime64[s](1), float[pyarrow](1), string[pyarrow](1)
memory usage: 28.9 MB


In [51]:
# Re-display the general-info schema for comparison before merging.
df_general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   company_name  349 non-null    string[pyarrow]
 1   ticker        349 non-null    string[pyarrow]
 2   sector        349 non-null    string[pyarrow]
 3   file_name     349 non-null    string[pyarrow]
dtypes: string[pyarrow](4)
memory usage: 21.2 KB


In [52]:
# Attach ticker and sector to every detailed row via the shared file name.
# Any ticker/sector columns left over from a previous run of this cell are
# dropped first; otherwise a re-run would produce ticker_x/ticker_y here and
# the groupby on 'ticker' below would fail with a KeyError.
df_detailed = (
    df_detailed
    .drop(columns=['ticker', 'sector'], errors='ignore')
    .merge(df_general[['file_name', 'ticker', 'sector']], how='left', on='file_name')
)

# Split the market data by ticker once, instead of scanning the whole frame
# with a boolean mask for every company inside the loop.
market_by_ticker = {
    market_ticker: market_rows
    for market_ticker, market_rows in df_market_value.groupby('ticker')
}
# Zero-row frame with the market-data columns, used for tickers without market rows.
no_market_rows = df_market_value.iloc[:0]

merged_groups = []

# merge_asof is applied per ticker; both sides must be sorted on the merge key.
for ticker, group in df_detailed.groupby('ticker'):
    group = group.sort_values('end_of_period')
    market_group = market_by_ticker.get(ticker, no_market_rows).sort_values('end_of_period')

    # Take the market row nearest in time to each reporting date, within ±7 days.
    # Both frames carry a 'ticker' column, so the result holds ticker_x (from the
    # detailed table) and ticker_y (from the market data).
    merged = pd.merge_asof(
        group,
        market_group,
        on='end_of_period',
        direction='nearest',
        tolerance=pd.Timedelta(days=7)
    )
    merged_groups.append(merged)

df_merged = pd.concat(merged_groups, ignore_index=True)

# Drop rows without a date, a matched target or any of the three asset totals;
# every remaining missing value is replaced by 0.
df_merged = df_merged.dropna(subset=['end_of_period', 'target', 'total_assets', 'current_assets', 'non_current_assets'])
df_merged = df_merged.fillna(0)

# Work on an independent copy so df_merged stays untouched by later cells.
df = df_merged.copy()

In [53]:
# Rows and columns remaining after the merge and the dropna filter.
df.shape

(15495, 22)

In [54]:
# Round the numeric columns to two decimal places.
df = df.round(2)

In [55]:
# Preview the first twenty rows of the merged frame.
df.head(20)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,non_current_liabilities,current_liabilities,non_current_loans_and_borrowings,financial_liabilities_loans_borrowings,total_shares,file_name,ticker_x,sector,ticker_y,target
3,2011-01-01,1342.0,86.0,1256.0,0.0,0.0,0.0,0.0,0.0,1221.0,...,0.0,110.0,0.0,0.0,1870.76001,11BIT.xlsx,11B,gry,11B,7.54
4,2011-04-01,1659.0,103.0,1556.0,0.0,0.0,0.0,0.0,0.0,1595.0,...,0.0,64.0,0.0,0.0,1870.76001,11BIT.xlsx,11B,gry,11B,24.0
5,2011-07-01,2141.0,85.0,2055.0,0.0,0.0,0.0,0.0,0.0,2060.0,...,0.0,81.0,0.0,0.0,1870.76001,11BIT.xlsx,11B,gry,11B,14.95
6,2011-10-01,2986.189941,86.870003,2899.310059,0.0,53.73,1109.48999,418.5,1299.400024,2560.97998,...,0.0,425.220001,0.0,0.0,1870.76001,11BIT.xlsx,11B,gry,11B,7.15
7,2012-01-01,3473.0,51.0,3422.0,0.0,0.0,0.0,0.0,0.0,3259.0,...,0.0,76.0,0.0,0.0,1911.699951,11BIT.xlsx,11B,gry,11B,9.83
8,2012-04-01,3428.0,49.0,3379.0,0.0,0.0,0.0,231.0,1604.0,3217.0,...,0.0,74.0,0.0,0.0,1911.699951,11BIT.xlsx,11B,gry,11B,10.51
9,2012-07-01,6384.0,48.0,6336.0,0.0,0.0,0.0,0.0,0.0,6137.0,...,0.0,93.0,0.0,0.0,2217.199951,11BIT.xlsx,11B,gry,11B,9.31
10,2012-10-01,7336.529785,95.449997,7240.680176,7.18,29.85,2558.439941,1178.290039,3465.899902,6913.930176,...,0.0,422.359985,0.0,-0.62,2217.199951,11BIT.xlsx,11B,gry,11B,8.62
11,2013-01-01,7410.160156,82.349998,7327.810059,5.91,23.879999,2895.939941,392.600006,4034.72998,6971.109863,...,0.0,439.049988,0.0,0.0,2217.199951,11BIT.xlsx,11B,gry,11B,8.52
12,2013-04-01,7740.990234,567.130005,7173.870117,4.64,507.730011,2792.800049,561.130005,3816.389893,7312.279785,...,0.0,428.709991,0.0,0.0,2217.199951,11BIT.xlsx,11B,gry,11B,11.37


In [56]:
# The as-of merge left two ticker columns; discard the right-hand one and
# restore the plain name `ticker` for the left-hand one.
df = df.drop(columns='ticker_y').rename(columns={'ticker_x': 'ticker'})

In [57]:
# Verify the final column set and that no nulls remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15495 entries, 3 to 19905
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype          
---  ------                                  --------------  -----          
 0   end_of_period                           15495 non-null  datetime64[s]  
 1   total_assets                            15495 non-null  float[pyarrow] 
 2   non_current_assets                      15495 non-null  float[pyarrow] 
 3   current_assets                          15495 non-null  float[pyarrow] 
 4   property_plant_equipment                15495 non-null  float[pyarrow] 
 5   intangible_assets                       15495 non-null  float[pyarrow] 
 6   inventories                             15495 non-null  float[pyarrow] 
 7   trade_receivables                       15495 non-null  float[pyarrow] 
 8   cash_and_cash_equivalents               15495 non-null  float[pyarrow] 
 9   equity_shareholders_of_the_parent       1549

### Adjusting Date columns to the same quarters

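Reporting dates are moved to the last month of their quarter (the day is set to 1), so that every file uses the same quarter labels:

- months 01 and 02 are moved to 03
- months 04 and 05 are moved to 06
- months 07 and 08 are moved to 09
- months 10 and 11 are moved to 12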

In [58]:
# Snap each date to the first day of the month that closes its calendar quarter
# (Jan-Mar -> March, Apr-Jun -> June, Jul-Sep -> September, Oct-Dec -> December).
# The month is derived from the quarter number, so dates already in 03, 06 or 09
# stay in their own quarter instead of falling through to December, and
# re-running the cell leaves already-adjusted dates unchanged.
period_dates = df['end_of_period']
df['end_of_period'] = pd.to_datetime(
    pd.DataFrame({
        'year': period_dates.dt.year,
        'month': period_dates.dt.quarter * 3,  # quarter 1..4 -> month 3, 6, 9, 12
        'day': 1,
    })
)

In [59]:
# Persist the merged, quarter-aligned dataset without the index column.
df.to_csv('../../data/processed/data.csv', index=False)

In [60]:
# Identify companies by ticker when that column exists, otherwise by file name.
company_id = 'file_name'
if 'ticker' in df.columns:
    company_id = 'ticker'

df[company_id] = df[company_id].astype(str)
# Marker column: every existing row counts as "data present" for its company and quarter.
df['present'] = 1

# Company x quarter presence matrix. 'max' yields 1 when at least one record
# exists for a company/quarter pair; pairs with no record are filled with 0.
# NOTE(review): reindex() is called without labels, so it does not appear to add
# any rows or columns — confirm whether a full quarter range was intended.
pivot_df = (
    df.pivot_table(
        index=company_id,
        columns='end_of_period',
        values='present',
        aggfunc='max',
        fill_value=0,
    )
    .reindex(fill_value=0)
    .astype(int)
)

# Write the presence matrix to the Excel report and confirm the location.
file_path = '../../data/missing_quarters_report.xlsx'
pivot_df.to_excel(file_path)
print(f"File saved successfully: {file_path}")

File saved successfully: ../../data/missing_quarters_report.xlsx
