# DATA CLEANING PART 1

In [99]:
import pandas as pd
import pyarrow as pa

## General Company Info Cleaning

In [100]:
# Load the general company info CSV; its columns are renamed and typed in the next cell.
df_general = pd.read_csv('../../data/processed/general_company_info.csv')

In [101]:
# Source column name -> snake_case name used for the rest of the notebook.
rename = dict(
    Name="company_name",
    TICKER="ticker",
    Sector="sector",
    File_Name="file_name",
)
# All four renamed columns are cast to an Arrow-backed string dtype.
dtypes = {column: pd.ArrowDtype(pa.string()) for column in rename.values()}

# Rename first, then cast: the keys of `dtypes` are the post-rename names.
df_general = df_general.rename(columns=rename).astype(dtypes)

In [102]:
# Check that the renamed columns carry the Arrow string dtype and inspect null counts.
df_general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   company_name  349 non-null    string[pyarrow]
 1   ticker        349 non-null    string[pyarrow]
 2   sector        349 non-null    string[pyarrow]
 3   file_name     349 non-null    string[pyarrow]
dtypes: string[pyarrow](4)
memory usage: 21.2 KB


In [103]:
# Preview the first rows of the cleaned company table.
df_general.head(3)

Unnamed: 0,company_name,ticker,sector,file_name
0,11 bit studios SA,11B,gry,11BIT.xlsx
1,3LP SA,3LP_IPO,handel,3LPSA.xlsx
2,3R Games SA,3RG,gry,3RGAMES.xlsx


## Stooq Data Cleaning

In [104]:
# Load the raw Stooq CSV; only the ticker, date and target columns are kept further down.
df_market_value = pd.read_csv('../../data/processed/stooq_data.csv')

In [105]:
# Count missing values per column before any columns are dropped.
df_market_value.isnull().sum()

TICKER     0
PER        0
DATE       0
TIME       0
OPEN       0
HIGH       0
LOW        0
CLOSE      0
VOL        0
OPENINT    0
target     0
dtype: int64

In [106]:
# Inspect the raw dtypes and memory footprint ahead of the column selection and cast below.
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 11 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   TICKER   1594620 non-null  object 
 1   PER      1594620 non-null  object 
 2   DATE     1594620 non-null  object 
 3   TIME     1594620 non-null  int64  
 4   OPEN     1594620 non-null  float64
 5   HIGH     1594620 non-null  float64
 6   LOW      1594620 non-null  float64
 7   CLOSE    1594620 non-null  float64
 8   VOL      1594620 non-null  float64
 9   OPENINT  1594620 non-null  int64  
 10  target   1594620 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 133.8+ MB


In [107]:
# Align the Stooq key columns with the names used as merge keys later in the notebook.
rename = dict(TICKER="ticker", DATE="end_of_period")

In [108]:
# Apply the mapping from the previous cell; columns not listed in `rename` keep their names.
df_market_value = df_market_value.rename(columns=rename)

In [109]:
# Keep only the two merge keys and the target; every other Stooq column is discarded here.
df_market_value = df_market_value[['ticker', 'end_of_period', 'target']]

In [110]:
# Target dtypes for the three retained Stooq columns.
dtypes = dict(
    ticker=pd.ArrowDtype(pa.string()),
    end_of_period="datetime64[s]",
    target="float32[pyarrow]",
)

In [111]:
# Cast all three columns in one pass using the `dtypes` mapping from the previous cell.
df_market_value = df_market_value.astype(dtypes)

In [112]:
# Confirm the new dtypes and the memory footprint after dropping columns and casting.
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype          
---  ------         --------------    -----          
 0   ticker         1594620 non-null  string[pyarrow]
 1   end_of_period  1594620 non-null  datetime64[s]  
 2   target         1594620 non-null  float[pyarrow] 
dtypes: datetime64[s](1), float[pyarrow](1), string[pyarrow](1)
memory usage: 28.9 MB


In [113]:
# Preview the first rows of the cleaned Stooq table.
df_market_value.head(3)

Unnamed: 0,ticker,end_of_period,target
0,06N,1997-06-12,85.848999
1,06N,1997-06-13,90.764
2,06N,1997-06-16,86.667999


## Detailed Company Info Cleaning

In [114]:
# Load the detailed company info CSV; renaming and typing happen in the following cells.
df_detailed = pd.read_csv('../../data/processed/details_company_info.csv')

In [115]:
# Bring the raw column names in line with the names used by the other tables.
rename = dict(date="end_of_period", filename="file_name", assets="total_assets")

# Post-rename column -> dtype. The period column becomes a second-resolution datetime,
# file_name an Arrow string, and every remaining column an Arrow-backed float32.
dtypes = {
    "end_of_period": "datetime64[s]",
    **dict.fromkeys(
        (
            "total_assets",
            "non_current_assets",
            "current_assets",
            "property_plant_equipment",
            "intangible_assets",
            "inventories",
            "trade_receivables",
            "cash_and_cash_equivalents",
            "equity_shareholders_of_the_parent",
            "share_capital",
            "retained_earning_accumulated_losses",
            "non_current_liabilities",
            "current_liabilities",
            "non_current_loans_and_borrowings",
            "financial_liabilities_loans_borrowings",
            "total_shares",
        ),
        "float32[pyarrow]",
    ),
    "file_name": pd.ArrowDtype(pa.string()),
}

In [116]:
# Rename first so the post-rename keys of `dtypes` resolve, then cast in the same chain.
df_detailed = df_detailed.rename(columns=rename).astype(dtypes)

In [117]:
# Inspect dtypes and per-column non-null counts of the typed detailed table.
df_detailed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32660 entries, 0 to 32659
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype          
---  ------                                  --------------  -----          
 0   end_of_period                           19906 non-null  datetime64[s]  
 1   total_assets                            18995 non-null  float[pyarrow] 
 2   non_current_assets                      18859 non-null  float[pyarrow] 
 3   current_assets                          18929 non-null  float[pyarrow] 
 4   property_plant_equipment                18215 non-null  float[pyarrow] 
 5   intangible_assets                       17381 non-null  float[pyarrow] 
 6   inventories                             16529 non-null  float[pyarrow] 
 7   trade_receivables                       18741 non-null  float[pyarrow] 
 8   cash_and_cash_equivalents               18696 non-null  float[pyarrow] 
 9   equity_shareholders_of_the_parent      

## Merging into one dataframe

In [118]:
# Build the modelling frame:
#   1. attach company metadata to each detailed row via `file_name`;
#   2. attach the market-value `target` via (`end_of_period`, `ticker`);
#   3. drop rows lacking a period, a target or the core asset figures;
#   4. zero-fill what is still missing, in numeric columns only.
#
# NOTE(review): step 2 is an exact-date left join followed by a dropna on `target`, so any
# period with no Stooq row on exactly `end_of_period` is discarded. If a nearest-date match
# is wanted, `pd.merge_asof` would be the tool; confirm the intended behaviour.
df = (
    df_detailed
    .merge(df_general, how='left', on='file_name')
    .merge(df_market_value, how='left', on=['end_of_period', 'ticker'])
    .dropna(subset=['end_of_period', 'target', 'total_assets', 'current_assets', 'non_current_assets'])
    # A blanket fillna(0) would also target the string and datetime columns, where 0 is not
    # a valid value; restricting the fill to numeric columns keeps those columns untouched.
    .pipe(lambda frame: frame.fillna(
        dict.fromkeys(frame.select_dtypes(include='number').columns, 0)
    ))
)

In [119]:
# Row and column count after merging, dropping incomplete rows and zero-filling.
df.shape

(7675, 22)

In [121]:
# Round the numeric columns to zero decimal places.
# NOTE(review): this applies to `target` as well (the Stooq preview shows values such as
# 85.848999), so sub-unit precision of the target is lost here; confirm that is intended.
df = df.round(0)

In [122]:
# Preview the first rows of the merged, rounded frame.
df.head(5)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,non_current_liabilities,current_liabilities,non_current_loans_and_borrowings,financial_liabilities_loans_borrowings,total_shares,file_name,company_name,ticker,sector,target
53,2011-04-01,1659.0,103.0,1556.0,0.0,0.0,0.0,0.0,0.0,1595.0,...,0.0,64.0,0.0,0.0,1871.0,11BIT.xlsx,11 bit studios SA,11B,gry,24.0
54,2011-07-01,2141.0,85.0,2055.0,0.0,0.0,0.0,0.0,0.0,2060.0,...,0.0,81.0,0.0,0.0,1871.0,11BIT.xlsx,11 bit studios SA,11B,gry,15.0
59,2012-10-01,7337.0,95.0,7241.0,7.0,30.0,2558.0,1178.0,3466.0,6914.0,...,0.0,422.0,0.0,-1.0,2217.0,11BIT.xlsx,11 bit studios SA,11B,gry,9.0
62,2013-07-01,7452.0,518.0,6934.0,3.0,459.0,2905.0,400.0,3624.0,7008.0,...,0.0,444.0,0.0,0.0,2217.0,11BIT.xlsx,11 bit studios SA,11B,gry,9.0
63,2013-10-01,8713.0,590.0,8123.0,63.0,467.0,3442.0,813.0,3741.0,7922.0,...,0.0,790.0,0.0,0.0,2217.0,11BIT.xlsx,11 bit studios SA,11B,gry,9.0
