# DATA CLEANING PART 1

In [159]:
import pandas as pd
import pyarrow as pa

## General Company Info Cleaning

In [160]:
# Load the general company info table from the processed-data folder
general_info_csv = '../../data/processed/general_company_info.csv'
df_general = pd.read_csv(general_info_csv)

In [161]:
# Original column labels paired, position by position, with their snake_case replacements
original_labels = ("Name", "TICKER", "Sector", "File_Name")
clean_labels = ("company_name", "ticker", "sector", "file_name")
rename = dict(zip(original_labels, clean_labels))

# Every renamed column is cast to an Arrow-backed string
dtypes = {label: pd.ArrowDtype(pa.string()) for label in clean_labels}

# Rename first, then cast: the dtype mapping is keyed by the new column names
df_general = df_general.rename(columns=rename).astype(dtypes)

In [162]:
# Print column names, non-null counts, dtypes and memory usage after the rename/cast
df_general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   company_name  349 non-null    string[pyarrow]
 1   ticker        349 non-null    string[pyarrow]
 2   sector        349 non-null    string[pyarrow]
 3   file_name     349 non-null    string[pyarrow]
dtypes: string[pyarrow](4)
memory usage: 21.2 KB


In [163]:
# Preview the first three rows of the cleaned general-info frame
df_general.head(3)

Unnamed: 0,company_name,ticker,sector,file_name
0,11 bit studios SA,11B,gry,11BIT.xlsx
1,3LP SA,3LP_IPO,handel,3LPSA.xlsx
2,3R Games SA,3RG,gry,3RGAMES.xlsx


## Stooq Data Cleaning

In [164]:
# Load the Stooq data from the processed-data folder
stooq_csv = '../../data/processed/stooq_data.csv'
df_market_value = pd.read_csv(stooq_csv)

In [165]:
# Count missing values in each column
df_market_value.isna().sum()

TICKER     0
PER        0
DATE       0
TIME       0
OPEN       0
HIGH       0
LOW        0
CLOSE      0
VOL        0
OPENINT    0
target     0
dtype: int64

In [166]:
# Inspect the dtypes and memory footprint of the frame as loaded, before any cleaning
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 11 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   TICKER   1594620 non-null  object 
 1   PER      1594620 non-null  object 
 2   DATE     1594620 non-null  object 
 3   TIME     1594620 non-null  int64  
 4   OPEN     1594620 non-null  float64
 5   HIGH     1594620 non-null  float64
 6   LOW      1594620 non-null  float64
 7   CLOSE    1594620 non-null  float64
 8   VOL      1594620 non-null  float64
 9   OPENINT  1594620 non-null  int64  
 10  target   1594620 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 133.8+ MB


In [167]:
# Column renames for the Stooq frame: original label -> new label
rename = dict(
    TICKER="ticker",
    DATE="end_of_period",
)

In [168]:
# Apply the column renames defined in `rename` (TICKER -> ticker, DATE -> end_of_period)
df_market_value = df_market_value.rename(columns=rename)

In [169]:
# Keep only the ticker, the period date and the `target` column; drop everything else
kept_columns = ['ticker', 'end_of_period', 'target']
df_market_value = df_market_value[kept_columns]

In [170]:
# Target dtypes for the three retained Stooq columns
dtypes = dict(
    ticker=pd.ArrowDtype(pa.string()),   # Arrow-backed string
    end_of_period="datetime64[s]",       # second-resolution timestamp
    target="float32[pyarrow]",           # Arrow-backed 32-bit float
)

In [171]:
# Cast each column to the dtype listed for it in `dtypes`
df_market_value = df_market_value.astype(dtypes)

In [172]:
# Re-inspect dtypes and memory usage after the column selection and cast
df_market_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1594620 entries, 0 to 1594619
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype          
---  ------         --------------    -----          
 0   ticker         1594620 non-null  string[pyarrow]
 1   end_of_period  1594620 non-null  datetime64[s]  
 2   target         1594620 non-null  float[pyarrow] 
dtypes: datetime64[s](1), float[pyarrow](1), string[pyarrow](1)
memory usage: 28.9 MB


In [173]:
# Preview the first three rows of the cleaned Stooq frame
df_market_value.head(3)

Unnamed: 0,ticker,end_of_period,target
0,06N,1997-06-12,85.848999
1,06N,1997-06-13,90.764
2,06N,1997-06-16,86.667999


## Detailed Company Info Cleaning

In [174]:
# NOTE(review): this path is the folder the other CSVs in this notebook are read from
# and names no file. pd.read_csv expects a file path, so this call is expected to fail
# as written — TODO point it at the intended detailed-company-info file(s).
df_detailed = pd.read_csv('../../data/processed')

In [175]:
# Merging into one dataframe