# DATA PREPARATION PART 3: Preparing Data for Model

In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('../../data/processed/data_with_features.csv')

# One Hot Encoding

In [45]:
df = (pd.get_dummies(df, columns=['sector'])
        .drop(columns=['file_name', 'company_name', 'ticker']))

df.head(3)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,sector_urządzenia elektryczne,sector_urządzenia mechaniczne,sector_usługi dla przedsiębiorstw,sector_wydawnictwa,sector_wydobycie i produkcja,sector_wyroby hutnicze,sector_wyroby metalowe,sector_zaopatrzenie - pozostałe,sector_środki transportu,sector_żywność
0,2013-10-01,8713.0,590.0,8123.0,63.0,467.0,3442.0,813.0,3741.0,7922.0,...,False,False,False,False,False,False,False,False,False,False
1,2014-04-01,7645.0,830.0,6815.0,53.0,712.0,4327.0,537.0,1919.0,6918.0,...,False,False,False,False,False,False,False,False,False,False
2,2014-07-01,7699.0,965.0,6735.0,123.0,781.0,4924.0,798.0,976.0,6735.0,...,False,False,False,False,False,False,False,False,False,False


In [46]:
df.to_csv('../../data/processed/model_with_features.csv', index=False)

# Scaling and preparing train_test_split

In [47]:
def load_scale_split(csv_path, split_percentage=0.2, random_state=42):
    df = pd.read_csv(csv_path)
    
    # Identify date column and target column
    date_column = 'end_of_period'
    target_column = 'target'
    
    # Convert date column to datetime64
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
    
    target = df.pop(target_column)
    df[target_column] = target
    
    # Separate features and target
    y = df.pop(target_column)
    X = df
    
    # Identify numeric columns
    numeric_columns = X.select_dtypes(include=[np.number]).columns
    
    # Scale the numeric features
    scaler = StandardScaler()
    X.loc[:, numeric_columns] = scaler.fit_transform(X[numeric_columns])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_percentage, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

# Model ready dataframe

In [48]:
X_train, X_test, y_train, y_test = load_scale_split('../../data/processed/model_with_features.csv', split_percentage=0.2, random_state=7)

In [49]:
X_train.head(3)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,sector_urządzenia elektryczne,sector_urządzenia mechaniczne,sector_usługi dla przedsiębiorstw,sector_wydawnictwa,sector_wydobycie i produkcja,sector_wyroby hutnicze,sector_wyroby metalowe,sector_zaopatrzenie - pozostałe,sector_środki transportu,sector_żywność
2885,2019-04-01,-0.088189,-0.094314,-0.063182,-0.088935,-0.041884,-0.189364,-0.093807,-0.040459,-0.116969,...,False,False,False,False,False,False,False,False,False,False
2503,2009-04-01,-0.090015,-0.102504,-0.0563,-0.087988,-0.131113,-0.06216,-0.089319,-0.039435,-0.122446,...,False,False,False,False,False,False,False,False,False,False
2109,2008-10-01,-0.092551,-0.101388,-0.063179,-0.086665,-0.138555,-0.171542,-0.095533,-0.040477,-0.126703,...,False,True,False,False,False,False,False,False,False,False


In [50]:
y_train.head(3)

2885    11.0
2503     7.0
2109     8.0
Name: target, dtype: float64