# DATA PREPARATION PART 2

In [3]:
import pandas as pd

In [4]:
# Load the processed dataset produced by the previous preparation step.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index that was written to the CSV — confirm whether it should be kept.
data = pd.read_csv(filepath_or_buffer='../../data/processed/data.csv')

In [5]:
# Order rows by company file and reporting period, both descending, so the
# preview shows the most recent period of the last company first.
data = data.sort_values(['file_name', 'end_of_period'], ascending=False)

In [6]:
# Total liabilities = current + non-current; a missing value in either part
# yields a missing total.
data['total_liabilities'] = data['current_liabilities'].add(data['non_current_liabilities'])

In [7]:
# Preview the first three rows of `data` to check the loaded columns and the
# derived total_liabilities column.
data.head(3)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,retained_earning_accumulated_losses,non_current_liabilities,current_liabilities,non_current_loans_and_borrowings,financial_liabilities_loans_borrowings,total_shares,file_name,ticker,target,total_liabilities
15494,2022-09-01,2733389.0,1720912.0,1012477.0,1025678.0,337294.0,171367.0,826650.0,6249.0,304293.0,...,222119.0,960672.0,1468424.0,875000.0,85884.0,10271.34,ZYWIEC.xlsx,ZWC,493.5,2429096.0
15493,2022-06-01,2981239.0,1730362.0,1250877.0,1031440.0,340644.0,202267.0,1039206.0,6404.0,288397.0,...,198275.0,1061326.0,1631516.0,975000.0,96674.0,10271.34,ZYWIEC.xlsx,ZWC,510.0,2692842.0
15492,2022-03-01,2741947.0,1723917.0,1018030.0,1036437.0,344336.0,190622.0,749814.0,28068.0,254238.0,...,302329.0,620050.0,1867659.0,540000.0,613619.0,10271.34,ZYWIEC.xlsx,ZWC,506.0,2487709.0


In [8]:
# Balance-sheet columns for which lagged differences are generated.
# NOTE(review): `total_liabilities` (derived earlier in this notebook) is not
# listed here, so no difference features are created for it — confirm intended.
attributes = [
    # assets
    'total_assets',
    'non_current_assets',
    'current_assets',
    'property_plant_equipment',
    'intangible_assets',
    'inventories',
    'trade_receivables',
    'cash_and_cash_equivalents',
    # equity
    'equity_shareholders_of_the_parent',
    'share_capital',
    'retained_earning_accumulated_losses',
    # liabilities
    'non_current_liabilities',
    'current_liabilities',
    'non_current_loans_and_borrowings',
    'financial_liabilities_loans_borrowings',
]

def calculate_financial_differences(df, attributes, periods=(1, 2, 4), company_col='file_name', sort_col='end_of_period'):
    """Add per-company lagged differences for the given columns.

    The frame is sorted by ``company_col`` and then ``sort_col``. For every
    attribute and every lag ``p`` in ``periods`` a column named
    ``{attr}_diff_{p}q`` is added, holding the row's value minus the value
    ``p`` rows earlier within the same company. The first ``p`` rows of each
    company therefore contain NaN in that column.

    Lags are counted in rows, not in calendar time: if a company has gaps in
    its reporting history, a "4q" difference spans more than four periods.

    Args:
        df: Input dataframe; it is not modified.
        attributes: Iterable of column names to difference.
        periods: Iterable of integer row lags (default ``(1, 2, 4)``).
        company_col: Column identifying the company (grouping key).
        sort_col: Column giving the chronological order within a company.

    Returns:
        A new dataframe, sorted by ``[company_col, sort_col]``, containing all
        original columns followed by the difference columns (attribute-major,
        period-minor order).

    Raises:
        KeyError: If ``company_col``, ``sort_col`` or any attribute is not a
            column of ``df``.
    """
    # Materialise once so generators can be passed and iterated repeatedly.
    attributes = list(attributes)
    periods = list(periods)

    # Fail fast with a readable message instead of an error from deep inside pandas.
    missing = [col for col in (company_col, sort_col, *attributes) if col not in df.columns]
    if missing:
        raise KeyError(f'Columns missing from dataframe: {missing}')

    # sort_values already returns a new frame, so no extra copy is needed to
    # protect the caller's dataframe.
    sorted_df = df.sort_values([company_col, sort_col])

    # Build the grouping once; it is the same for every attribute and lag.
    grouped = sorted_df.groupby(company_col)
    diff_columns = {
        f'{attr}_diff_{period}q': grouped[attr].diff(periods=period)
        for attr in attributes
        for period in periods
    }

    # assign() returns a new frame; existing columns with the same name are
    # overwritten, new ones are appended in insertion order.
    return sorted_df.assign(**diff_columns)

# Build the feature table: original columns plus 1-, 2- and 4-step differences
# for every column in `attributes`.
df = calculate_financial_differences(df=data, attributes=attributes)

In [9]:
# Keep only rows that have a 4-step property_plant_equipment difference.
# This removes at least the first four records of every company (no 4-step lag
# exists there) and any row where that difference could not be computed.
df = df.dropna(subset=['property_plant_equipment_diff_4q'])

In [10]:
# Sanity check: (rows, columns) remaining after dropping rows without a
# 4-step property_plant_equipment difference.
df.shape

(14305, 66)

In [11]:
# Move the `target` column to the far right so every feature column precedes it.
df = df[[col for col in df.columns if col != 'target'] + ['target']]

In [12]:
# Inspect the first ten rows of the feature table; `target` should now be the
# last column.
df.head(10)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,current_liabilities_diff_1q,current_liabilities_diff_2q,current_liabilities_diff_4q,non_current_loans_and_borrowings_diff_1q,non_current_loans_and_borrowings_diff_2q,non_current_loans_and_borrowings_diff_4q,financial_liabilities_loans_borrowings_diff_1q,financial_liabilities_loans_borrowings_diff_2q,financial_liabilities_loans_borrowings_diff_4q,target
4,2012-03-01,3473.0,51.0,3422.0,0.0,0.0,0.0,0.0,0.0,3259.0,...,-349.22,-5.0,-34.0,0.0,0.0,0.0,0.0,0.0,0.0,9.83
5,2012-06-01,3428.0,49.0,3379.0,0.0,0.0,0.0,231.0,1604.0,3217.0,...,-2.0,-351.22,10.0,0.0,0.0,0.0,0.0,0.0,0.0,10.51
6,2012-09-01,6384.0,48.0,6336.0,0.0,0.0,0.0,0.0,0.0,6137.0,...,19.0,17.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,9.31
7,2012-12-01,7336.53,95.45,7240.68,7.18,29.85,2558.44,1178.29,3465.9,6913.93,...,329.36,348.36,-2.86,0.0,0.0,0.0,-0.62,-0.62,-0.62,8.62
8,2013-03-01,7410.16,82.35,7327.81,5.91,23.88,2895.94,392.6,4034.73,6971.11,...,16.69,346.05,363.05,0.0,0.0,0.0,0.62,0.0,0.0,8.52
9,2013-06-01,7740.99,567.13,7173.87,4.64,507.73,2792.8,561.13,3816.39,7312.28,...,-10.34,6.35,354.71,0.0,0.0,0.0,0.0,0.62,0.0,11.37
10,2013-09-01,7451.85,518.27,6933.58,3.38,458.54,2904.95,400.14,3623.95,7007.7,...,15.44,5.1,351.15,0.0,0.0,0.0,0.0,0.0,0.0,9.03
11,2013-12-01,8712.67,589.51,8123.16,63.18,466.67,3442.29,812.51,3740.97,7922.3,...,346.22,361.66,368.01,0.0,0.0,0.0,0.0,0.0,0.62,8.73
12,2014-03-01,8078.57,534.78,7543.79,58.17,416.94,3782.35,391.15,3340.6,7391.86,...,-103.66,242.56,247.66,0.0,0.0,0.0,0.0,0.0,0.0,9.3
13,2014-06-01,7645.37,830.06,6815.32,53.16,712.11,4326.58,537.45,1919.27,6917.54,...,41.13,-62.53,299.13,0.0,0.0,0.0,0.0,0.0,0.0,11.03


In [13]:
# Persist the feature table for the next step; the dataframe index is not written.
df.to_csv(path_or_buf='../../data/processed/data_with_features.csv', index=False)