# DATA PREPARATION PART 2

In [1]:
import pandas as pd

In [2]:
# Load the processed dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='../../data/processed/data.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an
# earlier to_csv -- confirm) and order rows by file and period, newest first.
# errors='ignore' keeps this cell re-runnable: the cell rebinds `data`, so on a
# second execution the column is already gone and a plain drop would raise
# KeyError.
data = (
    data
    .drop(columns='Unnamed: 0', errors='ignore')
    .sort_values(by=['file_name', 'end_of_period'], ascending=False)
)

In [4]:
# total_liabilities is the element-wise sum of the two liability columns;
# a missing value in either operand yields NaN for that row.
data['total_liabilities'] = data['current_liabilities'].add(data['non_current_liabilities'])

In [5]:
# Preview the first three rows after sorting and adding total_liabilities.
data.head(n=3)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,current_liabilities,non_current_loans_and_borrowings,financial_liabilities_loans_borrowings,total_shares,file_name,company_name,ticker,sector,target,total_liabilities
7674,2022-07-01,2733389.0,1720912.0,1012477.0,1025678.0,337294.0,171367.0,826650.0,6249.0,304293.0,...,1468424.0,875000.0,85884.0,10271.0,ZYWIEC.xlsx,Grupa Żywiec SA,ZWC,napoje,494.0,2429096.0
7673,2022-04-01,2981239.0,1730362.0,1250877.0,1031440.0,340644.0,202267.0,1039206.0,6404.0,288397.0,...,1631516.0,975000.0,96674.0,10271.0,ZYWIEC.xlsx,Grupa Żywiec SA,ZWC,napoje,510.0,2692842.0
7672,2021-10-01,2508447.0,1726721.0,781726.0,1050251.0,347755.0,140983.0,570261.0,2510.0,216089.0,...,2209321.0,0.0,1094585.0,10271.0,ZYWIEC.xlsx,Grupa Żywiec SA,ZWC,napoje,462.0,2292358.0


In [6]:
# Columns for which lagged differences are computed. Order matters: it
# determines the order of the generated ``*_diff_*`` columns.
attributes = [
    'total_assets',
    'non_current_assets',
    'current_assets',
    'property_plant_equipment',
    'intangible_assets',
    'inventories',
    'trade_receivables',
    'cash_and_cash_equivalents',
    'equity_shareholders_of_the_parent',
    'share_capital',
    'retained_earning_accumulated_losses',
    'non_current_liabilities',
    'current_liabilities',
    'non_current_loans_and_borrowings',
    'financial_liabilities_loans_borrowings',
]

def calculate_financial_differences(df, attributes, periods=(1, 2, 4), company_col='file_name', sort_col='end_of_period'):
    """Add per-company lagged differences for each requested attribute.

    The frame is sorted ascending by ``company_col`` and then ``sort_col``.
    For every ``attr`` in ``attributes`` and every ``period`` in ``periods`` a
    column named ``f'{attr}_diff_{period}q'`` is added, holding the value of
    ``attr`` minus its value ``period`` rows earlier within the same company.

    NOTE(review): the lag is counted in rows, not in calendar quarters. The
    ``q`` suffix is only accurate when every company has exactly one row per
    consecutive quarter; gaps in ``sort_col`` are not detected here.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    attributes : iterable of str
        Column names to difference.
    periods : iterable of int, default (1, 2, 4)
        Row lags to compute. Any iterable is accepted, including one-shot
        iterators.
    company_col : str, default 'file_name'
        Column identifying the group within which differences are taken.
    sort_col : str, default 'end_of_period'
        Column giving the ordering of rows inside each group.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame (original index labels are kept) with the extra
        difference columns appended. The first ``period`` rows of each group
        are NaN in the corresponding column.

    Raises
    ------
    KeyError
        If ``company_col``, ``sort_col`` or any attribute is not a column.
    """
    # Materialise once so a generator is not exhausted after the first attribute.
    periods = tuple(periods)

    # sort_values already returns a new frame, so the caller's object is safe
    # from the column assignments below without an additional copy.
    result_df = df.sort_values([company_col, sort_col])

    for attr in attributes:
        # Build the grouped view once per attribute instead of once per lag.
        per_company = result_df.groupby(company_col)[attr]
        for period in periods:
            result_df[f'{attr}_diff_{period}q'] = per_company.diff(periods=period)

    return result_df

# Build the feature frame using the function's default lags and column names.
df = calculate_financial_differences(df=data, attributes=attributes)

In [7]:
# Keep only rows where the 4-row lag of property_plant_equipment is available.
# NOTE(review): only this one column is checked, so other *_diff_* columns may
# still contain NaN -- confirm that is intended.
df = df.dropna(subset=['property_plant_equipment_diff_4q'])

In [8]:
# Dimensions of `df` as (n_rows, n_columns).
df.shape

(6508, 68)

In [9]:
# Move 'target' to the last column position: remove it, then re-append it.
target_column = df.pop('target')
df['target'] = target_column

In [10]:
# Preview the first ten rows of the feature frame.
df.head(n=10)

Unnamed: 0,end_of_period,total_assets,non_current_assets,current_assets,property_plant_equipment,intangible_assets,inventories,trade_receivables,cash_and_cash_equivalents,equity_shareholders_of_the_parent,...,current_liabilities_diff_1q,current_liabilities_diff_2q,current_liabilities_diff_4q,non_current_loans_and_borrowings_diff_1q,non_current_loans_and_borrowings_diff_2q,non_current_loans_and_borrowings_diff_4q,financial_liabilities_loans_borrowings_diff_1q,financial_liabilities_loans_borrowings_diff_2q,financial_liabilities_loans_borrowings_diff_4q,target
4,2013-10-01,8713.0,590.0,8123.0,63.0,467.0,3442.0,813.0,3741.0,7922.0,...,346.0,368.0,726.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0
5,2014-04-01,7645.0,830.0,6815.0,53.0,712.0,4327.0,537.0,1919.0,6918.0,...,-62.0,284.0,647.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
6,2014-07-01,7699.0,965.0,6735.0,123.0,781.0,4924.0,798.0,976.0,6735.0,...,236.0,174.0,542.0,0.0,0.0,0.0,0.0,0.0,1.0,12.0
7,2014-10-01,21516.0,6602.0,14914.0,104.0,6301.0,0.0,7252.0,7644.0,17005.0,...,2441.0,2677.0,2961.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
8,2015-04-01,27828.0,1475.0,26354.0,85.0,1214.0,4807.0,4796.0,16143.0,24626.0,...,-203.0,2238.0,2412.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0
9,2015-07-01,28580.0,7205.0,21375.0,108.0,7035.0,0.0,2058.0,19106.0,26629.0,...,-2154.0,-2357.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0
10,2015-10-01,33896.0,7073.0,26822.0,438.0,6388.0,0.0,2749.0,22931.0,29852.0,...,2425.0,271.0,2509.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0
11,2016-04-01,38931.0,7469.0,31462.0,610.0,6753.0,0.0,5331.0,24514.0,36563.0,...,-1694.0,731.0,-1626.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0
12,2016-07-01,40648.0,8574.0,32074.0,691.0,7582.0,0.0,4867.0,27106.0,38208.0,...,91.0,-1603.0,-1332.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0
13,2018-10-01,109263.0,41317.0,67946.0,18734.0,17138.0,0.0,14553.0,24251.0,89912.0,...,5464.0,5555.0,6286.0,11340.0,11340.0,11340.0,1452.0,1452.0,1452.0,336.0


In [11]:
# Persist the feature frame; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='../../data/processed/data_with_features.csv',
    index=False,
)