In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# FULL DF
full_row_ranges = [
    range(290, 293+1),  # Current financial liabilities, Non-current financial liabilities, Financial liabilities (total)
    range(278, 283+1),  # Return on equity (ROE), Return on assets (ROA), Leverage (EM), Asset utilization (AU)
    range(260, 275+1),  # Working capital ratio, Current ratio, Quick ratio, Cash ratio, Receivables turnover, Inventory turnover, The operating cycle, Rotation commitments, Cash conversion cycle, Rotation assets, Rotation of assets, Assets ratio, Debt ratio, Debt service ratio, Rate debt security
    range(29, 59+1),    # ASSETS, Non-current assets, Property, plant and equipment, Exploration for and evaluation of mineral resources, Intangible assets, Goodwill, Investment property, Right-of-use assets, Investment in affiliates, Non-current financial assets, Non-current loans and receivables, Deferred income tax, Non-current deferred charges and accruals, Non-current derivative instruments, Other non-current assets, Current assets, Inventories, Current intangible assets, Biological assets, Trade receivables, Loans and other receivables, Financial assets, Cash and cash equivalents, Accruals, Assets from current tax, Derivative instruments, Other assets, Assets held for sale and discontinuing operations, Called up capital, Own shares
    range(60, 93+1),    # EQUITY & LIABILITIES, Equity shareholders of the parent, Share capital, Called up share capital, Treasury shares, Supplementary capital, Valuation and exchange differences, Other capitals, Retained earnings / accumulated losses, Non-controlling interests, Non-current liabilities, Non-current liabilities from derivatives, Non-current loans and borrowings, Non-current liabilities from bonds, Non-current liabilities from finance leases, Non-current trade payables, Long-term provision for employee benefits, Deferred tax liabilities, Non-current provision, Other non-current liabilities, Non-current accruals (liability), Current liabilities, Liabilities from derivatives, Financial liabilities (loans and borrowings), Bond liabilities, Liabilities from finance leases, Trade payables, Employee benefits, Current tax liabilities, Provisions, Other liabilities, Accruals (liability), Liabilities related to assets held for sale and discontinued operations
    range(2, 22+1),     # End of period, Revenues from sales, Operating profit/loss, Net profit/loss attributable to equity holders of the parent, Total comprehensive income attributable to equity holders, Depreciation, Cash flow from operating activities, Cash flow from investing activities, Cash flow from financing activities, Non-current assets, Current assets, Equity shareholders of the parent, Non-current liabilities, Current liabilities, Number of shares (‘000), Earnings per share, Total comprehensive income per share, Book value per share
]

# PREPARATION

In [3]:
FOLDER_PATH = 'data/'

In [4]:
if os.path.exists(FOLDER_PATH):
    print("Folder exists. Proceeding with file processing.")
else:
    print("Folder does not exist. Check the path.")

Folder exists. Proceeding with file processing.


In [5]:
def count_files_in_folder(folder_path):
    file_count = 0

    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        if os.path.isfile(full_path):
            file_count += 1
    
    return file_count

print(f"Files in folder: {count_files_in_folder(FOLDER_PATH)}.")

Files in folder: 349.


# 'INFO' DATA

In [6]:
def process_company_sector(folder_path):
    results = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            try:
                with pd.ExcelFile(file_path) as xls:
                    company_name = pd.read_excel(xls, 'Info', usecols="B", skiprows=1, nrows=1).values[0][0]
                    sector = pd.read_excel(xls, 'Info', usecols="E", skiprows=19, nrows=1).values[0][0]
                
                result = {
                    'filename': filename,
                    'Company Name': company_name,
                    'Sector': sector
                }
                results.append(result)
                
                print(f"Dane z {filename} zostały przetworzone.")
            except Exception as e:
                print(f"Błąd przy przetwarzaniu pliku {filename}: {e}")

    results_df = pd.DataFrame(results)
    return results_df

In [7]:
df_names = process_company_sector(FOLDER_PATH)

Dane z 11BIT.xlsx zostały przetworzone.
Dane z 3LPSA.xlsx zostały przetworzone.
Dane z 3RGAMES.xlsx zostały przetworzone.
Dane z AB.xlsx zostały przetworzone.
Dane z AC.xlsx zostały przetworzone.
Dane z ACTION.xlsx zostały przetworzone.
Dane z ADIUVO.xlsx zostały przetworzone.
Dane z AGORA.xlsx zostały przetworzone.
Dane z AGROPUBL.xlsx zostały przetworzone.
Dane z AGROWILL.xlsx zostały przetworzone.
Dane z AIGAMES.xlsx zostały przetworzone.
Dane z AILLERON.xlsx zostały przetworzone.
Dane z AIRWAY.xlsx zostały przetworzone.
Dane z ALLEGRO.xlsx zostały przetworzone.
Dane z ALUMETAL.xlsx zostały przetworzone.
Dane z AMBRA.xlsx zostały przetworzone.
Dane z AMICA.xlsx zostały przetworzone.
Dane z AMPLI.xlsx zostały przetworzone.
Dane z AMREST.xlsx zostały przetworzone.
Dane z ANSWEAR.xlsx zostały przetworzone.
Dane z APATOR.xlsx zostały przetworzone.
Dane z APLISENS.xlsx zostały przetworzone.
Dane z APSENERG.xlsx zostały przetworzone.
Dane z ARCTIC.xlsx zostały przetworzone.
Dane z ARTERIA

In [8]:
output_csv_path = 'dataframes/names_gpw.csv'
df_names.to_csv(output_csv_path, index=False)

# QS DATA

In [9]:
def process_excel_data(base_folder):
    folder_path = os.path.join(base_folder)
    all_data = []

    row_ranges = [
        range(2, 23),   # Rows 3 to 22 inclusive, zero-indexed
        range(30, 60),  # Rows 31 to 59 inclusive
        range(60, 94),  # Rows 61 to 93 inclusive
        range(254, 276),  # Rows 255 to 275 inclusive
        range(278, 284),  # Rows 279 to 283 inclusive
        range(290, 294)   # Rows 291 to 293 inclusive
    ]

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {filename}")
            try:
                data = pd.read_excel(file_path, sheet_name='QS', header=None)

                for row_range in row_ranges:
                    headers = data.iloc[list(row_range), 2].tolist()
                    range_data = data.iloc[list(row_range), 3:102]
                    range_data = range_data.replace(0, np.nan)

                    transposed_data = range_data.transpose()
                    transposed_data.columns = headers
                    transposed_data['filename'] = filename

                    all_data.append(transposed_data)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    if all_data:
        results_df = pd.concat(all_data, ignore_index=True)
        results_df = results_df.dropna(how='all', subset=[col for col in results_df.columns if col != 'filename'])
        results_df = results_df.loc[:, ~results_df.columns.isna()]

        return results_df
    else:
        print("No data to concatenate")
        return pd.DataFrame()

In [None]:
df_results = process_excel_data(FOLDER_PATH)
df_results

In [11]:
df_cleaned_columns = df_results.dropna(how='all', axis=1)

In [12]:
df_cleaned_columns = df_cleaned_columns.drop(columns='Start of period')

In [13]:
df_cleaned_columns = df_cleaned_columns.dropna(subset=['End of period'])
df_cleaned_columns = df_cleaned_columns[df_cleaned_columns['End of period'] != 0]
print(f"After dropping rows with None, NaN or 0 in 'End of period': {df_cleaned_columns.shape}")

After dropping rows with None, NaN or 0 in 'End of period': (19906, 106)


In [14]:
df_cleaned_columns

Unnamed: 0,End of period,Revenues from sales,Operating profit/loss,Net profit/loss attributable to equity holders of the parent,Total comprehensive income attributable to equity holders,Depreciation,Cash flow from operating activities,Cash flow from investing activities,Cash flow from financing activities,Aktywa,...,Assets ratio,Debt ratio,Debt service ratio,Rate debt security,Leverage (EM),Asset utilization (AU),Load gross profit,Current financial liabilities,Non-current financial liabilities,Financial liabilities (total)
48,2010-03-31,66,,-16,,29,,,,,...,,,,,,,,,,
50,2010-09-30,129,,-26,,6,,,,1411,...,,,,,,,,,,
51,2010-12-31,228,,-198,,,,,,1352.36,...,,,,,,,,,,
52,2011-03-31,144,,-50,,,,,,1342,...,,,,,,,,,,
53,2011-06-30,959,,377,,7,,,,1659,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195460,2021-09-30,995385,103668,104744,106250,36985,354390,-22037,-433167,2649304,...,,,,,,,,,,
195461,2021-12-31,733974,46890,32175,42434,42311,50929,-65863,30091,2508447,...,,,,,,,,,,
195462,2022-03-31,696778,6649,-1246,41367,38391,13967,-33280,57099,2741947,...,,,,,,,,,,
195463,2022-06-30,1209418,101579,92625,45386,39932,101877,-21179,-81990,2981239,...,,,,,,,,,,


In [15]:
df_cleaned_columns.columns.tolist()

['End of period',
 'Revenues from sales',
 'Operating profit/loss',
 'Net profit/loss attributable to equity holders of the parent',
 'Total comprehensive income attributable to equity holders',
 'Depreciation',
 'Cash flow from operating activities',
 'Cash flow from investing activities',
 'Cash flow from financing activities',
 'Aktywa',
 'Non-current assets',
 'Current assets',
 'Equity shareholders of the parent',
 'Non-current liabilities',
 'Current liabilities',
 "Number of shares ('000)",
 'Earnings per share',
 'Total comprehensive income per share',
 'Book value per share',
 'Accounting standard',
 'filename',
 'Property, plant and equipment',
 'Exploration for and evaluation of mineral resources',
 'Intangible assets',
 'Goodwill',
 'Investment property',
 'Right-of-use assets',
 'Investment in affiliates',
 'Non-current financial assets',
 'Non-current loans and receivables',
 'Deferred income tax',
 'Non-current deferred charges and accruals',
 'Non-current derivative ins

In [16]:
output_csv_path = 'dataframes/dataframe_gpw.csv'
df_cleaned_columns.to_csv(output_csv_path, index=False)

# nn_gpw_data

In [17]:
columns_to_keep = [
    'filename',
    # Additional Information
    'Current financial liabilities',
    'Non-current financial liabilities',
    'Financial liabilities (total)',
    
    # DuPont Indicators
    'Return on equity (ROE)',
    'Return on assets (ROA)',
    'Leverage (EM)',
    'Net profit margin',
    'Asset utilization (AU)',

    # Indicators
    'Current ratio',
    'Quick ratio',
    'Cash ratio',
    'Receivables turnover',
    'Inventory turnover',
    'The operating cycle',
    'Rotation commitments',
    'Cash conversion cycle',
    'Rotation assets',
    'Rotation of assets',
    'Assets ratio',
    'Debt ratio',
    'Debt service ratio',
    'Rate debt security',
    
    # Basic Information
    'End of period',
    
    # Aktywa (Assets)
    'Non-current assets',
    'Current assets',
    
    # Pasywa (Liabilities)
    'Equity shareholders of the parent',
    'Non-current liabilities',
    'Current liabilities'
]

In [18]:
df_second = df_cleaned_columns[columns_to_keep]

df_second

Unnamed: 0,filename,Current financial liabilities,Non-current financial liabilities,Financial liabilities (total),Return on equity (ROE),Return on assets (ROA),Leverage (EM),Net profit margin,Asset utilization (AU),Current ratio,...,Assets ratio,Debt ratio,Debt service ratio,Rate debt security,End of period,Non-current assets,Current assets,Equity shareholders of the parent,Non-current liabilities,Current liabilities
48,11BIT.xlsx,,,,,,,,,,...,,,,,2010-03-31,,,,,
50,11BIT.xlsx,,,,,,,,,,...,,,,,2010-09-30,2,1409,709,,702
51,11BIT.xlsx,,,,,,,,,,...,,,,,2010-12-31,55.87,1296.49,1268.2,,84.15
52,11BIT.xlsx,,,,,,,,,,...,,,,,2011-03-31,86,1256,1221,,110
53,11BIT.xlsx,,,,,,,,,,...,,,,,2011-06-30,103,1556,1595,,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195460,ZYWIEC.xlsx,,,,,,,,,,...,,,,,2021-09-30,1796783,852521,395450,64565,2189289
195461,ZYWIEC.xlsx,,,,,,,,,,...,,,,,2021-12-31,1726721,781726,216089,83037,2209321
195462,ZYWIEC.xlsx,,,,,,,,,,...,,,,,2022-03-31,1723917,1018030,254238,620050,1867659
195463,ZYWIEC.xlsx,,,,,,,,,,...,,,,,2022-06-30,1730362,1250877,288397,1061326,1631516


In [19]:
output_csv_path = 'dataframes/nn_gpw_data.csv'
df_second.to_csv(output_csv_path, index=False)