In [35]:
import os
import pandas as pd
import numpy as np

In [36]:
# FULL DF
full_row_ranges = [
    range(290, 293+1),  # Current financial liabilities, Non-current financial liabilities, Financial liabilities (total)
    range(278, 283+1),  # Return on equity (ROE), Return on assets (ROA), Leverage (EM), Asset utilization (AU)
    range(260, 275+1),  # Working capital ratio, Current ratio, Quick ratio, Cash ratio, Receivables turnover, Inventory turnover, The operating cycle, Rotation commitments, Cash conversion cycle, Rotation assets, Rotation of assets, Assets ratio, Debt ratio, Debt service ratio, Rate debt security
    range(29, 59+1),    # ASSETS, Non-current assets, Property, plant and equipment, Exploration for and evaluation of mineral resources, Intangible assets, Goodwill, Investment property, Right-of-use assets, Investment in affiliates, Non-current financial assets, Non-current loans and receivables, Deferred income tax, Non-current deferred charges and accruals, Non-current derivative instruments, Other non-current assets, Current assets, Inventories, Current intangible assets, Biological assets, Trade receivables, Loans and other receivables, Financial assets, Cash and cash equivalents, Accruals, Assets from current tax, Derivative instruments, Other assets, Assets held for sale and discontinuing operations, Called up capital, Own shares
    range(60, 93+1),    # EQUITY & LIABILITIES, Equity shareholders of the parent, Share capital, Called up share capital, Treasury shares, Supplementary capital, Valuation and exchange differences, Other capitals, Retained earnings / accumulated losses, Non-controlling interests, Non-current liabilities, Non-current liabilities from derivatives, Non-current loans and borrowings, Non-current liabilities from bonds, Non-current liabilities from finance leases, Non-current trade payables, Long-term provision for employee benefits, Deferred tax liabilities, Non-current provision, Other non-current liabilities, Non-current accruals (liability), Current liabilities, Liabilities from derivatives, Financial liabilities (loans and borrowings), Bond liabilities, Liabilities from finance leases, Trade payables, Employee benefits, Current tax liabilities, Provisions, Other liabilities, Accruals (liability), Liabilities related to assets held for sale and discontinued operations
    range(2, 22+1),     # End of period, Revenues from sales, Operating profit/loss, Net profit/loss attributable to equity holders of the parent, Total comprehensive income attributable to equity holders, Depreciation, Cash flow from operating activities, Cash flow from investing activities, Cash flow from financing activities, Non-current assets, Current assets, Equity shareholders of the parent, Non-current liabilities, Current liabilities, Number of shares (‘000), Earnings per share, Total comprehensive income per share, Book value per share
]

# PREPARATION

In [37]:
FOLDER_PATH = 'data/'

In [38]:
if os.path.exists(FOLDER_PATH):
    print("Folder exists. Proceeding with file processing.")
else:
    print("Folder does not exist. Check the path.")

Folder exists. Proceeding with file processing.


In [39]:
def count_files_in_folder(folder_path):
    file_count = 0

    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        if os.path.isfile(full_path):
            file_count += 1
    
    return file_count

print(f"Files in folder: {count_files_in_folder(FOLDER_PATH)}.")

Files in folder: 349.


# 'INFO' DATA

In [40]:
def process_company_sector(folder_path):
    results = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            try:
                with pd.ExcelFile(file_path) as xls:
                    company_name = pd.read_excel(xls, 'Info', usecols="B", skiprows=1, nrows=1).values[0][0]
                    sector = pd.read_excel(xls, 'Info', usecols="E", skiprows=19, nrows=1).values[0][0]
                    ticker = pd.read_excel(xls, 'Info', usecols="B", skiprows=11, nrows=1).values[0][0]
                
                result = {
                    'filename': filename,
                    'Company Name': company_name,
                    'Sector': sector,
                    'Ticker': ticker
                }
                results.append(result)
                
                print(f"The data from {filename} has been processed.")
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    results_df = pd.DataFrame(results)
    return results_df

In [41]:
df_names = process_company_sector(FOLDER_PATH)

The data from PKNORLEN.xlsx has been processed.
The data from ENERGOAP.xlsx has been processed.
The data from RELPOL.xlsx has been processed.
The data from ATREM.xlsx has been processed.
The data from DOMDEVEL.xlsx has been processed.
The data from ZUE.xlsx has been processed.
The data from FEERUM.xlsx has been processed.
The data from PRIMAMOD.xlsx has been processed.
The data from ASMGROUP.xlsx has been processed.
The data from KORBUDOM.xlsx has been processed.
The data from HUUUGE.xlsx has been processed.
The data from BUDIMEX.xlsx has been processed.
The data from RYVU.xlsx has been processed.
The data from IMCSA.xlsx has been processed.
The data from DEBICA.xlsx has been processed.
The data from LOTOS.xlsx has been processed.
The data from PHOTON.xlsx has been processed.
The data from AIRWAY.xlsx has been processed.
The data from BOOMBIT.xlsx has been processed.
The data from NANOGROUP.xlsx has been processed.
The data from HELIO.xlsx has been processed.
The data from TBULL.xlsx h

In [42]:
df_names

Unnamed: 0,filename,Company Name,Sector,Ticker
0,PKNORLEN.xlsx,PKN Orlen SA,wydobycie i produkcja,PKN
1,ENERGOAP.xlsx,Energoaparatura SA,budownictwo przemysłowe,ENP
2,RELPOL.xlsx,Relpol SA,urządzenia elektryczne,RLP
3,ATREM.xlsx,Atrem SA,instalacje budowlane i telekomunikacyjne,ATR
4,DOMDEVEL.xlsx,Dom Development SA,sprzedaż nieruchomości,DOM
...,...,...,...,...
344,ROPCZYCE.xlsx,ZM Ropczyce SA,materiały budowlane,RPC
345,MIRBUD.xlsx,Mirbud SA,budownictwo ogólne,MRB
346,PATENTUS.xlsx,Patentus SA,urządzenia mechaniczne,PAT
347,NEUCA.xlsx,Neuca SA,dystrybucja leków,NEU


In [43]:
output_csv_path = 'dataframes/names_gpw.csv'
df_names.to_csv(output_csv_path, index=False)

# QS DATA

In [44]:
def process_excel_data(base_folder):
    folder_path = os.path.join(base_folder)
    all_data = []

    row_ranges = [
        range(2, 23),   # Rows 3 to 22 inclusive, zero-indexed
        range(30, 60),  # Rows 31 to 59 inclusive
        range(60, 94),  # Rows 61 to 93 inclusive
        range(254, 276),  # Rows 255 to 275 inclusive
        range(278, 284),  # Rows 279 to 283 inclusive
        range(290, 294)   # Rows 291 to 293 inclusive
    ]

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {filename}")
            try:
                data = pd.read_excel(file_path, sheet_name='QS', header=None)

                for row_range in row_ranges:
                    headers = data.iloc[list(row_range), 2].tolist()
                    range_data = data.iloc[list(row_range), 3:102]
                    range_data = range_data.replace(0, np.nan)

                    transposed_data = range_data.transpose()
                    transposed_data.columns = headers
                    transposed_data['filename'] = filename

                    all_data.append(transposed_data)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    if all_data:
        results_df = pd.concat(all_data, ignore_index=True)
        results_df = results_df.dropna(how='all', subset=[col for col in results_df.columns if col != 'filename'])
        results_df = results_df.loc[:, ~results_df.columns.isna()]

        return results_df
    else:
        print("No data to concatenate")
        return pd.DataFrame()

In [None]:
df_results = process_excel_data(FOLDER_PATH)
df_results

In [46]:
df_cleaned_columns = df_results.dropna(how='all', axis=1)

In [47]:
df_cleaned_columns = df_cleaned_columns.drop(columns='Start of period')

In [48]:
df_cleaned_columns = df_cleaned_columns.dropna(subset=['End of period'])
df_cleaned_columns = df_cleaned_columns[df_cleaned_columns['End of period'] != 0]
print(f"After dropping rows with None, NaN or 0 in 'End of period': {df_cleaned_columns.shape}")

After dropping rows with None, NaN or 0 in 'End of period': (19906, 106)


In [49]:
df_cleaned_columns

Unnamed: 0,End of period,Revenues from sales,Operating profit/loss,Net profit/loss attributable to equity holders of the parent,Total comprehensive income attributable to equity holders,Depreciation,Cash flow from operating activities,Cash flow from investing activities,Cash flow from financing activities,Aktywa,...,Assets ratio,Debt ratio,Debt service ratio,Rate debt security,Leverage (EM),Asset utilization (AU),Load gross profit,Current financial liabilities,Non-current financial liabilities,Financial liabilities (total)
3,1998-12-31,,,,,,,,,6800077,...,,,,,,,,,,
4,1999-03-31,2724513,128768,39830,,99445,213338,-343531,60988,7070985,...,,,,,,,,,,
5,1999-06-30,3734138,256866,196545,,115350,185744,-366017,128930,7691669,...,,,,,,,,,,
6,1999-09-30,4696719,261468,141367,,138261,184070,-393602,155891,9881281,...,,,,,,,,,,
7,1999-12-31,6009235,287898,266052,,163436,305390,-475840,168831,10623546,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195663,2007-12-31,,,,,,,,,25532,...,,,,,,,,,,
195667,2008-12-31,,,,,,,,,24275,...,,,,,,,,,,
195671,2009-12-31,,,,,,,,,15443,...,,,,,,,,,,
195675,2010-12-31,,,,,,,,,14960,...,,,,,,,,,,


In [50]:
df_cleaned_columns.columns.tolist()

['End of period',
 'Revenues from sales',
 'Operating profit/loss',
 'Net profit/loss attributable to equity holders of the parent',
 'Total comprehensive income attributable to equity holders',
 'Depreciation',
 'Cash flow from operating activities',
 'Cash flow from investing activities',
 'Cash flow from financing activities',
 'Aktywa',
 'Non-current assets',
 'Current assets',
 'Equity shareholders of the parent',
 'Non-current liabilities',
 'Current liabilities',
 "Number of shares ('000)",
 'Earnings per share',
 'Total comprehensive income per share',
 'Book value per share',
 'Accounting standard',
 'filename',
 'Property, plant and equipment',
 'Exploration for and evaluation of mineral resources',
 'Intangible assets',
 'Goodwill',
 'Investment property',
 'Right-of-use assets',
 'Investment in affiliates',
 'Non-current financial assets',
 'Non-current loans and receivables',
 'Deferred income tax',
 'Non-current deferred charges and accruals',
 'Non-current derivative ins

In [51]:
output_csv_path = 'dataframes/dataframe_gpw.csv'
df_cleaned_columns.to_csv(output_csv_path, index=False)