In [421]:
import pandas as pd
import numpy as np
import os

# Notes
WSZYSTKIE PLIKI SĄ IDENTYCZNE. WSZYSTKIE TO EXCEL - 14 kart
Czyszczenie:
•	Wszystkie NaN na 0
•	Skalowanie przed modelem
•	Inżynieria cech (wskaźniki ekonomiczne, różnica między każdym z elementów)

TESTY DO PRZEPROWADZENIA:
•	Uwzględnienie wszystkiego
•	Skupienie się na zmianach kapitałowych
•	Ograniczenie tylko główne wartości
•	Grupowanie po przedsiębiorstwie? 

Podsumowanie co wyciągnąć z pliku:
Dodatkowa kolumna jako nazwa pliku
C – nazwa kolumny w DF

(Kolumny: OD M DO AB. Wiersze: Od 3 do 18 włącznie)
(wiersze 30 do 93 włącznie)
Wiersze 255 – 275 włącznie
Wiersze 279 – 287 włącznie (dane w %)
Wiersze 290 – 293 włącznie

In [422]:
# Folder that is scanned for the .xlsx workbooks; the path is relative to the
# notebook's working directory.
# NOTE(review): 'dataa/' looks like it could be a typo for 'data/', but the recorded
# output of the existence check reports that the folder exists — confirm the name.
folder_path = 'dataa/'

In [423]:
# Sanity check before any workbook is read: report whether the input folder is reachable.
# This only prints a message; it does not stop the notebook when the folder is missing.
folder_status = (
    "Folder exists. Proceeding with file processing."
    if os.path.exists(folder_path)
    else "Folder does not exist. Check the path."
)
print(folder_status)

Folder exists. Proceeding with file processing.


In [424]:
# TEST: przejście po każdym pliku i wyciągnięcie z arkusza 'Info' nazwy firmy i sektora

In [425]:
def test_process_excel_files(folder_path):
    results = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            file_path = os.path.join(folder_path, filename)
            try:
                with pd.ExcelFile(file_path) as xls:
                    company_name = pd.read_excel(xls, 'Info', usecols="B", skiprows=1, nrows=1).values[0][0]
                    sector = pd.read_excel(xls, 'Info', usecols="E", skiprows=19, nrows=1).values[0][0]
                
                result = {
                    'filename': filename,
                    'Company Name': company_name,
                    'Sector': sector
                }
                results.append(result)
                
                print(f"Dane z {filename} zostały przetworzone.")
            except Exception as e:
                print(f"Błąd przy przetwarzaniu pliku {filename}: {e}")

    results_df = pd.DataFrame(results)
    return results_df

In [426]:
# Smoke-test the 'Info' sheet extraction on every workbook found in folder_path.
test_process_excel_files(folder_path)

Dane z ZREMBCH.xlsx zostały przetworzone.


Unnamed: 0,filename,Company Name,Sector
0,ZREMBCH.xlsx,Zremb-Chojnice SA,urządzenia mechaniczne


# Przygotowanie DF z danymi (kolumny)

In [427]:
# # Działająca implementacja pobierania wierszy z row_indices i wprowadzania ich do dataframu
# def process_excel_data(base_folder):
#     folder_path = os.path.join(base_folder)  # Access 'data' subfolder
#     all_data = []  # List to store all data frames for concatenation
# 
#     # Specific rows from which to pull header data, adjusted for zero-based index
#     row_indices = [2, 29, 254, 278, 289]
# 
#     for filename in os.listdir(folder_path):
#         if filename.endswith('.xlsx'):
#             file_path = os.path.join(folder_path, filename)
#             try:
#                 # Read only column 'C' (3rd column, index 2), assuming no headers (header=None)
#                 data = pd.read_excel(file_path, sheet_name='QS', usecols=[2], header=None)
# 
#                 # Extract headers from specified rows
#                 headers = data.iloc[row_indices].squeeze().tolist()  # Get a flat list of headers
# 
#                 # Create a new DataFrame with these headers, filling with example data
#                 # Let's use 1 for all cells for demonstration; replace as needed
#                 new_df = pd.DataFrame([1] * len(headers), index=headers).T  # Transpose to flip rows and columns
#                 new_df['filename'] = filename  # Add filename as a column
# 
#                 # Append new DataFrame to list
#                 all_data.append(new_df)
#             except Exception as e:
#                 print(f"Error processing file {filename}: {e}")
# 
#     # Concatenate all data frames into one
#     results_df = pd.concat(all_data, ignore_index=True)
#     return results_df
# 
# # Usage example
# df_results = process_excel_data(folder_path)
# df_results

In [428]:
# df_results = process_excel_data(folder_path)

In [429]:
# df_results

In [430]:
def process_excel_data(base_folder):
    """Summarise the column-C labels of the 'QS' sheet for every workbook in a folder.

    For each entry of ``base_folder`` whose name ends with '.xlsx', the third column
    of sheet 'QS' is read without a header row. The labels inside each configured
    row range are joined with ', ' into one string, giving one result row per
    workbook with the columns 'Range_1' .. 'Range_5' and 'filename'.

    A workbook that raises any exception (unreadable file, missing sheet, sheet
    shorter than the configured ranges) is reported on stdout and skipped.

    Args:
        base_folder: Directory containing the Excel workbooks.

    Returns:
        pandas.DataFrame with one row per processed workbook, ordered by file name.
        When no workbook could be processed the frame is empty but still has the
        'Range_N' and 'filename' columns.
    """
    # iloc positions are zero-based and the stop is exclusive, so the 1-based,
    # inclusive Excel rows A..B map to range(A - 1, B). This assumes the sheet's
    # first row is read as position 0 (header=None).
    row_ranges = [
        range(2, 18),     # Excel rows 3 to 18 inclusive
        range(29, 93),    # Excel rows 30 to 93 inclusive
        range(254, 275),  # Excel rows 255 to 275 inclusive
        range(278, 287),  # Excel rows 279 to 287 inclusive
        range(289, 293),  # Excel rows 290 to 293 inclusive
    ]
    range_columns = [f'Range_{i + 1}' for i in range(len(row_ranges))]

    records = []
    # Sorted so the row order of the result does not depend on the filesystem.
    for filename in sorted(os.listdir(base_folder)):
        if not filename.endswith('.xlsx'):
            continue

        file_path = os.path.join(base_folder, filename)
        try:
            # Read only column 'C' (3rd column, index 2) with no header row.
            data = pd.read_excel(file_path, sheet_name='QS', usecols=[2], header=None)
            # Select the single column explicitly; squeeze() would collapse a
            # one-row selection to a scalar that has no tolist().
            labels = data.iloc[:, 0]

            record = {
                column: ', '.join(map(str, labels.iloc[row_range]))
                for column, row_range in zip(range_columns, row_ranges)
            }
            record['filename'] = filename
            records.append(record)
        except Exception as e:
            # Best effort: one bad workbook must not abort the whole scan.
            print(f"Error processing file {filename}: {e}")

    # Building the frame once from the records also covers the "nothing processed"
    # case, where pd.concat([]) would raise ValueError.
    return pd.DataFrame(records, columns=range_columns + ['filename'])

# Usage example: collect the aggregated 'QS' labels for every workbook in folder_path.

df_results = process_excel_data(folder_path)

In [431]:
# Display the collected frame (the cell's last expression is rendered by Jupyter).
df_results

Unnamed: 0,Range_1,Range_2,Range_3,Range_4,Range_5,filename
0,"End of period, Start of period, Revenues from ...","ASSETS, Non-current assets, Property, plant an...","Gross profit margin on sales, Operating profit...","Return on equity (ROE), Return on assets (ROA)...","EBITDA, Current financial liabilities, Non-cur...",ZREMBCH.xlsx


In [432]:
import os
import pandas as pd

def process_excel_data(base_folder):
    """Build an empty feature-table skeleton from the 'QS' sheet labels.

    For each entry of ``base_folder`` whose name ends with '.xlsx', the third column
    of sheet 'QS' is read without a header row. Every non-blank label found in the
    configured row ranges becomes a column of the result, filled with ``None`` as a
    placeholder, and the workbook's file name is stored in the 'filename' column.

    Labels are used as they appear in the sheet (converted to ``str`` and stripped),
    so a label that itself contains ', ' stays a single column.

    A workbook that raises any exception (unreadable file, missing sheet, sheet
    shorter than the configured ranges) is reported on stdout and skipped.

    Args:
        base_folder: Directory containing the Excel workbooks.

    Returns:
        pandas.DataFrame with one row per processed workbook, ordered by file name.
        When no workbook could be processed, an empty frame with only the
        'filename' column is returned.
    """
    # iloc positions are zero-based and the stop is exclusive, so the 1-based,
    # inclusive Excel rows A..B map to range(A - 1, B). This assumes the sheet's
    # first row is read as position 0 (header=None).
    row_ranges = [
        range(2, 18),     # Excel rows 3 to 18 inclusive
        range(29, 93),    # Excel rows 30 to 93 inclusive
        range(254, 275),  # Excel rows 255 to 275 inclusive
        range(278, 287),  # Excel rows 279 to 287 inclusive
        range(289, 293),  # Excel rows 290 to 293 inclusive
    ]

    records = []
    # Sorted so the row order of the result does not depend on the filesystem.
    for filename in sorted(os.listdir(base_folder)):
        if not filename.endswith('.xlsx'):
            continue

        file_path = os.path.join(base_folder, filename)
        try:
            # Read only column 'C' (3rd column, index 2) with no header row.
            data = pd.read_excel(file_path, sheet_name='QS', usecols=[2], header=None)
            labels = data.iloc[:, 0]

            record = {}
            for row_range in row_ranges:
                for label in labels.iloc[row_range]:
                    # A blank cell would otherwise turn into a column named 'nan'.
                    if pd.isna(label):
                        continue
                    # Use the label directly; joining and re-splitting on ', '
                    # would cut labels such as "Property, plant and equipment".
                    record[str(label).strip()] = None  # placeholder value

            record['filename'] = filename
            records.append(record)
        except Exception as e:
            # Best effort: one bad workbook must not abort the whole scan.
            print(f"Error processing file {filename}: {e}")

    if not records:
        # pd.concat([]) used to raise ValueError here; return an empty frame instead.
        return pd.DataFrame(columns=['filename'])
    return pd.DataFrame(records)


# Rebuild df_results with one column per 'QS' label and display it.
df_results = process_excel_data(folder_path)
df_results

Unnamed: 0,End of period,Start of period,Revenues from sales,Operating profit/loss,Net profit/loss attributable to equity holders of the parent,Total comprehensive income attributable to equity holders,Depreciation,Cash flow from operating activities,Cash flow from investing activities,Cash flow from financing activities,...,Leverage (EM),Asset utilization (AU),Load gross profit,Load operating profit,EBITDA margin,EBITDA,Current financial liabilities,Non-current financial liabilities,Financial liabilities (total),filename
0,,,,,,,,,,,...,,,,,,,,,,ZREMBCH.xlsx
