In [23]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader 
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import re
import glob
import os
import tqdm

In [24]:
alphabet = "abcdefghijklmnopqrstuvwxyz"

In [25]:
def cosine_similarity(A, B):
    return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

# Load Classes Descriptions

In [26]:
df = pd.read_csv("NACE_Rev2_Structure_Explanatory_Notes_EN__1_.tsv", sep="\t")
df

Unnamed: 0,ORDER_KEY,ID,CODE,NAME,PARENT_ID,LEVEL,Includes,IncludesAlso,Excludes
0,200,A,A,"AGRICULTURE, FORESTRY AND FISHING",,1,This section includes the exploitation of vege...,,
1,300,01,01,"Crop and animal production, hunting and relate...",A,2,"This division includes two basic activities, n...",This division also includes service activities...,Agricultural activities exclude any subsequent...
2,350,011,01.1,Growing of non-perennial crops,01,3,This group includes the growing of non-perenni...,,
3,355,0111,01.11,"Growing of cereals (except rice), leguminous c...",011,4,This class includes all forms of growing of ce...,,"This class excludes:\n- growing of rice, see 0..."
4,360,0112,01.12,Growing of rice,011,4,This class includes:\n- growing of rice (inclu...,,
...,...,...,...,...,...,...,...,...,...
991,51295,9820,98.20,Undifferentiated service-producing activities ...,982,4,This class includes the undifferentiated subsi...,,
992,51495,U,U,ACTIVITIES OF EXTRATERRITORIAL ORGANISATIONS A...,,1,,,
993,51595,99,99,Activities of extraterritorial organisations a...,U,2,,,
994,51645,990,99.0,Activities of extraterritorial organisations a...,99,3,,,


In [27]:
df_first_level = df[df["ID"].apply(lambda x: not x.isnumeric())]
df_first_level = df_first_level.dropna(subset=["Includes"])
df_first_level.reset_index(drop=True, inplace=True)
df_first_level

Unnamed: 0,ORDER_KEY,ID,CODE,NAME,PARENT_ID,LEVEL,Includes,IncludesAlso,Excludes
0,200,A,A,"AGRICULTURE, FORESTRY AND FISHING",,1,This section includes the exploitation of vege...,,
1,1945,B,B,MINING AND QUARRYING,,1,Mining and quarrying include the extraction of...,,This section excludes:\n- processing of the ex...
2,3220,C,C,MANUFACTURING,,1,This section includes the physical or chemical...,,
3,15650,D,D,"ELECTRICITY, GAS, STEAM AND AIR CONDITIONING S...",,1,This section includes the activity of providin...,Also included is the provision of steam and ai...,This section excludes the operation of water a...
4,16260,E,E,"WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND R...",,1,This section includes activities related to th...,Activities of water supply are also grouped in...,
5,17650,F,F,CONSTRUCTION,,1,This section includes general construction and...,This section also includes the development of ...,If these activities are carried out not for la...
6,21175,G,G,WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VE...,,1,This section includes wholesale and retail sal...,,
7,25630,H,H,TRANSPORTATION AND STORAGE,,1,This section includes the provision of passeng...,,This section excludes:\n- major repair or alte...
8,28260,I,I,ACCOMMODATION AND FOOD SERVICE ACTIVITIES,,1,This section includes the provision of short-s...,,This section excludes the provision of long-te...
9,29055,J,J,INFORMATION AND COMMUNICATION,,1,This section includes the production and distr...,,


# Load Reports

In [28]:
pdf_path = "annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf"

In [29]:
loader = PyPDFLoader(pdf_path)
seiten_docs = loader.load()

In [30]:
paragraph_splitter = CharacterTextSplitter(
    separator="\n\n",  
    chunk_size=4_096,  
    chunk_overlap=0    
)

In [31]:
paragraph_docs = paragraph_splitter.split_documents(seiten_docs)
paragraph_docs

[Document(metadata={'source': 'annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf', 'page': 0}, page_content='Annual Report 2023'),
 Document(metadata={'source': 'annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf', 'page': 1}, page_content='Contents  To Our Shareholders Combined Management Report Corporate Governance Consolidated Financial Statements Further Information \n   \nAnnual Report 2023   |   Mercedes-Benz Group2\nContents To Our Shareholders Combined Management Report Corporate Governance  Consolidated Financial Statements Further Information \n   \nAnnual Report 2023   |   Mercedes-Benz Group2\n 5 TO OUR SHAREHOLDERS\n 30 COMBINED MANAGEMENT REPORT\n 162 CORPORATE GOVERNANCE \n 194 CONSOLIDATED FINANCIAL STATEMENTS\n 336 FURTHER INFORMATION'),
 Document(metadata={'source': 'annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf', 'page': 3}, page_content

In [32]:
paragraphs = [re.sub(r"[^a-zA-ZäöüÄÖÜß.\s]", "", doc.page_content).lower().strip() for doc in paragraph_docs]
paragraphs

['annual report',
 'contents  to our shareholders combined management report corporate governance consolidated financial statements further information \n   \nannual report       mercedesbenz group\ncontents to our shareholders combined management report corporate governance  consolidated financial statements further information \n   \nannual report       mercedesbenz group\n  to our shareholders\n  combined management report\n  corporate governance \n  consolidated financial statements\n  further information',
 'to our \nshareholders',
 'contents  to our shareholders combined management report corporate governance consolidated financial statements further information \n   \nannual report       mercedesbenz group\nto our shareholders\n  letter from the ceo \n  the board of management\n  report of the supervisory board\n  the supervisory board\n  objectives and strategy\n  mercedesbenz cars strategy\n  mercedesbenz vans strategy\n  mercedesbenz mobility strategy',
 'contents  to our sha

In [33]:
embedding_model = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2",
                #model_kwargs=model_kwargs,
                #encode_kwargs=encode_kwargs
            )

In [34]:
def embed_paragraph(paragraph: str): 
    sentences = paragraph.split(".")
    embeddings = np.array(embedding_model.embed_documents(sentences))
    paragraph_embedding = embeddings.mean(axis=0)

    return paragraph_embedding

### Embed sentences

In [35]:
df_paragraphs = pd.DataFrame(paragraphs, columns=["Paragraphs"])
df_paragraphs["Embeddings"] = df_paragraphs["Paragraphs"].apply(embed_paragraph)

In [36]:
df_paragraphs

Unnamed: 0,Paragraphs,Embeddings
0,annual report,"[-0.006913646589964628, 0.05985521897673607, 0..."
1,contents to our shareholders combined managem...,"[-0.022289827466011047, -0.008060635067522526,..."
2,to our \nshareholders,"[0.017315516248345375, 0.07084259390830994, -0..."
3,contents to our shareholders combined managem...,"[0.002538476139307022, -0.04403343424201012, 0..."
4,contents to our shareholders combined managem...,"[0.011623302330102387, 0.01717341643083712, -0..."
...,...,...
343,contents to our shareholders combined manageme...,"[-0.017827692131201427, -0.004083409633797904,..."
344,contents to our shareholders combined manageme...,"[-0.013661659749535223, 0.0391222031127351, -0..."
345,contents to our shareholders combined manageme...,"[-0.010045851104995094, 0.0008959129071709784,..."
346,contents to our shareholders combined manageme...,"[-0.006526590419179272, -0.001504494786439907,..."


### Embed classes

In [37]:
df_first_level["Embeddings"] = embedding_model.embed_documents(df_first_level["Includes"].to_list())

### Caclulate Similarities

In [38]:
for i, row in df_first_level.iterrows(): 
    similarities = df_paragraphs["Embeddings"].apply(lambda x: cosine_similarity(x, row.Embeddings))
    df_paragraphs[f"Scores_{alphabet[i]}_{row.NAME}"] = similarities

In [39]:
df_paragraphs

Unnamed: 0,Paragraphs,Embeddings,"Scores_a_AGRICULTURE, FORESTRY AND FISHING",Scores_b_MINING AND QUARRYING,Scores_c_MANUFACTURING,"Scores_d_ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY","Scores_e_WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES",Scores_f_CONSTRUCTION,Scores_g_WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES,Scores_h_TRANSPORTATION AND STORAGE,...,Scores_j_INFORMATION AND COMMUNICATION,Scores_k_FINANCIAL AND INSURANCE ACTIVITIES,Scores_l_REAL ESTATE ACTIVITIES,"Scores_m_PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES",Scores_n_ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES,Scores_o_PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY,Scores_p_EDUCATION,Scores_q_HUMAN HEALTH AND SOCIAL WORK ACTIVITIES,"Scores_r_ARTS, ENTERTAINMENT AND RECREATION",Scores_s_OTHER SERVICE ACTIVITIES
0,annual report,"[-0.006913646589964628, 0.05985521897673607, 0...",0.131346,0.174269,0.112855,0.102875,0.184829,0.129126,0.159021,0.176941,...,0.130393,0.274346,0.136749,0.233465,0.182769,0.164592,0.167947,0.154159,0.163842,0.227601
1,contents to our shareholders combined managem...,"[-0.022289827466011047, -0.008060635067522526,...",0.057490,0.228440,0.235181,0.168831,0.172532,0.177054,0.289306,0.250577,...,0.186617,0.303568,0.152856,0.134985,0.256748,0.065791,0.139274,0.044738,0.100163,0.189730
2,to our \nshareholders,"[0.017315516248345375, 0.07084259390830994, -0...",0.106430,0.093924,0.111387,0.159619,0.149587,0.099788,0.206487,0.235349,...,0.170232,0.250110,0.140586,0.156105,0.182338,0.066004,0.014767,0.042707,0.073823,0.204501
3,contents to our shareholders combined managem...,"[0.002538476139307022, -0.04403343424201012, 0...",0.075522,0.240978,0.253093,0.205314,0.201930,0.192142,0.346582,0.329523,...,0.219434,0.317416,0.140766,0.186468,0.287623,0.073870,0.133742,0.068543,0.116835,0.236575
4,contents to our shareholders combined managem...,"[0.011623302330102387, 0.01717341643083712, -0...",0.142651,0.201012,0.229479,0.220124,0.212644,0.216804,0.318695,0.303707,...,0.222782,0.327549,0.146197,0.288525,0.332984,0.125931,0.122539,0.121102,0.167788,0.301475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,contents to our shareholders combined manageme...,"[-0.017827692131201427, -0.004083409633797904,...",0.097195,0.137633,0.134290,0.101809,0.167812,0.112080,0.193147,0.212054,...,0.159801,0.234514,0.062237,0.160896,0.178241,0.036699,0.058105,-0.001851,0.120085,0.208315
344,contents to our shareholders combined manageme...,"[-0.013661659749535223, 0.0391222031127351, -0...",0.055982,0.000363,-0.011419,-0.008851,0.085830,0.008964,0.019226,0.098447,...,0.034965,0.082615,0.006768,0.081934,0.063419,-0.008076,-0.077439,-0.001774,0.079627,0.141207
345,contents to our shareholders combined manageme...,"[-0.010045851104995094, 0.0008959129071709784,...",0.179651,0.238452,0.304716,0.215107,0.249489,0.226567,0.320232,0.321956,...,0.228555,0.219555,0.114246,0.258308,0.264587,0.176108,0.161138,0.075335,0.128842,0.306299
346,contents to our shareholders combined manageme...,"[-0.006526590419179272, -0.001504494786439907,...",0.107899,0.254451,0.250091,0.203937,0.223861,0.233382,0.343590,0.318211,...,0.231455,0.326856,0.170231,0.302901,0.317869,0.125813,0.182852,0.118902,0.129416,0.306085


# Create Function and generate data for multiple reports!

In [40]:
def create_paragraph_nace_code_similarities(pdf_path): 
    
    # Load Classes Descriptions
    df = pd.read_csv("NACE_Rev2_Structure_Explanatory_Notes_EN__1_.tsv", sep="\t")

    df_first_level = df[df["ID"].apply(lambda x: not x.isnumeric())]
    df_first_level = df_first_level.dropna(subset=["Includes"])
    df_first_level.reset_index(drop=True, inplace=True)
    df_first_level
    
    loader = PyPDFLoader(pdf_path)
    seiten_docs = loader.load()
    paragraph_splitter = CharacterTextSplitter(
        separator="\n\n",  
        chunk_size=4_096,  
        chunk_overlap=0    
    )
    paragraph_docs = paragraph_splitter.split_documents(seiten_docs)
    
    paragraphs = [re.sub(r"[^a-zA-ZäöüÄÖÜß.\s]", "", doc.page_content).lower().strip() for doc in paragraph_docs]
    
    embedding_model = HuggingFaceEmbeddings(
                    #model_name="sentence-transformers/all-MiniLM-L12-v2",
                    model_name="sentence-transformers/all-mpnet-base-v2",
                    #model_kwargs=model_kwargs,
                    #encode_kwargs=encode_kwargs
                )
    
    def embed_paragraph(paragraph: str): 
        sentences = paragraph.split(".")
        embeddings = np.array(embedding_model.embed_documents(sentences))
        paragraph_embedding = embeddings.mean(axis=0)

        return paragraph_embedding

    ### Embed sentences
    df_paragraphs = pd.DataFrame(paragraphs, columns=["Paragraphs"])
    df_paragraphs["Embeddings"] = df_paragraphs["Paragraphs"].apply(embed_paragraph)

    ### Embed classes
    df_first_level["Embeddings"] = embedding_model.embed_documents(df_first_level["Includes"].to_list())

    ### Caclulate Similarities
    for i, row in df_first_level.iterrows(): 
        similarities = df_paragraphs["Embeddings"].apply(lambda x: cosine_similarity(x, row.Embeddings))
        df_paragraphs[f"Scores_{alphabet[i]}_{row.NAME}"] = similarities
    return df_paragraphs

In [41]:
pdf_paths = glob.glob("annual_reports/*.pdf")
pdf_paths

['annual_reports/Deutsche_Annual-Report-2023.pdf',
 'annual_reports/Zalando-SE_DE_2024.pdf',
 'annual_reports/bayer-annual-report-2023-2.pdf',
 'annual_reports/adidas-ar23.pdf',
 'annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf',
 'annual_reports/Siemens-Annual-Report-2023.pdf',
 'annual_reports/thyssenkrupp-GB_2023-2024_EN_WEB.pdf',
 'annual_reports/heidelberg-materials_2023.pdf',
 'annual_reports/rheinmetall-ag_2023.pdf',
 'annual_reports/conti_annual-report-2023-data.pdf']

In [42]:
df = create_paragraph_nace_code_similarities(pdf_path)

In [43]:
for pdf_path in tqdm.tqdm(pdf_paths): 
    df = create_paragraph_nace_code_similarities(pdf_path)
    df.to_csv(f"paragraph_similarities/{os.path.basename(pdf_path).split('.')[0]}.csv")

100%|██████████| 10/10 [1:21:32<00:00, 489.22s/it]
