In [28]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader 
import re
from langchain_huggingface import HuggingFaceEmbeddings
import glob
import os

In [29]:
def cosine_similarity(A, B):
    return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

In [30]:
alphabet = "abcdefghijklmnopqrstuvwxyz"

# Load Classes Descriptions

In [31]:
df = pd.read_csv("NACE_Rev2_Structure_Explanatory_Notes_EN__1_.tsv", sep="\t")
df

Unnamed: 0,ORDER_KEY,ID,CODE,NAME,PARENT_ID,LEVEL,Includes,IncludesAlso,Excludes
0,200,A,A,"AGRICULTURE, FORESTRY AND FISHING",,1,This section includes the exploitation of vege...,,
1,300,01,01,"Crop and animal production, hunting and relate...",A,2,"This division includes two basic activities, n...",This division also includes service activities...,Agricultural activities exclude any subsequent...
2,350,011,01.1,Growing of non-perennial crops,01,3,This group includes the growing of non-perenni...,,
3,355,0111,01.11,"Growing of cereals (except rice), leguminous c...",011,4,This class includes all forms of growing of ce...,,"This class excludes:\n- growing of rice, see 0..."
4,360,0112,01.12,Growing of rice,011,4,This class includes:\n- growing of rice (inclu...,,
...,...,...,...,...,...,...,...,...,...
991,51295,9820,98.20,Undifferentiated service-producing activities ...,982,4,This class includes the undifferentiated subsi...,,
992,51495,U,U,ACTIVITIES OF EXTRATERRITORIAL ORGANISATIONS A...,,1,,,
993,51595,99,99,Activities of extraterritorial organisations a...,U,2,,,
994,51645,990,99.0,Activities of extraterritorial organisations a...,99,3,,,


In [32]:
df_first_level = df[df["ID"].apply(lambda x: not x.isnumeric())]
df_first_level = df_first_level.dropna(subset=["Includes"])
df_first_level.reset_index(drop=True, inplace=True)
df_first_level

Unnamed: 0,ORDER_KEY,ID,CODE,NAME,PARENT_ID,LEVEL,Includes,IncludesAlso,Excludes
0,200,A,A,"AGRICULTURE, FORESTRY AND FISHING",,1,This section includes the exploitation of vege...,,
1,1945,B,B,MINING AND QUARRYING,,1,Mining and quarrying include the extraction of...,,This section excludes:\n- processing of the ex...
2,3220,C,C,MANUFACTURING,,1,This section includes the physical or chemical...,,
3,15650,D,D,"ELECTRICITY, GAS, STEAM AND AIR CONDITIONING S...",,1,This section includes the activity of providin...,Also included is the provision of steam and ai...,This section excludes the operation of water a...
4,16260,E,E,"WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND R...",,1,This section includes activities related to th...,Activities of water supply are also grouped in...,
5,17650,F,F,CONSTRUCTION,,1,This section includes general construction and...,This section also includes the development of ...,If these activities are carried out not for la...
6,21175,G,G,WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VE...,,1,This section includes wholesale and retail sal...,,
7,25630,H,H,TRANSPORTATION AND STORAGE,,1,This section includes the provision of passeng...,,This section excludes:\n- major repair or alte...
8,28260,I,I,ACCOMMODATION AND FOOD SERVICE ACTIVITIES,,1,This section includes the provision of short-s...,,This section excludes the provision of long-te...
9,29055,J,J,INFORMATION AND COMMUNICATION,,1,This section includes the production and distr...,,


In [57]:
df_first_level["IncludesAlso"] = df_first_level["IncludesAlso"].fillna("")
labels = [df_first_level.loc[i, "Includes"] + " " + df_first_level.loc[i, "IncludesAlso"] for i in range(len(df_first_level))]
df_first_level["Labels"] = labels

# Load Reports

In [33]:
pdf_path = "annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf"

In [34]:
loader = PyPDFLoader(pdf_path)
seiten_docs = loader.load()

In [35]:
seiten_docs[0].page_content

'Annual Report 2023'

In [36]:
text = " ".join([page.page_content for page in seiten_docs])

In [44]:
sentences = text.split(".")

In [None]:
sentences = [sentence.lower().strip() for sentence in sentences]
sentences = [sentence.replace(" -", "") for sentence in sentences]
sentences = [sentence.replace("\n", "") for sentence in sentences]
sentences = [re.sub(r"[^a-zA-ZäöüÄÖÜß\s]", "", sentence) for sentence in sentences]

In [58]:
sentences = [". ".join(sentences[i:i+3]) for i in range(len(sentences)-2)]

In [59]:
embedding_model = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2",
                #model_kwargs=model_kwargs,
                #encode_kwargs=encode_kwargs
            )

### Embed sentences

In [65]:
df_sentences = pd.DataFrame(sentences, columns=["Sentences"])
df_sentences

Unnamed: 0,Sentences
0,annual report 2023 contents to our shareholde...
1,we were able to assert ourselves strongly in t...
2,this is also reflected in our balance sheet. w...
3,we also continued to make progress in the imp...
4,our company has shown that we can achieve soli...
...,...
6786,"the words “anticipate”, “assume”, “believe”, “..."
6787,these statements are subject to many risks and...
6788,if any of these risks and uncertainties materi...
6789,we do not intend or assume any obligation to u...


In [66]:
# df_sentences = df_sentences[df_sentences["Sentences"].apply(len) > 10]
# # df_sentences

In [None]:
df_sentences["Embeddings"] = embedding_model.embed_documents(df_sentences["Sentences"].to_list())

In [39]:
df_sentences


Unnamed: 0,Sentences,Embeddings
0,annual report contents to our shareholders c...,"[0.006745141465216875, -0.02605125866830349, 0..."
1,we were \nable to assert ourselves strongly in...,"[0.02165076695382595, 0.028876876458525658, -0..."
2,this is also reflected in our balance sheet. w...,"[-0.007771308533847332, 0.03087206557393074, -..."
3,we also \ncontinued to make progress in the i...,"[0.005535706412047148, 0.006914899218827486, -..."
4,our company has shown that we can achieve soli...,"[-0.018054092302918434, -0.0022909939289093018..."
...,...,...
6786,the words anticipate assume believe estimate e...,"[0.017479166388511658, -0.005689783953130245, ..."
6787,these statements are subject to many risks and...,"[0.013225414790213108, -0.012154914438724518, ..."
6788,if any of \nthese risks and uncertainties mate...,"[-0.0029579626861959696, -0.027874447405338287..."
6789,we do not intend or assume any \nobligation to...,"[0.003985402174293995, -0.03476681560277939, 0..."


### Embed classes

In [None]:
df_first_level["Embeddings"] = embedding_model.embed_documents(df_first_level["Labels"].to_list())

### Caclulate Similarities

In [41]:
for i, row in df_first_level.iterrows(): 
    similarities = df_sentences["Embeddings"].apply(lambda x: cosine_similarity(x, row.Embeddings))
    df_sentences[f"Scores_{alphabet[i]}_{row.NAME}"] = similarities

In [42]:
df_sentences

Unnamed: 0,Sentences,Embeddings,"Scores_a_AGRICULTURE, FORESTRY AND FISHING",Scores_b_MINING AND QUARRYING,Scores_c_MANUFACTURING,"Scores_d_ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY","Scores_e_WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES",Scores_f_CONSTRUCTION,Scores_g_WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES,Scores_h_TRANSPORTATION AND STORAGE,...,Scores_j_INFORMATION AND COMMUNICATION,Scores_k_FINANCIAL AND INSURANCE ACTIVITIES,Scores_l_REAL ESTATE ACTIVITIES,"Scores_m_PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES",Scores_n_ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES,Scores_o_PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY,Scores_p_EDUCATION,Scores_q_HUMAN HEALTH AND SOCIAL WORK ACTIVITIES,"Scores_r_ARTS, ENTERTAINMENT AND RECREATION",Scores_s_OTHER SERVICE ACTIVITIES
0,annual report contents to our shareholders c...,"[0.006745141465216875, -0.02605125866830349, 0...",0.053221,0.218621,0.226157,0.172885,0.154571,0.175512,0.313312,0.271190,...,0.150071,0.268123,0.100351,0.149901,0.239133,0.034433,0.113469,0.019348,0.079214,0.198717
1,we were \nable to assert ourselves strongly in...,"[0.02165076695382595, 0.028876876458525658, -0...",0.107877,0.126508,0.099025,0.141467,0.110167,0.118488,0.154061,0.144755,...,0.131387,0.249468,0.031527,0.199895,0.164086,0.100500,0.057380,0.097813,0.050175,0.163281
2,this is also reflected in our balance sheet. w...,"[-0.007771308533847332, 0.03087206557393074, -...",0.113854,0.184989,0.141328,0.131966,0.162314,0.133162,0.152213,0.186818,...,0.117234,0.298575,0.025631,0.191575,0.192109,0.072389,0.055427,0.078322,0.091393,0.181165
3,we also \ncontinued to make progress in the i...,"[0.005535706412047148, 0.006914899218827486, -...",0.085710,0.172668,0.183356,0.200209,0.155730,0.119029,0.201825,0.195692,...,0.108926,0.195692,0.023566,0.159675,0.175295,-0.017772,0.034972,-0.003299,0.056280,0.152856
4,our company has shown that we can achieve soli...,"[-0.018054092302918434, -0.0022909939289093018...",0.090642,0.137103,0.167834,0.254343,0.115490,0.110802,0.208414,0.195591,...,0.092041,0.169739,0.022026,0.108945,0.133978,-0.038080,0.035440,-0.026524,0.048150,0.114146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6786,the words anticipate assume believe estimate e...,"[0.017479166388511658, -0.005689783953130245, ...",0.176344,0.234997,0.273353,0.228834,0.249176,0.223838,0.343639,0.289318,...,0.211135,0.312293,0.147959,0.274838,0.327322,0.194322,0.174369,0.148016,0.144245,0.226000
6787,these statements are subject to many risks and...,"[0.013225414790213108, -0.012154914438724518, ...",0.195309,0.250534,0.274509,0.227151,0.225247,0.194244,0.359238,0.304014,...,0.205666,0.323856,0.106472,0.232427,0.294869,0.136679,0.134686,0.084007,0.132333,0.229933
6788,if any of \nthese risks and uncertainties mate...,"[-0.0029579626861959696, -0.027874447405338287...",0.147926,0.294370,0.319057,0.303856,0.244651,0.232128,0.271365,0.257622,...,0.174219,0.138818,0.062370,0.222347,0.174135,0.154577,0.208455,0.085658,0.039856,0.162408
6789,we do not intend or assume any \nobligation to...,"[0.003985402174293995, -0.03476681560277939, 0...",0.118962,0.282397,0.279907,0.274509,0.170187,0.185968,0.246568,0.238639,...,0.171048,0.086916,0.038899,0.133927,0.105758,0.105269,0.181645,0.007480,0.014592,0.103599


# Create Function and generate data for multiple reports!

In [None]:
def create_sentence_nace_code_similarities(pdf_path): 
    # Load Classes Descriptions
    df = pd.read_csv("NACE_Rev2_Structure_Explanatory_Notes_EN__1_.tsv", sep="\t")

    df_first_level = df[df["ID"].apply(lambda x: not x.isnumeric())]
    df_first_level = df_first_level.dropna(subset=["Includes"])
    df_first_level.reset_index(drop=True, inplace=True)
    df_first_level["IncludesAlso"] = df_first_level["IncludesAlso"].fillna("")
    labels = [df_first_level.loc[i, "Includes"] + " " + df_first_level.loc[i, "IncludesAlso"] for i in range(len(df_first_level))]
    df_first_level["Labels"] = labels

    # Load Reports
    loader = PyPDFLoader(pdf_path)
    seiten_docs = loader.load()
    seiten_docs[0].page_content
    text = " ".join([page.page_content for page in seiten_docs])
    sentences = text.split(".")
    sentences = [sentence.lower().strip() for sentence in sentences]
    sentences = [re.sub(r"[^a-zA-ZäöüÄÖÜß\s]", "", sentence) for sentence in sentences]

    embedding_model = HuggingFaceEmbeddings(
                    #model_name="sentence-transformers/all-MiniLM-L12-v2",
                    model_name="sentence-transformers/all-mpnet-base-v2",
                    #model_kwargs=model_kwargs,
                    #encode_kwargs=encode_kwargs
                )
    
    ### Embed sentences
    df_sentences = pd.DataFrame(sentences, columns=["Sentences"])
    df_sentences["Embeddings"] = embedding_model.embed_documents(df_sentences["Sentences"].to_list())



    ### Embed classes
    df_first_level["Embeddings"] = embedding_model.embed_documents(df_first_level["Labels"].to_list())
    ### Caclulate Similarities
    for i, row in df_first_level.iterrows(): 
        similarities = df_sentences["Embeddings"].apply(lambda x: cosine_similarity(x, row.Embeddings))
        df_sentences[f"Scores_{alphabet[i]}_{row.NAME}"] = similarities
    return df_sentences

In [44]:
pdf_paths = glob.glob("annual_reports/*.pdf")
pdf_paths

['annual_reports/Deutsche_Annual-Report-2023.pdf',
 'annual_reports/Zalando-SE_DE_2024.pdf',
 'annual_reports/bayer-annual-report-2023-2.pdf',
 'annual_reports/adidas-ar23.pdf',
 'annual_reports/mercedes-benz-annual-report-2023-incl-combined-management-report-mbg-ag-2.pdf',
 'annual_reports/Siemens-Annual-Report-2023.pdf',
 'annual_reports/thyssenkrupp-GB_2023-2024_EN_WEB.pdf',
 'annual_reports/heidelberg-materials_2023.pdf',
 'annual_reports/rheinmetall-ag_2023.pdf',
 'annual_reports/conti_annual-report-2023-data.pdf']

In [45]:
df = create_sentence_nace_code_similarities(pdf_path)

In [46]:
for pdf_path in pdf_paths: 
    df = create_sentence_nace_code_similarities(pdf_path)
    df.to_csv(f"paragraph_sliding_window_3_similarities/{os.path.basename(pdf_path).split('.')[0]}.csv")