## term analysis and statistics

### encoding TNFD glossary terms

In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

In [None]:
# Load the sentence-embedding model on CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
# Embed the text of every glossary "Definition" in a single batched encode call
# (instead of one model.encode call per row), then split the returned 2-D array
# back into one vector per row so each cell of "embedding" holds its own array.
df_tfnd_glossary_2023["embedding"] = list(
    model.encode(df_tfnd_glossary_2023["Definition"].tolist(), convert_to_numpy=True)
)

In [None]:
# Write the glossary, including the "embedding" column, to a JSON file
# with one record per row.
df_tfnd_glossary_2023.to_json(
    path_or_buf="data/df_tfnd_glossary_2023_embedded.json",
    orient="records",
)

In [7]:
# Reload the embedded glossary from its JSON file (one record per row).
df_tfnd_glossary_2023 = pd.read_json(
    path_or_buf="data/df_tfnd_glossary_2023_embedded.json",
    orient="records",
)
# Convert each stored embedding back into a float32 NumPy array.
df_tfnd_glossary_2023["embedding"] = df_tfnd_glossary_2023["embedding"].map(
    lambda values: np.array(values, dtype=np.float32)
)

In [8]:
# Display the reloaded glossary frame as the cell output (Term, Definition and embedding columns).
df_tfnd_glossary_2023

Unnamed: 0.1,Unnamed: 0,Term,Definition,embedding
0,0,Abiotic flows,Abiotic flows are contributions to benefits fr...,"[-0.029361257, 0.006075624, -0.018649256, 0.04..."
1,1,Acute risk,"Occurrence of short-term, specific events that...","[-0.038341668, -0.0003466105, 0.009580128, -0...."
2,2,Adaptation,Adjustment in natural or human systems to a ne...,"[-0.022162942, -0.044189833, 0.0013946677, -0...."
3,3,Additional conservation actions,A broad range of activities intended to benefi...,"[0.009307631, 0.02553423, -0.012623155, 0.0058..."
4,4,Additional disclosure metrics,Metrics suggested by the TNFD that a company o...,"[-0.023433544, 0.009369389, -0.014957044, -0.0..."
...,...,...,...,...
352,352,Water sources,Water sources include water withdrawn from sur...,"[0.017979847, 0.00714735, 0.009741537, 0.02206..."
353,353,Water stress (areas of),Water stressed (region): defined in three lev...,"[-0.01709339, -0.08745881, -0.0017973423, -0.0..."
354,354,Water withdrawal,The sum of all water drawn into the boundaries...,"[0.013102424, -0.07435216, 0.013924917, 0.0146..."
355,355,Wild species,Refers to populations of any native species th...,"[-0.007327114, -0.020553412, 0.007337141, -0.0..."


### relevant terms

In [9]:
# Names of the glossary entries (risks and opportunities) to analyse.
terms = [
    'Nature-related systemic risks',
    'Nature-related physical risks',
    'Nature-related transition risks',
    'Nature-related opportunities',
    'Ecosystem protection, restoration and regeneration opportunity',
]
# Keep only the glossary rows whose 'Term' exactly matches one of the names above.
risks_opportunities_tnfd_glossary = df_tfnd_glossary_2023.loc[
    df_tfnd_glossary_2023['Term'].isin(terms)
]

In [9]:
# Glossary term names grouped as risks.
risk_glossary_terms = [
    'Ecosystem stability risk',
    'Nature-related systemic risks',
    'Nature-related transition risks',
]
# Glossary term names grouped as opportunities.
opportunities_terms = [
    'Sustainable use of natural resources opportunity',
    'Ecosystem protection, restoration and regeneration opportunity',
]


### fetching relevant chunks from the ECC database based on the glossary terms

In [1]:
from glossary_similarity import fetch_chunks_for_term_for_years_biodiv_subset, get_biodiversity_subset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Years to query: every year from 2015 through 2023 inclusive.
time_frame = list(range(2015, 2024))

In [10]:
# Fetch the biodiversity subset from the database once; every term below reuses it.
# NOTE(review): `driver` is never closed in this cell — confirm whether it needs explicit cleanup.
driver, biodiversity_subset = get_biodiversity_subset(time_frame,chunks_per_year=100000,streamlit_secret=False)

# One result frame per glossary term; they are combined with a single concat
# after the loop instead of re-concatenating the growing frame on every pass.
per_term_results = []
n_selected_terms = len(risks_opportunities_tnfd_glossary)

for position, (_glossary_index, row) in enumerate(risks_opportunities_tnfd_glossary.iterrows(), start=1):

    term = row['Term']
    embedding = row['embedding']

    # Progress is counted within the selected terms (1..n), not by the row's
    # index label in the full glossary.
    print(f'processing term {term} [{position} out of {n_selected_terms}] ..')

    chunks = fetch_chunks_for_term_for_years_biodiv_subset(driver,
                                                           time_frame,
                                                           term,
                                                           embedding,
                                                           biodiversity_subset,
                                                           streamlit_secret=False,
                                                           chunks_per_year=100)
    results = pd.DataFrame(chunks)
    # Tag every fetched chunk with the term it was retrieved for and that term's embedding.
    results['term_embedding'] = [np.array(embedding, dtype=np.float32)] * len(results)
    results['term'] = term

    print(f'results for {term}: {len(results)}')

    per_term_results.append(results)

# pd.concat raises on an empty list, so fall back to an empty frame when no term was selected.
all_terms_similar_embeddings = pd.concat(per_term_results) if per_term_results else pd.DataFrame()


returning biodiv also
processing term Ecosystem protection, restoration and regeneration opportunity [80 out of 357] ..
results for Ecosystem protection, restoration and regeneration opportunity: 66
processing term Nature-related opportunities [192 out of 357] ..
results for Nature-related opportunities: 86
processing term Nature-related physical risks [193 out of 357] ..
results for Nature-related physical risks: 64
processing term Nature-related systemic risks [196 out of 357] ..
results for Nature-related systemic risks: 43
processing term Nature-related transition risks [197 out of 357] ..
results for Nature-related transition risks: 60
