# Data Retrieval

In [5]:
import pandas as pd
from neo4j_access import *
import numpy as np

In [2]:
# Cypher query: every Statement whose text mentions 'biodiversity' (case-insensitive),
# returned with the time and title of the ECC node it is linked to via WAS_GIVEN_AT.
query = (
    "MATCH (s:Statement)-[:WAS_GIVEN_AT]->(e:ECC) "
    "WHERE toLower(s.text) CONTAINS 'biodiversity' "
    "RETURN e.time as datetime, e.title as conference_name, s.text as statement"
)

In [3]:
# Set up the Neo4j helper, then run the Cypher query and keep the result in `df`.
# NOTE(review): QueryNeo4J presumably comes from `from neo4j_access import *`, and commit_query
# appears to return a pandas DataFrame (later cells call DataFrame methods on `df`) — confirm
# against neo4j_access.
QueryNeo4J.initialize()
df = QueryNeo4J.commit_query(query=query)

Planner COST

Runtime SLOTTED

Runtime version 5.12

+------------------+----+------------------------------------------------------------------------------------------+----------------+
| Operator         | Id | Details                                                                                  | Estimated Rows |
+------------------+----+------------------------------------------------------------------------------------------+----------------+
| +ProduceResults  |  0 | datetime, conference_name, statement                                                     |       12722848 |
| |                +----+------------------------------------------------------------------------------------------+----------------+
| +Projection      |  1 | cache[e.time] AS datetime, cache[e.title] AS conference_name, cache[s.text] AS statement |       12722848 |
| |                +----+------------------------------------------------------------------------------------------+----------------+
| +Filter

In [4]:
# The column holds Neo4j DateTime objects: go through their string form so pandas can parse
# them into a proper datetime column.
df["datetime"] = pd.to_datetime(df["datetime"].astype(str))

In [15]:
# Show the frame's schema, then persist it so the Models section can start from the pickle.
# NOTE(review): in top-to-bottom order this cell runs before the `before_tnfd_2022` column is
# created further down, so a fresh "Run All" pickles the frame without that column. The recorded
# output (which lists the column) looks like it came from out-of-order execution — make sure the
# pickle is (re)written after the flag has been added.
df.info()
df.to_pickle('data/biodiversity_dataset.pkl')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   datetime          1289 non-null   datetime64[ns, UTC]
 1   conference_name   1289 non-null   object             
 2   statement         1289 non-null   object             
 3   before_tnfd_2022  1289 non-null   int64              
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 40.4+ KB


In [6]:
# Flag each statement: 1 if it was made before 2022-01-01 (UTC), 0 otherwise.
TNFD_CUTOFF_UTC = pd.Timestamp("2022-01-01", tz="UTC")
# pd.to_datetime leaves an already-converted datetime column unchanged.
df['before_tnfd_2022'] = np.where(pd.to_datetime(df['datetime']) < TNFD_CUTOFF_UTC, 1, 0)

# Persist the frame again now that the flag exists: the earlier save cell runs before this
# column is created on a top-to-bottom run, so without this the reloaded dataset would lack it.
df.to_pickle('data/biodiversity_dataset.pkl')

In [7]:
# Summary statistics of the 0/1 flag; its mean is the share of statements flagged 1.
df["before_tnfd_2022"].describe()

count    1289.000000
mean        0.666408
std         0.471679
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: before_tnfd_2022, dtype: float64

In [8]:
# Number of statements per flag value (1 = before 2022-01-01, 0 = on or after).
df["before_tnfd_2022"].value_counts()

before_tnfd_2022
1    859
0    430
Name: count, dtype: int64

# Models

In [6]:
# Reload the dataset pickled earlier in this notebook so this section can run without Neo4j.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
df = pd.read_pickle('data/biodiversity_dataset.pkl')

In [8]:
# Peek at a single row to check which columns were loaded.
df.head(1)

Unnamed: 0,datetime,conference_name,statement,before_tnfd_2022
0,2021-01-21 05:18:00+00:00,AA - Earnings call Q4 2020,"Thanks, Bill. Let me turn to our market. In th...",1


In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [22]:
def classify_df(df, text_column, pipe, pipe_name):
    """
    Classify the texts of one DataFrame column and keep only the positively labelled rows.

    Parameters:
    - df: Pandas DataFrame containing the text to classify. It is not modified.
    - text_column: Column name in the DataFrame that contains the texts.
    - pipe: Hugging Face text-classification pipeline (any callable that takes a list of
      strings and returns one dict with a "label" key per string).
    - pipe_name: Name of the column that receives the predicted labels. It is also the
      label value a row must have to be kept (e.g. 'environmental').

    Returns:
    - df_result: New DataFrame with the rows whose predicted label equals pipe_name,
      including the predictions in the column pipe_name. The original index is preserved.
    """

    print(f"\nClassifying {pipe_name} labels...")
    texts = df[text_column].tolist()
    # Nothing to classify -> skip the model call entirely.
    classifications = pipe(texts, padding=True, truncation=True, batch_size=16) if texts else []

    # Work on a copy so the caller's frame does not silently gain a column.
    labelled = df.copy()
    labelled[pipe_name] = [x["label"] for x in classifications]

    # The explicit copy keeps later column assignments on the result free of
    # SettingWithCopyWarning.
    df_result = labelled[labelled[pipe_name] == pipe_name].copy()

    return df_result

In [None]:
# Hugging Face Hub id of the checkpoint; weights and tokenizer are both fetched from it.
name = "ESGBERT/EnvironmentalBERT-environmental"

model = AutoModelForSequenceClassification.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

# Wrap tokenizer and model into one callable text-classification pipeline.
pipe_env = pipeline(task="text-classification", model=model, tokenizer=tokenizer)


Device set to use mps:0
Device set to use mps:0


In [23]:
# Label every statement with the environmental pipeline and rebind `df` to the returned frame,
# which holds only the rows labelled 'environmental' — later cells work on that subset.
df = classify_df(df, text_column="statement", pipe=pipe_env, pipe_name='environmental')


Classifying environmental labels...


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Hugging Face Hub id of the biodiversity checkpoint (same id for tokenizer and model).
tokenizer_name = "ESGBERT/EnvironmentalBERT-biodiversity"
model_name = "ESGBERT/EnvironmentalBERT-biodiversity"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
# `model_max_length` is the documented tokenizer argument for the maximum input length;
# `max_len` is only a legacy alias for it.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, model_max_length=512)

# Wrap tokenizer and model into one callable text-classification pipeline.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) # set device=0 to use GPU

In [None]:
# Label the remaining statements with the biodiversity pipeline and rebind `df` to the returned
# frame, which holds only the rows labelled 'biodiversity'.
df = classify_df(df,text_column='statement',pipe=pipe,pipe_name='biodiversity')


Classifying environmental labels...


In [17]:
# Count the predicted labels; classify_df keeps only rows labelled 'biodiversity', so this is
# effectively the number of statements that survived both filters.
df['biodiversity'].value_counts()

biodiversity
biodiversity    314
Name: count, dtype: int64

In [31]:
# Show the 20 most recent statements (newest first).
df.sort_values(by='datetime', ascending=False).head(20)

Unnamed: 0,datetime,conference_name,statement,before_tnfd_2022,environmental,action,biodiversity
711,2022-12-15 22:31:00+00:00,Shareholder/Analyst Call - National Australia ...,Thank you. And thank you for raising that. Thi...,0,environmental,none,biodiversity
709,2022-12-15 22:31:00+00:00,Shareholder/Analyst Call - National Australia ...,Thank you. The reference to countries in the E...,0,environmental,none,biodiversity
708,2022-12-15 22:31:00+00:00,Shareholder/Analyst Call - National Australia ...,"If I may, I'd just like to ask a follow-up que...",0,environmental,action,biodiversity
106,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,"Thank you for the question. And certainly, we'...",0,environmental,action,biodiversity
105,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,"Thanks for the question, Susan. I do have the ...",0,environmental,none,biodiversity
104,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,Good afternoon. So as some of you may be aware...,0,environmental,action,biodiversity
103,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,"Thank you, Emilia. And actually, as I said, we...",0,environmental,none,biodiversity
102,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,"Thanks, John. Thanks, actually very important ...",0,environmental,none,biodiversity
101,2022-12-14 23:31:00+00:00,Shareholder/Analyst Call - Australia and New Z...,"Thank you, Margaret, for your point and perhap...",0,environmental,none,biodiversity
615,2022-11-30 15:01:00+00:00,2022 Earnings Call,"So again, background, so the TNFD is a taskfor...",0,environmental,none,biodiversity
