# Data Retrieval

In [6]:
import pandas as pd
from neo4j_access import *
import numpy as np



In [8]:
# Cypher query: every Statement given at an earnings conference call (ECC),
# returned with the call's time and title, capped at 100,000 rows.
# NOTE(review): there is no ORDER BY, so which rows the LIMIT keeps is
# presumably up to the database rather than a random sample — confirm.
query = (
    "MATCH (s:Statement)-[:WAS_GIVEN_AT]->(e:ECC) "
    "RETURN e.time as datetime, e.title as conference_name, s.text as statement "
    "LIMIT 100000"
)

In [26]:
# Set up the project's Neo4j helper, then run the Cypher query.
# The cells that follow treat the result as a pandas DataFrame with the
# columns named in the query's RETURN clause (presumably commit_query
# builds that frame — confirm in neo4j_access).
QueryNeo4J.initialize()
df = QueryNeo4J.commit_query(query=query)

Planner COST

Runtime SLOTTED

Runtime version 5.12

+------------------+----+-----------------------------------------------------------------------------------+----------------+
| Operator         | Id | Details                                                                           | Estimated Rows |
+------------------+----+-----------------------------------------------------------------------------------+----------------+
| +ProduceResults  |  0 | datetime, conference_name, statement                                              |         100000 |
| |                +----+-----------------------------------------------------------------------------------+----------------+
| +Projection      |  1 | cache[e.time] AS datetime, cache[e.title] AS conference_name, s.text AS statement |         100000 |
| |                +----+-----------------------------------------------------------------------------------+----------------+
| +Limit           |  2 | 100000                          

KeyboardInterrupt: 

In [10]:
# Neo4j DateTime values are not understood by pandas directly, so go through
# their string form and let pandas parse that.
# utc=True converts every parsed value to UTC, so the column is always
# datetime64[ns, UTC] — the dtype the tz-aware comparison further down needs —
# even when individual timestamps carry different UTC offsets. Timestamps
# without any offset are interpreted as UTC.
df["datetime"] = pd.to_datetime(df["datetime"].astype(str), utc=True)

In [11]:
# Print the frame's schema and non-null counts as a sanity check.
df.info()

# Persist the retrieved subset so later cells can reload it from disk.
df.to_pickle("data/biodiversity_dataset_random_subset.pkl")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype              
---  ------           --------------   -----              
 0   datetime         100000 non-null  datetime64[ns, UTC]
 1   conference_name  100000 non-null  object             
 2   statement        100000 non-null  object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 2.3+ MB


In [12]:
# Binary flag: 1 when the statement's timestamp is strictly earlier than
# 2022-01-01 00:00 UTC, 0 otherwise.
df["before_tnfd_2022"] = np.where(
    pd.to_datetime(df["datetime"]) < pd.Timestamp("2022-01-01", tz="UTC"),
    1,
    0,
)

In [13]:
# Class balance of the flag: 1 = before 2022-01-01 UTC, 0 = on or after.
df["before_tnfd_2022"].value_counts()

before_tnfd_2022
1    92926
0     7074
Name: count, dtype: int64

# Models

In [27]:
# Reload the saved subset so the modelling section can start from disk.
# NOTE(review): the pickle is written before the `before_tnfd_2022` column is
# added, so the reloaded frame does not contain that flag.
df = pd.read_pickle('data/biodiversity_dataset_random_subset.pkl')

In [29]:
# Show the first row to check the reloaded frame's columns.
df.head(1)

Unnamed: 0,datetime,conference_name,statement
0,2022-07-21 03:01:00+00:00,005490.KS - Earnings call Q2 2022,"Next, we love to hear from Han Sung of Yuanta ..."


In [17]:
def classify_df(df, text_column, pipe, pipe_name):
    """
    Label every text in a DataFrame column with a Hugging Face pipeline.

    The pipeline is invoked once per row (no batching); the label of the
    first prediction it returns is stored for that row.

    Parameters:
    - df: Pandas DataFrame holding the texts. It is modified in place: the
      column `pipe_name` is added to (or overwritten on) this frame.
    - text_column: Name of the column that contains the texts to classify.
    - pipe: Callable pipeline; called as pipe(text, truncation=True, padding=True)
      and expected to return a sequence whose first item has a "label" key.
    - pipe_name: Name of the column that receives the predicted labels.

    Returns:
    - A copy of `df` (all rows, nothing filtered out) including the new
      `pipe_name` column.
    """

    def _top_label(text):
        # Only the first prediction's label is kept; the rest of the output is dropped.
        predictions = pipe(text, truncation=True, padding=True)
        return predictions[0]["label"]

    print(f"\nClassifying {pipe_name} labels...")

    # Row-by-row classification; the labels are written onto the caller's frame.
    df[pipe_name] = df[text_column].apply(_top_label)

    # Return an independent copy so later edits to the result do not touch `df`.
    return df.copy()

In [None]:
# This is the first cell that uses transformers, so it imports what it needs;
# without the import the cell fails with NameError when the notebook is run
# top to bottom on a fresh kernel.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

name = "ESGBERT/EnvironmentalBERT-environmental"  # model id to download from Hugging Face

tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)
# The pipeline combines tokenizer and model into one callable text classifier.
pipe_env = pipeline("text-classification", model=model, tokenizer=tokenizer)


Device set to use mps:0
Device set to use mps:0


In [23]:
# Label every statement with the environmental classifier; the predicted
# label is stored in the new 'environmental' column.
df = classify_df(df, text_column="statement", pipe=pipe_env, pipe_name='environmental')


Classifying environmental labels...


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Model id shared by the model weights and the tokenizer.
biodiversity_model_id = "ESGBERT/EnvironmentalBERT-biodiversity"

model = AutoModelForSequenceClassification.from_pretrained(biodiversity_model_id)
tokenizer = AutoTokenizer.from_pretrained(biodiversity_model_id, max_len=512)

# Tokenizer and model wrapped as a single text-classification callable
# (set device=0 to use GPU).
pipe_biodiversity = pipeline("text-classification", model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


In [None]:
# Label every statement with the biodiversity classifier.
# classify_df writes the 'biodiversity' column onto `df` itself and returns a
# copy, so both `df` and `df_` carry the new column after this cell.
df_ = classify_df(df, text_column='statement', pipe=pipe_biodiversity, pipe_name='biodiversity')


Classifying biodiversity labels...


In [25]:
# Summary statistics of the classified frame.
df.describe()

Unnamed: 0,datetime,conference_name,statement,biodiversity
count,5000,5000,5000,5000
unique,,30,4637,1
top,,Earnings call Q1 2012,Yes.,none
freq,,402,49,5000
mean,2014-04-14 20:12:53.760000+00:00,,,
min,2007-07-26 03:01:00+00:00,,,
25%,2013-01-24 08:01:00+00:00,,,
50%,2014-01-23 14:44:00+00:00,,,
75%,2015-07-30 11:37:00+00:00,,,
max,2022-07-27 21:01:00+00:00,,,


In [24]:
# Distribution of the predicted biodiversity labels.
df_['biodiversity'].value_counts()

biodiversity
none    5000
Name: count, dtype: int64

In [22]:
# Most recent statements first, to inspect the latest calls and their labels.
df.sort_values(by='datetime', ascending=False).head(20)

Unnamed: 0,datetime,conference_name,statement,biodiversity
36,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,I would like to thank everyone who can share t...,none
30,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,Good morning. I'm Young Moo-Kim from Sales and...,none
22,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,"Yes, to answer your question, I think I can st...",none
23,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,To answer your question about the Exynos busin...,none
24,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,You've asked about server demand. I think one ...,none
26,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,"Finally, we will answer questions that were su...",none
27,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,[Operator Instructions]. The first question wi...,none
28,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,"To answer your question about DRAM supply, yes...",none
29,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,Thank you. That sums up the second quarter res...,none
31,2022-07-27 21:01:00+00:00,005930.KS - Earnings call Q2 2022,"My first question is about the Memory, the ser...",none


In [4]:
# NOTE(review): this cell needs `df` from the cells above; on a fresh kernel it
# raises NameError. head(10000) also requests far more rows than a preview
# needs — consider a smaller number.
df.head(10000)

NameError: name 'df' is not defined