# NLP Project

## Load Data

### Scraped CDP Scores
See [./download_cdp_scores.py](./download_cdp_scores.py) for the code that scrapes the CDP scores.

In [6]:
import pandas as pd
import numpy as np
import platform
from transformers import pipeline
import tensorflow as tf
import torch
from flax import nnx
# Let each DataFrame column show up to 100 characters before pandas truncates it.
pd.set_option("display.max_colwidth", 100)

In [2]:
# Platform string of the machine running this notebook.
p = platform.platform()

# ESG documents: a pipe-delimited CSV. The "Unnamed: 0" column is dropped
# right after loading.
esg_documents_df = pd.read_csv(
    './dax_esg_media_dataset/esg_documents_for_dax_companies.csv',
    sep='|',
).drop(columns=["Unnamed: 0"])

# Scraped CDP scores.
cdp_scores_df = pd.read_csv('./cdp_scores.csv')



In [47]:
# NOTE(review): `merged_df` is created by the merge cell further down in this
# notebook; on a fresh kernel that cell has to run before this preview works.
merged_df.head(1)

Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,url,Ticker,CDP Score
0,Beiersdorf AG,Sustainability Highlight Report CARE BEYOND SKIN 2021 03 Foreword OUR SUSTAINABILITY COMMITMENT ...,sustainability_report,2021-03-31,,"['CleanWater', 'GHGEmission', 'ProductLiability', 'ValueChain', 'GenderDiversity', 'Upcycling', ...",1,BEI,BeiersdorfAG Sustainability Report 2021,,BEI,A


In [None]:
# Merge the CDP scores into the ESG data so that we have a score for each document.
# The join matches the documents' `symbol` against the scores' `Ticker`. Because it
# is a left join, documents without a matching score get NaN in 'CDP Score' and are
# dropped straight away. dropna is chained (not inplace) so re-running the cell
# always rebuilds merged_df from the two source frames.
merged_df = pd.merge(
    esg_documents_df,
    cdp_scores_df,
    how='left',
    left_on='symbol',
    right_on='Ticker',
).dropna(subset=['CDP Score'])

# print(merged_df[['symbol', 'CDP Score']])
# print(len(merged_df))

# This is how to get the text of a document alone (here: the first row).
s = merged_df.head(1)['content'].item()
# Display the text itself. `str.count` requires a substring argument, so a bare
# `s.count()` raises TypeError.
s

'Sustainability Highlight Report CARE BEYOND SKIN 2021 03 Foreword OUR SUSTAINABILITY COMMITMENT 06 Our Values, our Brands, our Strategy 07 Our Sustainability Agenda CARE BEYOND SKIN 08 Our Partnerships 09 Our Promise toward Consumers MINIMIZING OUR ENVIRONMENTAL FOOTPRINT 11 Our Targets Climate Care: Our Holistic Approach to Climate Protection 12 14 16 17 18 People and Nature in Balance – Innovative NIVEA Products Eucerin: Dermocosmetics Meets Sustainability La Prairie Combines Sustainability and Luxury Climate-neutral Production in Leipzig and Berlin KEY FOR NAVIGATION Jump to the table MAXIMIZING OUR SOCIAL IMPACT 20 Our Targets Our Engagement for Sustainable Palm Oil Cultivation in Indonesia 21 23 24 25 27 The Power of Human Touch – NIVEA’s New Social Mission Eucerin’s Social Mission for Greater Social Participation We Stand Strong for Women and Girls Worldwide Diversity and Inclusion as Key to Success OUTLOOK 30 Continuing our Sustainability Journey in 2022 ANNEX 33 Key Figures at

In [50]:
# Companies in this list that I recognise as big names:
# Adidas, Volkswagen, Porsche, Siemens (AG and Energy), Merck, Airbus, Puma, Allianz, Brenntag
pd.unique(merged_df['company'])

array(['Beiersdorf AG', 'Deutsche Telekom AG', 'Vonovia SE', 'Merck KGaA',
       'MTU', 'E ONSE', 'RWE AG', 'Heidelberg Cement AG', 'Siemens AG',
       'Qiagen', 'Continental AG', 'Bayer AG', 'Volkswagen AG',
       'Fresenius', 'Symrise AG', 'Sartorius AG', 'Porsche', 'SAP',
       'Adidas AG', 'Deutsche Bank AG', 'Puma SE', 'Airbus SE',
       'Covestro AG', 'Allianz SE', 'Infineon Technologies AG', 'BMW',
       'Hannover R AG', 'Siemens Energy', 'Zalando SE',
       'Muenchener Rueckversicherungs Gesellschaft AGin Muenchen',
       'BASF SE', 'Deutsche Boerse AG', 'Brenntag', 'AkzoNobelNV',
       'Vonovia'], dtype=object)

## Preprocessing

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: lowercases the text, removes English stop words and
# tokenizes on runs of one or more word characters (letters, digits, underscore).
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern=r'\b\w+\b' # Only keep word tokens
)

# fit_transform learns the vocabulary AND returns the document-term matrix.
# Plain `fit` only returns the fitted vectorizer, which is not a feature matrix.
# Missing documents are replaced by '' so the vectorizer receives strings only.
bow_matrix = vectorizer.fit_transform(esg_documents_df['content'].fillna(''))
# can use the bow_matrix in the next step for building a model

## Model Fitting

In [None]:
# TODO here we should try and build a model that correlates the preprocessed
# data to the column of our choice in the sp500_risk_ratings dataset

In [5]:
# Load the FinBERT tokenizer and sequence-classification model directly from
# the Hugging Face checkpoint.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hand-written sentences for a quick sanity check of the classifier.
positive_example = "we have reduced our carbon emissions by 12.7% (vs. 2018)"
neutral_example = "We are proud of this result"
negative_example = "We have increased our carbon emissions by 12.7% (vs. 2018), which is a very bad thing"

# Only the negative sentence is tokenized and scored in this cell.
inputs = tokenizer(negative_example, return_tensors="pt")

# Gradient tracking is switched off for the forward pass.
with torch.no_grad():
    model_output = model(**inputs)
logits = model_output.logits

    

In [23]:
# Index of the largest logit, mapped to its label through the model config.
predicted_class_id = torch.argmax(logits).item()
model.config.id2label[predicted_class_id]

'negative'

## Model Analysis

In [None]:
# TODO check how well our model did here

# Data Sources
- https://www.kaggle.com/datasets/pritish509/s-and-p-500-esg-risk-ratings
- https://www.kaggle.com/datasets/equintel/dax-esg-media-dataset

# References

# Scratch Work

In [4]:
import pandas as pd
from pathlib import Path

# Prefer a copy of the dataset addressed relative to the notebook, like the
# other data files in this notebook that are loaded with './' paths. Fall back
# to the original home-directory location when the relative file is missing.
# NOTE(review): the relative location assumes the notebook is run from the
# project root that contains `sp500_esg_risk_ratings/` — confirm.
sp500_ratings_path = Path("./sp500_esg_risk_ratings/sp500_esg_risk_ratings.csv")
if not sp500_ratings_path.exists():
    sp500_ratings_path = Path(
        "~/class/f24/nlp/project/sustain-nlp/sp500_esg_risk_ratings/sp500_esg_risk_ratings.csv"
    ).expanduser()

pd.read_csv(sp500_ratings_path)

Unnamed: 0,Symbol,Name,Address,Sector,Industry,Full Time Employees,Description,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,ENPH,"Enphase Energy, Inc.","47281 Bayside Parkway\nFremont, CA 94538\nUnit...",Technology,Solar,3157,"Enphase Energy, Inc., together with its subsid...",,,,,,,,
1,EMN,Eastman Chemical Company,"200 South Wilcox Drive\nKingsport, TN 37662\nU...",Basic Materials,Specialty Chemicals,14000,Eastman Chemical Company operates as a special...,25.3,12.8,6.6,5.8,Moderate Controversy Level,2.0,50th percentile,Medium
2,DPZ,Domino's Pizza Inc.,"30 Frank Lloyd Wright Drive\nAnn Arbor, MI 481...",Consumer Cyclical,Restaurants,6500,"Domino's Pizza, Inc., through its subsidiaries...",29.2,10.6,6.3,12.2,Moderate Controversy Level,2.0,66th percentile,Medium
3,DAY,"Dayforce, Inc.","3311 East Old Shakopee Road\nMinneapolis, MN 5...",Technology,Software - Application,9084,"Dayforce Inc., together with its subsidiaries,...",,,,,,,,
4,DVA,Davita Inc.,"2000 16th Street\nDenver, CO 80202\nUnited States",Healthcare,Medical Care Facilities,70000,DaVita Inc. provides kidney dialysis services ...,22.6,0.1,8.4,14.1,Moderate Controversy Level,2.0,38th percentile,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,ACN,Accenture Plc,1 Grand Canal Square\nGrand Canal Harbour\nDub...,Technology,Information Technology Services,742000,"Accenture plc, a professional services company...",9.8,0.8,4.4,4.6,Moderate Controversy Level,2.0,3rd percentile,Negligible
499,ABBV,Abbvie Inc.,"1 North Waukegan Road\nNorth Chicago, IL 60064...",Healthcare,Drug Manufacturers - General,50000,"AbbVie Inc. discovers, develops, manufactures,...",29.9,2.4,10.4,17.2,Significant Controversy Level,3.0,69th percentile,Medium
500,ABT,Abbott Laboratories,100 Abbott Park Road\nAbbott Park\nNorth Chica...,Healthcare,Medical Devices,114000,"Abbott Laboratories, together with its subsidi...",24.8,2.3,8.3,14.2,Significant Controversy Level,3.0,48th percentile,Medium
501,AOS,A.O. Smith Corporation,11270 West Park Place\nSuite 170 PO Box 245008...,Industrials,Specialty Industrial Machinery,12000,A. O. Smith Corporation manufactures and marke...,25.4,7.2,6.4,11.9,Low Controversy Level,1.0,51st percentile,Medium


In [None]:
# if you want to find text data from S&P companies - this is the code to load it
# source: https://www.kaggle.com/datasets/jaidityachopra/esg-sustainability-reports-of-s-and-p-500-companies 
# pd.read_csv("~/class/f24/nlp/project/extra_data/preprocessed_content.csv").head(1)['preprocessed_content']

0    style guide colour colour use imagecolour prof...
Name: preprocessed_content, dtype: object