# NLP Project

## Load Data

In [4]:
import pandas as pd

esg_documents_df = pd.read_csv('./dax_esg_media_dataset/esg_documents_for_dax_companies.csv', sep='|')
# main columns: symbol, content (full document text), esg_topics

sdg_descriptions_with_targets_df = pd.read_csv('./dax_esg_media_dataset/sdg_descriptions_with_targetsText.csv')
# haven't really looked into this one yet, but it could be useful

sp500_risk_ratings_df = pd.read_csv('./sp500_esg_risk_ratings/sp500_esg_risk_ratings.csv')
# main columns: symbol, Total ESG Risk score, Environment Risk Score, Governance Risk Score

In [41]:
# keep only the unique symbols (NOTE: this will only keep the first occurrence for each company, not sure how this will impact our later processing)
esg_documents_df = esg_documents_df.drop_duplicates(subset='symbol')

merged_df = pd.merge(esg_documents_df, sp500_risk_ratings_df, left_on='symbol', right_on='Symbol', how='inner')

# print the columns, which shows that we now have the ESG risk scores as columns
# print(merged_df.columns)

# print the first row
# print(merged_df.head(1))

# print the AAPL row
# print(merged_df.loc[merged_df['symbol'] == 'AAPL']['Environment Risk Score'])
print(merged_df[['symbol', 'Environment Risk Score']])
# very small overlap :(


  symbol  Environment Risk Score
0    DTE                    15.0
1    MRK                     1.9
2    LIN                     NaN


In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m223.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Preprocessing

In [15]:
# TODO tokenize/vectorize/convert some column of esg_documents_df into a bag of
# words representation so that it's more usable for fitting later
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy

nlp = spacy.load("en_core_web_sm")

# def tokenize_cell(text):
#     if type(text) != str:
#         return ''
#     return text.lower().split()

def process_line(input_line):
    lowercased = input_line.lower()
    doc = nlp(lowercased)
    output_line = []
    for token in doc:
      if token.is_digit:
        output_line.append('<num>')
      elif not token.is_punct:
        output_line.append(token.text)
    return ' '.join(output_line)

def tokenize_with_spacy(text):
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_punct and not token.is_digit]

nlp.max_length = 1500000 
# duplicate the content column so we don't modify the original
esg_documents_df['content_bow'] = esg_documents_df['content']

esg_documents_df['content_bow'] = esg_documents_df['content_bow'].apply(tokenize_with_spacy)

KeyboardInterrupt: 

In [8]:
print(esg_documents_df['content_bow'][0])

['sustainability', 'highlight', 'report', 'care', 'beyond', 'skin', '2021', '03', 'foreword', 'our', 'sustainability', 'commitment', '06', 'our', 'values,', 'our', 'brands,', 'our', 'strategy', '07', 'our', 'sustainability', 'agenda', 'care', 'beyond', 'skin', '08', 'our', 'partnerships', '09', 'our', 'promise', 'toward', 'consumers', 'minimizing', 'our', 'environmental', 'footprint', '11', 'our', 'targets', 'climate', 'care:', 'our', 'holistic', 'approach', 'to', 'climate', 'protection', '12', '14', '16', '17', '18', 'people', 'and', 'nature', 'in', 'balance', '–', 'innovative', 'nivea', 'products', 'eucerin:', 'dermocosmetics', 'meets', 'sustainability', 'la', 'prairie', 'combines', 'sustainability', 'and', 'luxury', 'climate-neutral', 'production', 'in', 'leipzig', 'and', 'berlin', 'key', 'for', 'navigation', 'jump', 'to', 'the', 'table', 'maximizing', 'our', 'social', 'impact', '20', 'our', 'targets', 'our', 'engagement', 'for', 'sustainable', 'palm', 'oil', 'cultivation', 'in', 'i

## Model Fitting

In [None]:
# TODO here we should try and build a model that correlates the preprocessed
# data to the column of our choice in the sp500_risk_ratings dataset

## Model Analysis

In [None]:
# TODO check how well our model did here

# Data Sources
- https://www.kaggle.com/datasets/pritish509/s-and-p-500-esg-risk-ratings
- https://www.kaggle.com/datasets/equintel/dax-esg-media-dataset

# References