### Sentiment scoring of company climate-pledge texts

### Imports

In [83]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from pathlib import Path
import pandas as pd
import numpy as np
import os
import csv

### Pre-trained model import

HuggingFace has many pre-trained sentiment analysis models.
We choose between a couple of them to obtain scores that measure how much emphasis and positivity companies put into the messaging about their climate pledges.

In [38]:
# Alternative checkpoint that was also tried (great positive vs negative):
#   "siebert/sentiment-roberta-large-english"
# Only one checkpoint is loaded; swap the string below to switch models.
modelName = "finiteautomata/bertweet-base-sentiment-analysis"  # has neutral sentiment
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForSequenceClassification.from_pretrained(modelName)

Downloading (…)okenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Downloading (…)lve/main/config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

#### Fine-tuning on data

As a continuation of this work, one could fine-tune this model on large datasets of tweets about climate change.

#### Importing data from website

We have a web scraper that collects the text of the web pages on which companies describe how well they are doing.
Here we define the class `SimpleDataset`, which wraps the tokenized text so that it can later be passed to the model as a dataset.

In [89]:
# Create class for data preparation
class SimpleDataset:
    """Index-addressable wrapper around a batch of tokenizer output.

    ``tokenized_texts`` is a mapping from field name to a per-example
    sequence. It must contain an ``"input_ids"`` entry, whose length is
    reported as the size of the dataset.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of examples, measured on the ``"input_ids"`` field."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th entry of every field."""
        example = {}
        for field, column in self.tokenized_texts.items():
            example[field] = column[idx]
        return example


#### Selecting one company to analyze

Each company file has the text data of the website where they discuss their climate change pledges.
Here we split the text into chunks of up to 250 words each, so that each chunk stays roughly under 512 tokens.
This data is then imported into a dataset class.

In [90]:
def create_dataset_company(filepath, words_per_chunk=250):
    """Read one company's text file and turn it into a tokenized dataset.

    The file content is split on whitespace into words, the words are grouped
    into chunks of at most ``words_per_chunk`` words, and the chunks are
    tokenized with the module-level ``tokenizer``.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path of the text file to read.
    words_per_chunk : int, optional
        Maximum number of words per chunk (default 250).

    Returns
    -------
    tuple
        ``(dataset, name)`` where ``dataset`` is a ``SimpleDataset`` wrapping
        the tokenizer output and ``name`` is the file name without its
        directory and extension.

    Raises
    ------
    ValueError
        If ``words_per_chunk`` is smaller than 1.
    """
    if words_per_chunk < 1:
        raise ValueError("words_per_chunk must be a positive integer")

    with open(filepath, 'r') as file:
        # str.split() without arguments splits on any whitespace, newlines
        # included. Deleting the newlines first (as was done previously) glued
        # the last word of each line to the first word of the next one.
        words = file.read().split()

    preds_text = [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]
    # NOTE(review): truncation=True cuts every chunk at the tokenizer's maximum
    # length; confirm that words_per_chunk words fit for the chosen checkpoint,
    # otherwise the tail of each chunk is silently dropped.
    tokenID = tokenizer(preds_text, truncation=True, padding=True)
    tokenDataset = SimpleDataset(tokenID)

    return tokenDataset, Path(filepath).stem

The company is then scored by averaging the predicted class probabilities over all chunks of its website text.

In [91]:
def positivity(model, tokenDataset):
    """Average the model's class probabilities over every example of a dataset.

    Parameters
    ----------
    model
        Sequence-classification model handed to ``Trainer``.
    tokenDataset
        Dataset passed to ``Trainer.predict``.

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 of the softmax of the predicted logits, i.e. one
        averaged probability per class (presumably in the order of
        ``model.config.id2label`` -- verify before labelling columns).
    """
    trainer = Trainer(model=model)
    predictions = trainer.predict(tokenDataset)

    # First field of the prediction output holds the raw logits.
    logits = np.asarray(predictions[0])
    # Softmax over the last axis. Subtracting the row-wise maximum leaves the
    # result mathematically unchanged but keeps np.exp from overflowing, and
    # the exponentials are evaluated only once.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    probabilities = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

    average_score = probabilities.mean(axis=0)
    return average_score

#### Running through all the different company files



In [93]:
# Input texts and the output CSV both live in folders next to the current
# working directory.
project_root = Path(os.getcwd()).parent
pathdir = str(project_root / 'company_data') + os.sep
# os.listdir returns entries in arbitrary order and also lists sub-directories.
# Keep regular files only and sort them so the CSV rows come out in a
# reproducible order and open() is never called on a directory.
textfiles = sorted(
    pathdir + entry
    for entry in os.listdir(pathdir)
    if os.path.isfile(pathdir + entry)
)
savedir = str(project_root / 'climate-change-accountability') + os.sep

with open(savedir + "Bert-positivity-score.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for file in textfiles:
        dataset, companyName = create_dataset_company(file)
        posScore = positivity(model, dataset)
        # NOTE(review): posScore is a NumPy array, so the second CSV cell holds
        # its string representation. Writing one column per class would be
        # easier to parse, but the format is kept in case something downstream
        # reads the current layout.
        writer.writerow([companyName, posScore])

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]