# ClimateBert

by Harsh Vardhan Pachisia



Links used:
1. https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270
2. https://towardsdatascience.com/part-1-data-cleaning-does-bert-need-clean-data-6a50c9c6e9fd
3. https://towardsdatascience.com/does-bert-need-clean-data-part-2-classification-d29adf9f745a
4. https://huggingface.co/climatebert/distilroberta-base-climate-sentiment
5. https://ai.plainenglish.io/bert-pytorch-implementation-prepare-dataset-part-1-efd259113e5a

### 1. Packages and reading in data

In [1]:
#!pip install datasets transformers accelerate -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/536.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m522.2/536.7 kB[0m [31m15.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
#importing required packages
import pandas as pd
import torch
import matplotlib.pyplot as plt
import csv

from google.colab import drive
from transformers import pipeline

In [30]:
# setting differences between cuda and cput depending on what device we are on
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    train_obs = 15000
    test_obs = 1500
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    train_obs = 2000
    test_obs = 200
    print("Using cpu.")

Using cuda.


In [31]:
# Mount GDrive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# get data
small_data = True
dtype_dict = {'label': int}
data_path = "/content/drive/Shareddrives/adv-ml-project/Data/"
comments = pd.read_csv(data_path + "by_threshold/comments_filtered.csv",
                       quoting=csv.QUOTE_NONNUMERIC, dtype=dtype_dict)

### 2. Preprocessing data for ClimateBert

Doing a test-train split

In [None]:
import random
from datasets import Dataset
#double check seed, we should have it be the same across
random.seed(56)

In [33]:
def train_test_split(df, percent_test_obs):
  # Identify the observations to assign as test
  num_test_obs = round(df.shape[0] * percent_test_obs)
  ids_test_obs = random.sample(range(df.shape[0]), num_test_obs)
  df['test_split'] = 0
  df.loc[ids_test_obs,'test_split'] = 1

  # Fill NaN values with empty strings, otherwise from_dict will raise an error
  df['body'] = df['body'].fillna('')

  # Creates Dataset from dictionary
  train_dict = {"text": df.loc[df['test_split'] == 0, 'body'].tolist(),
                "label": df.loc[df['test_split'] == 0, 'label'].tolist()}
  test_dict = {"text": df.loc[df['test_split'] == 1, 'body'].tolist(),
               "label": df.loc[df['test_split'] == 1, 'label'].tolist()}
  train_df = Dataset.from_dict(train_dict)
  test_df = Dataset.from_dict(test_dict)

  return train_df, test_df

# Create train and test data
train_data, test_data = train_test_split(comments, 0.2)

In [34]:
# Creating a small training dataset for faster training times
small_train_dataset = train_data.shuffle(seed=42).select(
    [i for i in list(range(train_obs))])
small_test_dataset = test_data.shuffle(seed=42).select(
    [i for i in list(range(test_obs))])

In [36]:
print(small_train_dataset[10])

{'text': 'And climate change is a good example of why I lean liberal, but as far as I can tell moderates also consider that a huge issue.', 'label': 1}


For pre-processing, we are going to use the distilroberta climate sentiment tokenizer: https://huggingface.co/climatebert/distilroberta-base-climate-sentiment

In [37]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-sentiment")

In [38]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


#if using small development data or larger full model
if small_data:
    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
else:
    tokenized_train = train_data.map(preprocess_function, batched=True)
    tokenized_test = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [39]:
#pad the data
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

##3. Climate Bert model

https://huggingface.co/climatebert/distilroberta-base-climate-sentiment

### Training the model

In [41]:
from transformers import AutoModelForSequenceClassification

# need to add num_labels = 2 (since we only have 2 labels)
# since climate bert sentiment has 3 labels, the ignore_mismatched_sizes is
# required. By finetuning/training our model, this will be resolved.

model = AutoModelForSequenceClassification.from_pretrained(
    "climatebert/distilroberta-base-climate-sentiment",
    num_labels=2, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Compute metrics: Taken from class

In [42]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [45]:
from transformers import TrainingArguments, Trainer

output_name = data_path + "climatebert_output"

training_args = TrainingArguments(
   output_dir=output_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=3,
   weight_decay=0.01,
   save_strategy="epoch",
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [47]:
trainer.train()

Step,Training Loss
500,0.3131
1000,0.2532
1500,0.2764
2000,0.2464
2500,0.1947


TrainOutput(global_step=2814, training_loss=0.24900728129459426, metrics={'train_runtime': 1792.7343, 'train_samples_per_second': 25.101, 'train_steps_per_second': 1.57, 'total_flos': 5068711844142144.0, 'train_loss': 0.24900728129459426, 'epoch': 3.0})

Compute evaluation metrics

In [48]:
trainer.evaluate()

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.5046602487564087,
 'eval_accuracy': 0.84,
 'eval_f1': 0.8265895953757225,
 'eval_runtime': 22.2247,
 'eval_samples_per_second': 67.492,
 'eval_steps_per_second': 4.23,
 'epoch': 3.0}

#### Saving the model

In [52]:
# Use below when on Google Colab.
model_save_directory = "/content/drive/Shareddrives/adv-ml-project/SavedModels/climate_bert_simple_model"
from google.colab import drive
drive.mount("/content/drive")
!mkdir $model_save_directory

# To save a fine-tuned model
trainer.save_model(model_save_directory)

Mounted at /content/drive
mkdir: cannot create directory ‘/content/drive/Shareddrives/adv-ml-project/SavedModels/climate_bert_simple_model’: File exists


In [None]:
# To reload a saved model

#from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast
#from google.colab import drive
#drive.mount('/content/drive')
#tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
#model = DistilBertForQuestionAnswering.from_pretrained(model_save_directory)

## Create Climate Bert Sentiment Models over different subreddits

In [None]:
### To-do
# get the subsets of the data, pre-process (tokenize), run models on each