In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer_climate_related = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-detector")
model_climate_related = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-detector")

tokenizer_ron = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-sentiment")
model_ron = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-sentiment")

tokenizer_tcfd = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-tcfd")
model_tcfd = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-tcfd")


In [2]:
from transformers import AutoTokenizer, RobertaModel

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
inputs = tokenizer_climate_related("Paris Agreement", return_tensors="pt")
inputs2 = tokenizer_ron("Paris Agreement", return_tensors="pt")
inputs3 = tokenizer_tcfd("Paris Agreement", return_tensors="pt")


In [20]:
type(model_tcfd)


transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification

In [16]:
# a = model_climate_related.forward(**inputs)
# b = model_climate_related(**inputs)
# a.logits
# tokenizer.decode([a.logits.argmax()])

output1 = model_climate_related(**inputs)
output2 = model_ron(**inputs2)
output3 = model_tcfd(**inputs3)

output1.logits
output2.logits
output3.logits


tensor([[-0.3996, -0.0702,  0.5137, -0.0117]], grad_fn=<AddmmBackward0>)

In [24]:
import torch.nn.functional as F
probabilities1 = F.softmax(output1.logits, dim=1)
probabilities2 = F.softmax(output2.logits, dim=1)
probabilities3 = F.softmax(output3.logits, dim=1)

# Get the predicted class (index with the maximum probability)
predicted_class1 = torch.argmax(probabilities1, dim=1).item()
predicted_class2 = torch.argmax(probabilities2, dim=1).item()
predicted_class3 = torch.argmax(probabilities3, dim=1).item()

# class labels: "no": 0, "yes": 1
print("Climate Related:")
print("Probabilities:", probabilities1)
print(probabilities1[0][0])
print("Predicted Class:", predicted_class1)

# class labels: "neutral": 1, "opportunity": 0, "risk": 2
print("RON:")
print("Probabilities:", probabilities2)
print("Predicted Class:", predicted_class2)

# class labels: "governance": 0, "risk": 1, "strategy": 2, "metrics": 3
print("TCFD:")
print("Probabilities:", probabilities3)
print("Predicted Class:", predicted_class3)


Climate Related:
Probabilities: tensor([[0.0122, 0.9878]], grad_fn=<SoftmaxBackward0>)
tensor(0.0122, grad_fn=<SelectBackward0>)
Predicted Class: 1
RON:
Probabilities: tensor([[0.3221, 0.4492, 0.2287]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 1
TCFD:
Probabilities: tensor([[0.1573, 0.2187, 0.3921, 0.2319]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 2


In [19]:
# Print the model's configuration, which includes the number of classes
print("Model Configuration:", model_tcfd.config)

num_classes = model_tcfd.config.num_labels
print("Number of Classes:", num_classes)


Model Configuration: RobertaConfig {
  "_name_or_path": "climatebert/distilroberta-base-climate-tcfd",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "governance",
    "1": "risk",
    "2": "strategy",
    "3": "metrics"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "governance": 0,
    "metrics": 3,
    "risk": 1,
    "strategy": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 1,
  "use_cache": t

In [22]:
model_tcfd.config.id2label[0]


'governance'

In [52]:
tokenizer_climate_related.batch_decode(a.logits.argmax(dim=-1))


['<pad>']

In [60]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)


In [63]:
last_hidden_states = outputs.last_hidden_state
last_hidden_states


tensor([[[-0.0478,  0.0886, -0.0098,  ..., -0.0544, -0.0672, -0.0039],
         [-0.0712,  0.0150, -0.1299,  ...,  0.0638,  0.0296, -0.0860],
         [ 0.0906,  0.1437,  0.0828,  ...,  0.0509, -0.0320, -0.0490],
         ...,
         [ 0.0853,  0.2155,  0.0849,  ..., -0.1150,  0.0330, -0.0790],
         [ 0.1679,  0.1288,  0.0065,  ...,  0.0367, -0.0631,  0.0276],
         [-0.0436,  0.0892, -0.0389,  ..., -0.0957, -0.0744, -0.0284]]],
       grad_fn=<NativeLayerNormBackward0>)

In [65]:
dir(tokenizer_climate_related)


['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_mask_token',
 '