In [1]:
! pip install datasets transformers evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collect

In [2]:
import pandas as pd
import torch
import numpy as np 
from datasets import Dataset, DatasetDict
import ast 

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
path_to_folder = './'


In [7]:
corpus_df = pd.read_csv(path_to_folder + 'cleaned_data_fasttext_full.csv')
corpus_df.drop(['pretrained_fasttext','custom_trained_fasttext','pretrained_fasttext_title','pretrained_fasttext_title&summary'],axis= 1, inplace= True)

In [8]:
corpus_df['label'] = corpus_df['main_theme'].replace(np.sort(corpus_df.main_theme.unique()),list(range(len(corpus_df.main_theme.unique()))))

In [9]:
corpus_df['cleaned_summary_text'] = corpus_df['clean_summary'].apply(lambda text: ' '.join(ast.literal_eval(text)))

In [10]:
train, test, eval = np.split(corpus_df.sample(frac=1, random_state=42), [int(.7*len(corpus_df)), int(.85*len(corpus_df))])
eval.reset_index(drop=True,inplace=True)
train.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

In [11]:
datasettrain_summary = Dataset.from_pandas(train[['summary','label']])
dataseteval_summary = Dataset.from_pandas(eval[['summary','label']])
datasettrain_cleanedsummary = Dataset.from_pandas(train[['cleaned_summary_text','label']])
dataseteval_cleanedsummary = Dataset.from_pandas(eval[['cleaned_summary_text','label']])
datasettrain_titlesummary = Dataset.from_pandas(train[['title&summary','label']])
dataseteval_titlesummary = Dataset.from_pandas(eval[['title&summary','label']])

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:

def preprocess_function_csummary(examples, column):
    return tokenizer(examples[column], truncation=True)

datasettrain_summary = datasettrain_summary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "summary"})
dataseteval_summary = dataseteval_summary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "summary"})

datasettrain_cleanedsummary = datasettrain_cleanedsummary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "cleaned_summary_text"})
dataseteval_cleanedsummary = dataseteval_cleanedsummary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "cleaned_summary_text"})

datasettrain_titlesummary = datasettrain_titlesummary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "title&summary"})
dataseteval_titlesummary = dataseteval_titlesummary.map(preprocess_function_csummary, batched=True, fn_kwargs={"column": "title&summary"})


In [14]:
#full_dataset = DatasetDict({'train':datasettrain, 'test': datasettest, 'unsupervised': datasetunsupervised})
full_dataset = DatasetDict({'train_cleaned':datasettrain_cleanedsummary, 'eval_cleaned': dataseteval_cleanedsummary,
                            'train_summary':datasettrain_summary,'eval_summary':dataseteval_summary,
                            'train_titlesymmary':datasettrain_titlesummary,'eval_titlesummary':dataseteval_titlesummary})

In [15]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:
label2id = {21: 0, 23:1, 25:2, 27:3, 29:4, 31:5 }
id2label = {0:21, 1:23, 2:25, 3:27, 4:29, 5:31}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

model_cleaned_summary = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 21,
    "1": 23,
    "2": 25,
    "3": 27,
    "4": 29,
    "5": 31
  },
  "initializer_range": 0.02,
  "label2id": {
    "21": 0,
    "23": 1,
    "25": 2,
    "27": 3,
    "29": 4,
    "31": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggin

In [None]:
training_args = TrainingArguments(
    output_dir= path_to_folder + "bert_clean_summary_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model_cleaned_summary,
    args=training_args,
    train_dataset=full_dataset["train_cleaned"],
    eval_dataset=full_dataset["eval_cleaned"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: cleaned_summary_text. If cleaned_summary_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22436
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14030
  Number of trainable parameters = 66958086


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8702,0.805658,0.68074
2,0.7418,0.783248,0.697795
3,0.6316,0.790662,0.706947
4,0.51,0.854244,0.702995
5,0.4129,0.929207,0.697379
6,0.3096,1.066587,0.700499
7,0.2385,1.291635,0.686564
8,0.1799,1.469021,0.68594
9,0.1382,1.591297,0.685524
10,0.1179,1.649166,0.685108


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: cleaned_summary_text. If cleaned_summary_text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4808
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/MasterIA/Semi-CuatriII/NLP /Practica Final/bert_clean_summary_model/checkpoint-1403
Configuration saved in /content/drive/MyDrive/MasterIA/Semi-CuatriII/NLP /Practica Final/bert_clean_summary_model/checkpoint-1403/config.json
Model weights saved in /content/drive/MyDrive/MasterIA/Semi-CuatriII/NLP /Practica Final/bert_clean_summary_model/checkpoint-1403/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/MasterIA/Semi-CuatriII/NLP /Practica Final/bert_clean_summary_model/checkpoint-1403/tokenizer_config.json
Special tokens file saved in /content/drive

TrainOutput(global_step=14030, training_loss=0.42052973572560404, metrics={'train_runtime': 6920.0394, 'train_samples_per_second': 32.422, 'train_steps_per_second': 2.027, 'total_flos': 1.758215133773424e+16, 'train_loss': 0.42052973572560404, 'epoch': 10.0})

In [19]:
model_summary = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id
)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [21]:
training_args = TrainingArguments(
    output_dir= path_to_folder + "bert_notcleaned_summary_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model_summary,
    args=training_args,
    train_dataset=full_dataset["train_summary"],
    eval_dataset=full_dataset["eval_summary"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: summary. If summary are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22436
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11224
  Number of trainable parameters = 66958086
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8674,0.798891,0.687396
2,0.7299,0.766982,0.704243
3,0.6105,0.796256,0.709027
4,0.4785,0.850698,0.703827
5,0.3832,0.965101,0.699459


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: summary. If summary are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4808
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Universidad/bert_notcleaned_summary_model/checkpoint-1403
Configuration saved in /content/drive/MyDrive/Universidad/bert_notcleaned_summary_model/checkpoint-1403/config.json
Model weights saved in /content/drive/MyDrive/Universidad/bert_notcleaned_summary_model/checkpoint-1403/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Universidad/bert_notcleaned_summary_model/checkpoint-1403/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Universidad/bert_notcleaned_summary_model/checkpoint-1403/special_tokens_map.json
The following columns in the evaluation

TrainOutput(global_step=7015, training_loss=0.6243113219355653, metrics={'train_runtime': 4761.6686, 'train_samples_per_second': 37.694, 'train_steps_per_second': 2.357, 'total_flos': 1.2454544888082288e+16, 'train_loss': 0.6243113219355653, 'epoch': 5.0})

In [22]:
model_titlesummary = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 21,
    "1": 23,
    "2": 25,
    "3": 27,
    "4": 29,
    "5": 31
  },
  "initializer_range": 0.02,
  "label2id": {
    "21": 0,
    "23": 1,
    "25": 2,
    "27": 3,
    "29": 4,
    "31": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggin

In [23]:
training_args = TrainingArguments(
    output_dir= path_to_folder + "bert_titlesummary_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model_summary,
    args=training_args,
    train_dataset=full_dataset["train_titlesymmary"],
    eval_dataset=full_dataset["eval_titlesummary"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: title&summary. If title&summary are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22436
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11224
  Number of trainable parameters = 66958086


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6336,0.764679,0.710691
2,0.5289,0.836588,0.703619
3,0.4034,0.938722,0.711106
4,0.2833,1.13936,0.697795


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: title&summary. If title&summary are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4808
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Universidad/bert_titlesummary_model/checkpoint-1403
Configuration saved in /content/drive/MyDrive/Universidad/bert_titlesummary_model/checkpoint-1403/config.json
Model weights saved in /content/drive/MyDrive/Universidad/bert_titlesummary_model/checkpoint-1403/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Universidad/bert_titlesummary_model/checkpoint-1403/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Universidad/bert_titlesummary_model/checkpoint-1403/special_tokens_map.json
The following columns in the evaluation set don't have a 

TrainOutput(global_step=5612, training_loss=0.4631930182681964, metrics={'train_runtime': 4019.7754, 'train_samples_per_second': 44.651, 'train_steps_per_second': 2.792, 'total_flos': 1.0423331194455216e+16, 'train_loss': 0.4631930182681964, 'epoch': 4.0})

In [134]:
import ast

In [159]:
def compute_accuracy_f(model_file,data,column):
  with torch.no_grad():
    accuracy = 0
    real_accuracy = 0
    tokenizer = AutoTokenizer.from_pretrained(model_file,local_files_only=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_file,local_files_only=True)
    for summary,label,rest_labels in zip(data['summary'],data.main_theme,data.rest_themes):

      inputs = tokenizer(summary, return_tensors="pt")
      logits = model(**inputs).logits
      predicted_class_id = logits.argmax().item()
      predited_label = model.config.id2label[predicted_class_id]
      if label == predited_label: 
        
        accuracy += 1
        real_accuracy += 1
      elif predited_label in ast.literal_eval(rest_labels): 

        real_accuracy +=1
  return accuracy/len(data), real_accuracy/len(data)


In [None]:
compute_accuracy_f("./bert_notclean_summary_model/checkpoint-4209",test,'summary')

In [None]:
compute_accuracy_f("./bert_titlesummary_model/checkpoint-4209",test,'title&summary')

In [None]:
compute_accuracy_f("./bert_clean_summary_model/checkpoint-4209",test,'cleaned_summary_text')