In [23]:
import os

# Index of the GPU this notebook should run on; picked by hand because it was
# idle (0% usage) when the notebook was executed.
AVAILABLE_GPU = 1

# CUDA-related environment variables, set in the first cell so that they are
# in place before the cell that imports torch runs.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": str(AVAILABLE_GPU),
    "CUDA_LAUNCH_BLOCKING": "1",
})

# NOTE(review): once CUDA_VISIBLE_DEVICES exposes a single GPU, frameworks
# usually renumber it as device 0, so '/gpu:1' may not exist - confirm before
# relying on tf_device. It is not referenced by any other visible cell.
tf_device = '/gpu:' + str(AVAILABLE_GPU)

In [24]:
import sys

# Add ./Utils to the module search path (presumably where the DataManagement,
# Constants, Models and Utils modules imported below live - confirm).
# The path is relative, so it is resolved against the kernel's working
# directory. The membership check keeps a re-run of this cell from appending
# the same entry to sys.path again.
if './Utils' not in sys.path:
    sys.path.append('./Utils')

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch

from DataManagement import PeriodisticDataset
from DataManagement import collate_fn
from Constants import BATCH_SIZE, CATEGORIES_DICT
from Models import SentimentAnalysisPretrainedBert
from Models import SentimentAnalysis
from Utils import get_predictions

In [36]:
# --- Paths -------------------------------------------------------------------
# Location of the complete dataset (CSV).
DATASET_COMPLETO = 'Data/all_data.csv'

# Checkpoint file of the fine-tuned classifier. Despite its name, `model`
# holds a path string, not a model object; it is handed to torch.load below.
model = "StoredModels/Mejores_guardados_beto_cased.pt"

# Path passed as the first argument of SentimentAnalysisPretrainedBert.
BERT_MODEL = '../output/beto/old-spanish-beto-base-cased.pt'

# --- Runtime -----------------------------------------------------------------
# Device the ML model runs on: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Tokenizer fetched by name through transformers' AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

# Ask PyTorch to release the GPU memory it is caching before the model loads.
torch.cuda.empty_cache()

# Last expression of the cell: display the selected device.
DEVICE

device(type='cuda')

In [37]:
#@title Build the classifier and load the fine-tuned weights

# Instantiate the project's classifier from the checkpoint at BERT_MODEL and
# move it to DEVICE.
# NOTE(review): the meaning of the positional arguments 0.4, 4, 256, 128, True
# is defined by SentimentAnalysisPretrainedBert, which is not visible here -
# presumably dropout rate, number of classes, two layer sizes and a flag;
# confirm against Models.
classificator = SentimentAnalysisPretrainedBert(BERT_MODEL, 0.4, 4, 256, 128,  True).to(DEVICE)

# This notebook only runs inference, so switch the network to evaluation mode
# (disables dropout and similar train-only behaviour). Harmless if
# get_predictions already does this itself.
classificator.eval()

# Load a checkpoint that had already been fine-tuned.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# DEVICE is already a torch.device, so it is passed to map_location directly.
checkpoint_classifier = torch.load(
    model,
    map_location=DEVICE
)

# Kept as the last expression so the cell still displays the key-matching report.
classificator.load_state_dict(checkpoint_classifier['model_state_dict'])

<All keys matched successfully>

In [38]:
# Read the complete dataset into a DataFrame and preview its first rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' / 'Unnamed: 0' columns, which
# look like row indices written by earlier to_csv calls - confirm.
data = pd.read_csv(DATASET_COMPLETO)
data.head()

Unnamed: 0.1,Unnamed: 0,file,year,newspaper,city,text
0,0,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,publicacion semanal ignoramos si sus restos f...
1,1,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,se presente en su oficina el que monto en la g...
2,2,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"de baron de las tenazas, a don agustin castill..."
3,3,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"al articulo presente, en una oracion ferviente..."
4,4,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,quien es el que desde aca tantas noticias envi...


In [39]:
# Dataset wrapping every text in `data`. It is used for inference here: the
# visible notebook contains no training step.
# The array of zeros is presumably a placeholder for labels, which do not
# exist for this data - TODO confirm against PeriodisticDataset.
data_set = PeriodisticDataset(
    data.text.to_numpy(),
    np.zeros(len(data)),
    tokenizer
)

In [40]:
# DataLoader that batches `data_set` (BATCH_SIZE items per batch) using the
# project's collate_fn. `shuffle` is left at its default, so batches follow
# the dataset order.
dataset_loader = DataLoader(data_set, batch_size = BATCH_SIZE, collate_fn=collate_fn)

In [41]:
def categorize_prediction_probs(prediction_probs):
    '''
    Split per-sample class probabilities into one list per class.

    Args:
    - prediction_probs: Iterable of indexable rows. Positions 0, 1, 2 and 3 of
      each row are read as the negative, neutral, positive and ironic
      probability, in that order; each value is converted with float().

    Returns:
    - negatives: List with the negative probability of every sample.
    - neutral: List with the neutral probability of every sample.
    - positives: List with the positive probability of every sample.
    - ironic: List with the ironic probability of every sample.
    '''
    # Single pass over the input, so one-shot iterables are consumed only once.
    per_sample = [
        (float(row[0]), float(row[1]), float(row[2]), float(row[3]))
        for row in prediction_probs
    ]

    negatives = [scores[0] for scores in per_sample]
    neutral = [scores[1] for scores in per_sample]
    positives = [scores[2] for scores in per_sample]
    ironic = [scores[3] for scores in per_sample]
    return negatives, neutral, positives, ironic

In [42]:
def order_predictions(review_texts, data, predictions, predictions_written, negatives, neutral, positives, ironic):
    '''
    Write the model outputs into ``data``, aligned with the row each text
    belongs to.

    Each entry of ``review_texts`` is matched by exact equality against the
    'text' column of ``data``. Rows are addressed by position, so the frame
    does not need a default 0..n-1 index. When the same text occurs in several
    rows, successive predictions for that text fill successive occurrences.

    Args:
    - review_texts: Sequence of texts, in the order the predictions were made.
    - data: DataFrame with a 'text' column; modified in place.
    - predictions: Predicted labels, parallel to review_texts.
    - predictions_written: Predicted labels in written form, parallel to
      review_texts.
    - negatives: Negative probabilities, parallel to review_texts.
    - neutral: Neutral probabilities, parallel to review_texts.
    - positives: Positive probabilities, parallel to review_texts.
    - ironic: Ironic probabilities, parallel to review_texts.

    Returns:
    - None. ``data`` gains (or has overwritten) the columns
      'predictions_written', 'predictions', 'negative_probs', 'neutral_probs',
      'positive_probs' and 'ironic_probs'. Probabilities are stored multiplied
      by 100; 'predictions' is stored as float.

    Raises:
    - ValueError: if review_texts and data differ in length.
    - IndexError: if a text has no remaining unmatched row in data.
    '''
    from collections import defaultdict, deque

    n_rows = len(data)
    if len(review_texts) != n_rows:
        raise ValueError(
            f'Got {len(review_texts)} predicted texts for {n_rows} rows; '
            'every row of `data` needs exactly one prediction.'
        )

    # One pass over the frame: text -> queue of row positions holding it.
    # This replaces a full-frame scan per prediction and lets repeated texts
    # be assigned to distinct rows instead of all landing on the first one.
    positions_by_text = defaultdict(deque)
    for position, text in enumerate(data['text'].tolist()):
        positions_by_text[text].append(position)

    negatives_ordered = np.zeros(n_rows)
    neutral_ordered = np.zeros(n_rows)
    positive_ordered = np.zeros(n_rows)
    ironic_ordered = np.zeros(n_rows)

    # A plain list (not a fixed-width '<U20' array) so long labels are never
    # truncated.
    predictions_written_ordered = [''] * n_rows
    predictions_ordered = np.zeros(n_rows)

    for i, t in enumerate(review_texts):
        pending = positions_by_text.get(t)
        if not pending:
            raise IndexError(
                f'No unassigned row in `data` matches predicted text #{i}.'
            )
        row = pending.popleft()

        # Probabilities are stored as percentages.
        negatives_ordered[row] = negatives[i]*100
        neutral_ordered[row] = neutral[i]*100
        positive_ordered[row] = positives[i]*100
        ironic_ordered[row] = ironic[i]*100

        predictions_written_ordered[row] = predictions_written[i]
        predictions_ordered[row] = predictions[i]

    # Arrays and lists are assigned positionally, independent of data's index.
    data['predictions_written'] = predictions_written_ordered
    data['predictions'] = predictions_ordered
    data['negative_probs'] = negatives_ordered
    data['neutral_probs'] = neutral_ordered
    data['positive_probs'] = positive_ordered
    data['ironic_probs'] = ironic_ordered

In [43]:
def process_data(model, dataloader, data):
    '''
    Run the model over the dataloader and store its outputs in ``data``.

    Args:
    - model: The trained model used to produce predictions.
    - dataloader: Dataloader that feeds the model.
    - data: DataFrame that receives the prediction columns (modified in place
      by order_predictions).
    '''
    # get_predictions yields four values; the last one is not used here.
    outputs = get_predictions(model, dataloader)
    review_texts, predictions, prediction_probs, _real_values = outputs

    # One probability list per class.
    negatives, neutral, positives, ironic = categorize_prediction_probs(prediction_probs)

    # Translate each predicted label into its written category name.
    predictions_written = []
    for label in predictions:
        predictions_written.append(CATEGORIES_DICT[int(label)])

    order_predictions(
        review_texts,
        data,
        predictions,
        predictions_written,
        negatives,
        neutral,
        positives,
        ironic,
    )

In [44]:
#%%capture datos
# The line above is a disabled %%capture cell magic; left commented out so the
# progress bar stays visible.
# Tag `data` in place with the classifier's predictions, then preview the
# first rows including the new prediction columns.
process_data(classificator, dataset_loader, data)
data.head()

100%|███████████████████████████████████████| 4710/4710 [02:50<00:00, 27.67it/s]


Unnamed: 0.1,Unnamed: 0,file,year,newspaper,city,text,predictions_written,predictions,negative_probs,neutral_probs,positive_probs,ironic_probs
0,0,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,publicacion semanal ignoramos si sus restos f...,NEUTRO,1.0,9.99646,55.532199,33.521801,0.949539
1,1,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,se presente en su oficina el que monto en la g...,NEUTRO,1.0,17.339712,63.219225,18.328328,1.112728
2,2,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"de baron de las tenazas, a don agustin castill...",POSITIVO,2.0,7.05914,42.583779,49.531049,0.826027
3,3,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"al articulo presente, en una oracion ferviente...",POSITIVO,2.0,6.416973,21.443181,71.021354,1.118488
4,4,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,quien es el que desde aca tantas noticias envi...,NEUTRO,1.0,11.500288,60.579348,27.018249,0.902115


In [45]:
# Save the tagged DataFrame as a semicolon-separated CSV.
# NOTE(review): the row index is written too (to_csv default), which shows up
# as an extra unnamed column when the file is read back - confirm this is
# intended. This cell also runs before the de-duplication cell below.
data.to_csv('TaggedData/Beto_tagged.csv', sep=';')

In [14]:
# Remove the leftover 'Unnamed: 0' column, then drop rows that are exact
# duplicates. errors='ignore' keeps the cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
# `columns=` already selects the axis, so no separate axis argument is needed.
data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
data.drop_duplicates(inplace=True)

In [15]:
# Preview the frame after the column drop and de-duplication.
data.head()

Unnamed: 0,file,year,newspaper,city,text,predictions_written,predictions,negative_probs,neutral_probs,positive_probs,ironic_probs
0,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,publicacion semanal ignoramos si sus restos f...,NEUTRO,1.0,13.698448,38.375241,37.603813,10.322498
1,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,se presente en su oficina el que monto en la g...,NEUTRO,1.0,14.930455,53.223807,21.394636,10.451109
2,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"de baron de las tenazas, a don agustin castill...",NEUTRO,1.0,7.400613,44.595921,41.839513,6.163949
3,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,"al articulo presente, en una oracion ferviente...",POSITIVO,2.0,7.706013,26.470718,59.172243,6.651025
4,../../Datos/19th_century_Latam_Newspapers_1/PF...,1870-1871,Los loros,Popayán,quien es el que desde aca tantas noticias envi...,NEUTRO,1.0,10.124923,57.009411,24.979652,7.886012


In [16]:
# Number of rows left after de-duplication.
len(data)

18826