In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import RobertaTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import gc
import os
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer
import logging
import time
import requests
from tqdm import tqdm

In [4]:
from transformers import RobertaForSequenceClassification

# Configuración para el uso de la GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Haciendo logging para monitorear errores
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers = [
                        logging.StreamHandler(),
                    ]
)

# Cargar el tokenizador y el modelo localmente en la GPU
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model.to(device)  # Mover el modelo a la GPU

# Función para dividir el texto en chunks respetando el límite de tokens
def chunk_text(text, tokenizer, chunk_size=512):
    tokens = tokenizer(text, truncation=True, max_length=chunk_size, return_tensors='pt')
    input_ids = tokens.input_ids[0]
    for i in range(0, len(input_ids), chunk_size):
        chunk_ids = input_ids[i:i + chunk_size]
        yield tokenizer.decode(chunk_ids, skip_special_tokens=True)

# Función para guardar el progreso en CSV
def guardar_progreso(df, ruta_guardado):
    if not df.empty:
        try:
            logging.info("Guardando el progreso...")
            arch_existe = os.path.isfile(ruta_guardado)
            df.to_csv(ruta_guardado, mode='a', header=not arch_existe, index=False)
            logging.info(f"Se han guardado {len(df)} instancias correctamente")
        except Exception as e:
            logging.error(f"Ha habido un error al guardar el progreso: {e}")
            logging.info(f"{len(df)} instancias no guardadas")

# Función para analizar los sentimientos en el dataframe en chunks
def analyze_sentiments_chunked(df, tokenizer, model, chunk_size=512, save_threshold=10000, process_chunk_size=5000):
    processed_count = 0
    ch_num = 0
    # Procesar el dataframe en chunks de `process_chunk_size`
    for start in range(0, len(df), process_chunk_size):
        ch_num += 1
        end = min(start + process_chunk_size, len(df))
        chunk_df = df.iloc[start:end]

        # defining lists
        sentiment_list = []
        score_list = []
        negative_scores = []
        neutral_scores = []
        positive_scores = []

        logging.info(f"Analyzing chunk n.{ch_num}")
        print(f"Procesando chunk-group n.{ch_num}")
        
        for idx, text in enumerate(chunk_df['text']):
            chunks = list(chunk_text(text, tokenizer, chunk_size=chunk_size))
            overall_sentiment = None
            max_score = -1  # Inicializar a un valor bajo
            
            for chunk in chunks:
                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
                
                # Realizar la inferencia en la GPU
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    probs = torch.nn.functional.softmax(logits, dim=-1)
                    sentiment_score, sentiment_idx = torch.max(probs, dim=-1)
                
                label = model.config.id2label[sentiment_idx.item()]
                
                # Si el score es mayor que el máximo actual, actualiza
                if sentiment_score.item() > max_score:
                    max_score = sentiment_score.item()
                    overall_sentiment = label
            
            sentiment_list.append(overall_sentiment)
            score_list.append(max_score)
        
        # Asignar sentimientos y puntajes al chunk
        df.loc[start:end-1, 'sentiment'] = sentiment_list
        df.loc[start:end-1, 'score'] = score_list
        processed_count += len(chunk_df)
        
        # Guardar el progreso después de cada `save_threshold` filas
        if processed_count >= save_threshold:
            guardar_progreso(df.iloc[start:end], ruta_guardado)
            processed_count = 0
    
    # Guardar el progreso final
    guardar_progreso(df, ruta_guardado)

# Ruta para guardar el archivo de progreso
ruta_guardado = r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_2nda_corrida_3sentiments.csv"

Usando dispositivo: cuda




In [5]:
df = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Bipolar\df_bipolar_full.csv")
analyze_sentiments_chunked(df, tokenizer, model)

2024-09-10 16:06:28,693 - INFO - Analyzing chunk n.1


Procesando chunk-group n.1


2024-09-10 16:07:33,232 - INFO - Analyzing chunk n.2


Procesando chunk-group n.2


2024-09-10 16:08:34,637 - INFO - Guardando el progreso...
2024-09-10 16:08:34,672 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:08:34,672 - INFO - Analyzing chunk n.3


Procesando chunk-group n.3


2024-09-10 16:09:32,829 - INFO - Analyzing chunk n.4


Procesando chunk-group n.4


2024-09-10 16:10:30,940 - INFO - Guardando el progreso...
2024-09-10 16:10:30,984 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:10:30,984 - INFO - Analyzing chunk n.5


Procesando chunk-group n.5


2024-09-10 16:11:33,683 - INFO - Analyzing chunk n.6


Procesando chunk-group n.6


2024-09-10 16:12:35,892 - INFO - Guardando el progreso...
2024-09-10 16:12:35,935 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:12:35,937 - INFO - Analyzing chunk n.7


Procesando chunk-group n.7


2024-09-10 16:13:39,612 - INFO - Analyzing chunk n.8


Procesando chunk-group n.8


2024-09-10 16:14:38,935 - INFO - Guardando el progreso...
2024-09-10 16:14:38,981 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:14:38,982 - INFO - Analyzing chunk n.9


Procesando chunk-group n.9


2024-09-10 16:15:41,378 - INFO - Analyzing chunk n.10


Procesando chunk-group n.10


2024-09-10 16:16:38,735 - INFO - Guardando el progreso...
2024-09-10 16:16:38,770 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:16:38,771 - INFO - Analyzing chunk n.11


Procesando chunk-group n.11


2024-09-10 16:17:33,101 - INFO - Analyzing chunk n.12


Procesando chunk-group n.12


2024-09-10 16:18:28,128 - INFO - Guardando el progreso...
2024-09-10 16:18:28,163 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:18:28,164 - INFO - Analyzing chunk n.13


Procesando chunk-group n.13


2024-09-10 16:19:23,034 - INFO - Analyzing chunk n.14


Procesando chunk-group n.14


2024-09-10 16:20:18,422 - INFO - Guardando el progreso...
2024-09-10 16:20:18,463 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:20:18,464 - INFO - Analyzing chunk n.15


Procesando chunk-group n.15


2024-09-10 16:21:14,026 - INFO - Analyzing chunk n.16


Procesando chunk-group n.16


2024-09-10 16:22:09,274 - INFO - Guardando el progreso...
2024-09-10 16:22:09,318 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:22:09,319 - INFO - Analyzing chunk n.17


Procesando chunk-group n.17


2024-09-10 16:22:52,593 - INFO - Guardando el progreso...
2024-09-10 16:22:53,216 - INFO - Se han guardado 84045 instancias correctamente


In [7]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)  

True
2.4.1+cu118


In [3]:
import pandas as pd
df_corrido = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_primera_corrida.csv")
df_corrido

Unnamed: 0.1,Unnamed: 0,text,submission_type,subreddit,labels,sentiment,score
0,0,Media: Nobody knows what kamala is about\n\nMe...,comment,politics,Democrat,LABEL_1,0.564157
1,1,NYT breaking news that Netanyahu has agreed to...,comment,politics,Democrat,LABEL_2,0.930298
2,2,I was thinking this morning about how freaking...,comment,politics,Democrat,LABEL_1,0.433811
3,3,I was on a break talking to a 25-year old cowo...,comment,politics,Republican,LABEL_0,0.885671
4,4,Went to the Kamala rally today in WI and it wa...,comment,politics,Democrat,LABEL_2,0.604938
...,...,...,...,...,...,...,...
84040,84040,"> No, Some Republicans aren’t motivated to vot...",comment,Republican,Republican,LABEL_1,0.482519
84041,84041,That would apply if Trump hadn’t won and lost....,comment,Republican,Republican,LABEL_1,0.593608
84042,84042,How many pardons has trump issued?,comment,Republican,Republican,LABEL_1,0.729522
84043,84043,Should post this all over the subs who keep bl...,comment,Republican,Democrat,LABEL_0,0.776307


ahora pa que funcionen las 3 cosas:

In [8]:
from transformers import RobertaForSequenceClassification

# Configuración para el uso de la GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Haciendo logging para monitorear errores
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers = [
                        logging.StreamHandler(),
                    ]
)

# Cargar el tokenizador y el modelo localmente en la GPU
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model.to(device)  # Mover el modelo a la GPU

# Función para dividir el texto en chunks respetando el límite de tokens
def chunk_text(text, tokenizer, chunk_size=512):
    tokens = tokenizer(text, truncation=True, max_length=chunk_size, return_tensors='pt')
    input_ids = tokens.input_ids[0]
    for i in range(0, len(input_ids), chunk_size):
        chunk_ids = input_ids[i:i + chunk_size]
        yield tokenizer.decode(chunk_ids, skip_special_tokens=True)

# Función para guardar el progreso en CSV
def guardar_progreso(df, ruta_guardado):
    if not df.empty:
        try:
            logging.info("Guardando el progreso...")
            arch_existe = os.path.isfile(ruta_guardado)
            df.to_csv(ruta_guardado, mode='a', header=not arch_existe, index=False)
            logging.info(f"Se han guardado {len(df)} instancias correctamente")
        except Exception as e:
            logging.error(f"Ha habido un error al guardar el progreso: {e}")
            logging.info(f"{len(df)} instancias no guardadas")

# Función para analizar los sentimientos en el dataframe en chunks
def analyze_sentiments_chunked(df, tokenizer, model, chunk_size=512, save_threshold=10000, process_chunk_size=5000):
    processed_count = 0
    ch_num = 0
    # Procesar el dataframe en chunks de `process_chunk_size`
    for start in range(0, len(df), process_chunk_size):
        ch_num += 1
        end = min(start + process_chunk_size, len(df))
        chunk_df = df.iloc[start:end]

        # defining lists for all sentiment labels and scores
        sentiment_list = []
        score_list = []
        negative_scores = []
        neutral_scores = []
        positive_scores = []

        logging.info(f"Analyzing chunk n.{ch_num}")
        print(f"Procesando chunk-group n.{ch_num}")
        
        for idx, text in enumerate(chunk_df['text']):
            chunks = list(chunk_text(text, tokenizer, chunk_size=chunk_size))
            overall_sentiment = None
            max_score = -1  # Inicializar a un valor bajo
            neg_score, neu_score, pos_score = 0, 0, 0  # Initialize the score variables
            
            for chunk in chunks:
                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
                
                # Realizar la inferencia en la GPU
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    probs = torch.nn.functional.softmax(logits, dim=-1)
                
                # Extract sentiment scores for each label
                neg_score += probs[0][0].item()
                neu_score += probs[0][1].item()
                pos_score += probs[0][2].item()

            # Normalizing the sentiment scores across chunks
            num_chunks = len(chunks)
            neg_score /= num_chunks
            neu_score /= num_chunks
            pos_score /= num_chunks

            # Assign the label with the highest score
            sentiment_scores = [neg_score, neu_score, pos_score]
            overall_sentiment = ['Negative', 'Neutral', 'Positive'][sentiment_scores.index(max(sentiment_scores))]
            
            # Append the results
            sentiment_list.append(overall_sentiment)
            score_list.append(max(sentiment_scores))
            negative_scores.append(neg_score)
            neutral_scores.append(neu_score)
            positive_scores.append(pos_score)
        
        # Asignar sentimientos y puntajes al chunk
        df.loc[start:end-1, 'sentiment'] = sentiment_list
        df.loc[start:end-1, 'max_score'] = score_list
        df.loc[start:end-1, 'negative_score'] = negative_scores
        df.loc[start:end-1, 'neutral_score'] = neutral_scores
        df.loc[start:end-1, 'positive_score'] = positive_scores

        processed_count += len(chunk_df)
        
        # Guardar el progreso después de cada `save_threshold` filas
        if processed_count >= save_threshold:
            guardar_progreso(df.iloc[start:end], ruta_guardado)
            processed_count = 0
    
    # Guardar el progreso final
    guardar_progreso(df, ruta_guardado)

# Ruta para guardar el archivo de progreso
ruta_guardado = r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_2nda_corrida_3sentiments.csv"


Usando dispositivo: cuda




In [9]:
df = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Bipolar\df_bipolar_full.csv")
df_run = analyze_sentiments_chunked(df, tokenizer, model)

2024-09-10 16:32:00,447 - INFO - Analyzing chunk n.1


Procesando chunk-group n.1


2024-09-10 16:32:54,508 - INFO - Analyzing chunk n.2


Procesando chunk-group n.2


2024-09-10 16:33:47,516 - INFO - Guardando el progreso...
2024-09-10 16:33:47,566 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:33:47,567 - INFO - Analyzing chunk n.3


Procesando chunk-group n.3


2024-09-10 16:34:39,873 - INFO - Analyzing chunk n.4


Procesando chunk-group n.4


2024-09-10 16:35:32,857 - INFO - Guardando el progreso...
2024-09-10 16:35:32,914 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:35:32,915 - INFO - Analyzing chunk n.5


Procesando chunk-group n.5


2024-09-10 16:36:26,212 - INFO - Analyzing chunk n.6


Procesando chunk-group n.6


2024-09-10 16:37:20,795 - INFO - Guardando el progreso...
2024-09-10 16:37:20,851 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:37:20,852 - INFO - Analyzing chunk n.7


Procesando chunk-group n.7


2024-09-10 16:38:18,418 - INFO - Analyzing chunk n.8


Procesando chunk-group n.8


2024-09-10 16:39:11,889 - INFO - Guardando el progreso...
2024-09-10 16:39:11,946 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:39:11,947 - INFO - Analyzing chunk n.9


Procesando chunk-group n.9


2024-09-10 16:40:05,220 - INFO - Analyzing chunk n.10


Procesando chunk-group n.10


2024-09-10 16:40:57,508 - INFO - Guardando el progreso...
2024-09-10 16:40:57,554 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:40:57,554 - INFO - Analyzing chunk n.11


Procesando chunk-group n.11


2024-09-10 16:41:49,910 - INFO - Analyzing chunk n.12


Procesando chunk-group n.12


2024-09-10 16:42:42,011 - INFO - Guardando el progreso...
2024-09-10 16:42:42,062 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:42:42,063 - INFO - Analyzing chunk n.13


Procesando chunk-group n.13


2024-09-10 16:43:34,361 - INFO - Analyzing chunk n.14


Procesando chunk-group n.14


2024-09-10 16:44:26,952 - INFO - Guardando el progreso...
2024-09-10 16:44:27,001 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:44:27,002 - INFO - Analyzing chunk n.15


Procesando chunk-group n.15


2024-09-10 16:45:19,889 - INFO - Analyzing chunk n.16


Procesando chunk-group n.16


2024-09-10 16:46:14,056 - INFO - Guardando el progreso...
2024-09-10 16:46:14,107 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 16:46:14,108 - INFO - Analyzing chunk n.17


Procesando chunk-group n.17


2024-09-10 16:46:56,564 - INFO - Guardando el progreso...
2024-09-10 16:46:57,352 - INFO - Se han guardado 84045 instancias correctamente


In [11]:
df_run = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_2nda_corrida_3sentiments.csv")
df_run

Unnamed: 0,text,submission_type,subreddit,labels,sentiment,max_score,negative_score,neutral_score,positive_score
0,It’s a bunch of conservatives looking for rede...,comment,politics,Democrat,Negative,0.882348,0.882348,0.108581,0.009071
1,The best part of all this is the maga cult hat...,comment,politics,Republican,Negative,0.912243,0.912243,0.082726,0.005031
2,They will grab onto anything to make trump int...,comment,politics,Republican,Negative,0.863834,0.863834,0.121701,0.014466
3,I got into an argument with a guy last night t...,comment,politics,Republican,Negative,0.889723,0.889723,0.107158,0.003119
4,What's amazing is that Trump's 34 felonies are...,comment,politics,Republican,Negative,0.835902,0.835902,0.152027,0.012071
...,...,...,...,...,...,...,...,...,...
124040,"> No, Some Republicans aren’t motivated to vot...",comment,Republican,Republican,Neutral,0.482519,0.284823,0.482519,0.232658
124041,That would apply if Trump hadn’t won and lost....,comment,Republican,Republican,Neutral,0.593608,0.336064,0.593608,0.070328
124042,How many pardons has trump issued?,comment,Republican,Republican,Neutral,0.729522,0.210550,0.729522,0.059928
124043,Should post this all over the subs who keep bl...,comment,Republican,Democrat,Negative,0.776307,0.776307,0.209625,0.014067


In [None]:
df_run.shape

In [13]:
df_run.duplicated().sum()

40000

In [14]:
df_run = df_run.drop_duplicates()
df_run.shape

(84045, 9)

Arreglando los duplicados y quitando columnas irrelevantes

In [15]:
import pandas as pd
import logging
import os
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer

# Configuración para el uso de la GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Haciendo logging para monitorear errores
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.StreamHandler()])

# Cargar el tokenizador y el modelo localmente en la GPU
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model.to(device)  # Mover el modelo a la GPU

# Función para dividir el texto en chunks respetando el límite de tokens
def chunk_text(text, tokenizer, chunk_size=512):
    tokens = tokenizer(text, truncation=True, max_length=chunk_size, return_tensors='pt')
    input_ids = tokens.input_ids[0]
    for i in range(0, len(input_ids), chunk_size):
        chunk_ids = input_ids[i:i + chunk_size]
        yield tokenizer.decode(chunk_ids, skip_special_tokens=True)

# Función para guardar el progreso en CSV
def guardar_progreso(df, ruta_guardado):
    if not df.empty:
        try:
            logging.info("Guardando el progreso...")
            arch_existe = os.path.isfile(ruta_guardado)
            df.to_csv(ruta_guardado, mode='a', header=not arch_existe, index=False)
            logging.info(f"Se han guardado {len(df)} instancias correctamente")
        except Exception as e:
            logging.error(f"Ha habido un error al guardar el progreso: {e}")
            logging.info(f"{len(df)} instancias no guardadas")

# Función para analizar los sentimientos en el dataframe en chunks
def analyze_sentiments_chunked(df, tokenizer, model, chunk_size=512, save_threshold=10000, process_chunk_size=5000):
    processed_count = 0
    ch_num = 0
    # Procesar el dataframe en chunks de `process_chunk_size`
    for start in range(0, len(df), process_chunk_size):
        ch_num += 1
        end = min(start + process_chunk_size, len(df))
        chunk_df = df.iloc[start:end]

        # Definir listas
        sentiment_list = []
        negative_scores = []
        neutral_scores = []
        positive_scores = []

        logging.info(f"Analyzing chunk n.{ch_num}")
        print(f"Procesando chunk-group n.{ch_num}")

        for idx, text in enumerate(chunk_df['text']):
            chunks = list(chunk_text(text, tokenizer, chunk_size=chunk_size))
            overall_sentiment = None

            # Inicializar scores para cada chunk
            negative_score_total = 0
            neutral_score_total = 0
            positive_score_total = 0
            
            for chunk in chunks:
                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)

                # Realizar la inferencia en la GPU
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    probs = torch.nn.functional.softmax(logits, dim=-1)

                # Sumar los scores de cada chunk
                negative_score_total += probs[0][0].item()
                neutral_score_total += probs[0][1].item()
                positive_score_total += probs[0][2].item()

            # Agregar los scores promediados
            negative_scores.append(negative_score_total / len(chunks))
            neutral_scores.append(neutral_score_total / len(chunks))
            positive_scores.append(positive_score_total / len(chunks))

            # Determinar el sentimiento principal
            if max(negative_score_total, neutral_score_total, positive_score_total) == negative_score_total:
                sentiment_list.append("Negative")
            elif max(negative_score_total, neutral_score_total, positive_score_total) == neutral_score_total:
                sentiment_list.append("Neutral")
            else:
                sentiment_list.append("Positive")

        # Asignar sentimientos y puntajes al chunk
        df.loc[start:end-1, 'sentiment'] = sentiment_list
        df.loc[start:end-1, 'negative_score'] = negative_scores
        df.loc[start:end-1, 'neutral_score'] = neutral_scores
        df.loc[start:end-1, 'positive_score'] = positive_scores
        processed_count += len(chunk_df)

        # Guardar el progreso después de cada `save_threshold` filas
        if processed_count >= save_threshold:
            
            # Remover duplicados antes de guardar
            df.drop_duplicates(inplace=True)
            guardar_progreso(df.iloc[start:end], ruta_guardado)
            processed_count = 0

    # Guardar el progreso final y remover duplicados
    df.drop_duplicates(inplace=True)
    guardar_progreso(df, ruta_guardado)

# Ruta para guardar el archivo de progreso
ruta_guardado = r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_3era_corrida_3sentiments.csv"


Usando dispositivo: cuda




In [17]:
df = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Bipolar\df_bipolar_full.csv")
analyze_sentiments_chunked(df, tokenizer, model)

2024-09-10 17:00:02,503 - INFO - Analyzing chunk n.1


Procesando chunk-group n.1


2024-09-10 17:00:56,521 - INFO - Analyzing chunk n.2


Procesando chunk-group n.2


2024-09-10 17:01:49,989 - INFO - Guardando el progreso...
2024-09-10 17:01:50,038 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:01:50,041 - INFO - Analyzing chunk n.3


Procesando chunk-group n.3


2024-09-10 17:02:42,185 - INFO - Analyzing chunk n.4


Procesando chunk-group n.4


2024-09-10 17:03:34,837 - INFO - Guardando el progreso...
2024-09-10 17:03:34,887 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:03:34,890 - INFO - Analyzing chunk n.5


Procesando chunk-group n.5


2024-09-10 17:04:27,357 - INFO - Analyzing chunk n.6


Procesando chunk-group n.6


2024-09-10 17:05:19,744 - INFO - Guardando el progreso...
2024-09-10 17:05:19,789 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:05:19,791 - INFO - Analyzing chunk n.7


Procesando chunk-group n.7


2024-09-10 17:06:12,751 - INFO - Analyzing chunk n.8


Procesando chunk-group n.8


2024-09-10 17:07:05,635 - INFO - Guardando el progreso...
2024-09-10 17:07:05,683 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:07:05,686 - INFO - Analyzing chunk n.9


Procesando chunk-group n.9


2024-09-10 17:07:58,413 - INFO - Analyzing chunk n.10


Procesando chunk-group n.10


2024-09-10 17:08:50,608 - INFO - Guardando el progreso...
2024-09-10 17:08:50,649 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:08:50,651 - INFO - Analyzing chunk n.11


Procesando chunk-group n.11


2024-09-10 17:09:43,069 - INFO - Analyzing chunk n.12


Procesando chunk-group n.12


2024-09-10 17:10:35,027 - INFO - Guardando el progreso...
2024-09-10 17:10:35,068 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:10:35,070 - INFO - Analyzing chunk n.13


Procesando chunk-group n.13


2024-09-10 17:11:28,128 - INFO - Analyzing chunk n.14


Procesando chunk-group n.14


2024-09-10 17:12:20,713 - INFO - Guardando el progreso...
2024-09-10 17:12:20,762 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:12:20,765 - INFO - Analyzing chunk n.15


Procesando chunk-group n.15


2024-09-10 17:13:13,332 - INFO - Analyzing chunk n.16


Procesando chunk-group n.16


2024-09-10 17:14:06,301 - INFO - Guardando el progreso...
2024-09-10 17:14:06,349 - INFO - Se han guardado 5000 instancias correctamente
2024-09-10 17:14:06,351 - INFO - Analyzing chunk n.17


Procesando chunk-group n.17


2024-09-10 17:14:48,169 - INFO - Guardando el progreso...
2024-09-10 17:14:48,879 - INFO - Se han guardado 84045 instancias correctamente


In [1]:
import pandas as pd
df = pd.read_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_3era_corrida_3sentiments.csv")
df

Unnamed: 0,text,submission_type,subreddit,labels,sentiment,negative_score,neutral_score,positive_score
0,It’s a bunch of conservatives looking for rede...,comment,politics,Democrat,Negative,0.882348,0.108581,0.009071
1,The best part of all this is the maga cult hat...,comment,politics,Republican,Negative,0.912243,0.082726,0.005031
2,They will grab onto anything to make trump int...,comment,politics,Republican,Negative,0.863834,0.121701,0.014466
3,I got into an argument with a guy last night t...,comment,politics,Republican,Negative,0.889723,0.107158,0.003119
4,What's amazing is that Trump's 34 felonies are...,comment,politics,Republican,Negative,0.835902,0.152027,0.012071
...,...,...,...,...,...,...,...,...
124040,"> No, Some Republicans aren’t motivated to vot...",comment,Republican,Republican,Neutral,0.284823,0.482519,0.232658
124041,That would apply if Trump hadn’t won and lost....,comment,Republican,Republican,Neutral,0.336064,0.593608,0.070328
124042,How many pardons has trump issued?,comment,Republican,Republican,Neutral,0.210550,0.729522,0.059928
124043,Should post this all over the subs who keep bl...,comment,Republican,Democrat,Negative,0.776307,0.209625,0.014067


In [2]:
df = df.drop_duplicates()
df = df.reset_index()
df

Unnamed: 0,index,text,submission_type,subreddit,labels,sentiment,negative_score,neutral_score,positive_score
0,0,It’s a bunch of conservatives looking for rede...,comment,politics,Democrat,Negative,0.882348,0.108581,0.009071
1,1,The best part of all this is the maga cult hat...,comment,politics,Republican,Negative,0.912243,0.082726,0.005031
2,2,They will grab onto anything to make trump int...,comment,politics,Republican,Negative,0.863834,0.121701,0.014466
3,3,I got into an argument with a guy last night t...,comment,politics,Republican,Negative,0.889723,0.107158,0.003119
4,4,What's amazing is that Trump's 34 felonies are...,comment,politics,Republican,Negative,0.835902,0.152027,0.012071
...,...,...,...,...,...,...,...,...,...
84040,124040,"> No, Some Republicans aren’t motivated to vot...",comment,Republican,Republican,Neutral,0.284823,0.482519,0.232658
84041,124041,That would apply if Trump hadn’t won and lost....,comment,Republican,Republican,Neutral,0.336064,0.593608,0.070328
84042,124042,How many pardons has trump issued?,comment,Republican,Republican,Neutral,0.210550,0.729522,0.059928
84043,124043,Should post this all over the subs who keep bl...,comment,Republican,Democrat,Negative,0.776307,0.209625,0.014067


In [3]:
df = df.drop(columns=['index', 'submission_type', 'subreddit'])
df

Unnamed: 0,text,labels,sentiment,negative_score,neutral_score,positive_score
0,It’s a bunch of conservatives looking for rede...,Democrat,Negative,0.882348,0.108581,0.009071
1,The best part of all this is the maga cult hat...,Republican,Negative,0.912243,0.082726,0.005031
2,They will grab onto anything to make trump int...,Republican,Negative,0.863834,0.121701,0.014466
3,I got into an argument with a guy last night t...,Republican,Negative,0.889723,0.107158,0.003119
4,What's amazing is that Trump's 34 felonies are...,Republican,Negative,0.835902,0.152027,0.012071
...,...,...,...,...,...,...
84040,"> No, Some Republicans aren’t motivated to vot...",Republican,Neutral,0.284823,0.482519,0.232658
84041,That would apply if Trump hadn’t won and lost....,Republican,Neutral,0.336064,0.593608,0.070328
84042,How many pardons has trump issued?,Republican,Neutral,0.210550,0.729522,0.059928
84043,Should post this all over the subs who keep bl...,Democrat,Negative,0.776307,0.209625,0.014067


In [4]:
# guardado
df.to_csv(r"C:\Users\34616\Documents\4GEEKS\datos_gordos\roBERTa results\df_3era_corrida_3sentiments_clean.csv")

### Results analysis

In [40]:
df['sentiment'].value_counts()

sentiment
Negative    54468
Neutral     25372
Positive     4205
Name: count, dtype: int64

In [43]:
print(f"Average Negative Score: {round(df['negative_score'].mean(), 2)}")
print(f"Average Neutral Score: {round(df['neutral_score'].mean(), 2)}")
print(f"Average Positive Score: {round(df['positive_score'].mean(), 2)}")

Average Negative Score: 0.55
Average Neutral Score: 0.36
Average Positive Score: 0.09


### Republicans vs Democrats analysis

Value Counts

In [26]:
democrats = df[df['labels'] == 'Democrat'] 
republicans = df[df['labels'] == 'Republican']

print("Democrats Sentiments: ")
print(democrats['sentiment'].value_counts())
print("Republican Sentiments: ")
print(republicans['sentiment'].value_counts())

Democrats Sentiments: 
sentiment
Negative    15459
Neutral     10744
Positive     2509
Name: count, dtype: int64
Republican Sentiments: 
sentiment
Negative    39009
Neutral     14628
Positive     1696
Name: count, dtype: int64


Percentages

In [46]:
print("Democrat Sentiments %")
print(f"Number of Democrat mentions: {len(democrats)} - ({round(len(democrats)/len(df)*100, 2)}% of total)")
print(f"Negative: {round((democrats['sentiment'].value_counts()['Negative'] / len(democrats)*100), 2)}%")
print(f"Neutral: {round(democrats['sentiment'].value_counts()['Neutral'] / len(democrats)*100, 2)}%")
print(f"Positive: {round(democrats['sentiment'].value_counts()['Positive'] / len(democrats)*100, 2)}%")

Democrat Sentiments %
Number of Democrat mentions: 28712 - (34.16% of total)
Negative: 53.84%
Neutral: 37.42%
Positive: 8.74%


In [47]:
print("Republican Sentiments %")
print(f"Number of Democrat mentions: {len(republicans)} - ({round(len(republicans)/len(df)*100, 2)}% of total)")
print(f"Negative: {round((republicans['sentiment'].value_counts()['Negative'] / len(republicans)*100), 2)}%")
print(f"Neutral: {round(republicans['sentiment'].value_counts()['Neutral'] / len(republicans)*100, 2)}%")
print(f"Positive: {round(republicans['sentiment'].value_counts()['Positive'] / len(republicans)*100, 2)}%")


Republican Sentiments %
Number of Democrat mentions: 55333 - (65.84% of total)
Negative: 70.5%
Neutral: 26.44%
Positive: 3.07%


Averages

In [44]:
print("Democrat Score Averages: ")
print(f"Average Negative Score: {round(democrats['negative_score'].mean(), 2)}")
print(f"Average Neutral Score: {round(democrats['neutral_score'].mean(), 2)}")
print(f"Average Positive Score: {round(democrats['positive_score'].mean(), 2)}")
print("")

Democrat Score Averages: 
Average Negative Score: 0.47
Average Neutral Score: 0.39
Average Positive Score: 0.13


In [45]:
print("Republican Score Averages: ")
print(f"Average Negative Score: {round(republicans['negative_score'].mean(), 2)}")
print(f"Average Neutral Score: {round(republicans['neutral_score'].mean(), 2)}")
print(f"Average Positive Score: {round(republicans['positive_score'].mean(), 2)}")

Republican Score Averages: 
Average Negative Score: 0.59
Average Neutral Score: 0.34
Average Positive Score: 0.07
