In [1]:
! pip install nltk textblob deep_translator pandas scrapy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from deep_translator import GoogleTranslator
from textblob import TextBlob

# Descargar los recursos necesarios de NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/md/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/md/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import threading

stop_words = set(stopwords.words('english'))

def filtrar_texto(texto):
    palabras = word_tokenize(texto)
    palabras_filtradas = [palabra for palabra in palabras if palabra not in stop_words and TextBlob(palabra).sentiment.polarity != 0]
    return ' '.join(palabras_filtradas)

def dividir_texto(texto, limite_caracteres=4000):
    palabras = texto.split()
    secciones = []
    current_section = palabras[0]

    for palabra in palabras[1:]:
        if len(current_section) + len(palabra) + 1 <= limite_caracteres:
            current_section += ' ' + palabra
        else:
            secciones.append(current_section)
            current_section = palabra

    secciones.append(current_section)
    return secciones

def translate(seccion, translated_text_list):
    translated_text_list.append(GoogleTranslator(source='auto', target='en').translate(seccion))

i = 0
def preprocesar_texto(discurso):
    global i
    i+=1
    secciones = dividir_texto(discurso)
    threads = []
    lista_traducida = []
    for seccion in secciones:
        thread = threading.Thread(target=translate, args=(seccion, lista_traducida))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    texto_traducido = ' '.join(lista_traducida)
    print("Fila preprocesada: ", i)
    return filtrar_texto(texto_traducido)

In [4]:
import pandas as pd
df = pd.read_csv("discursos.csv")

In [5]:
import concurrent.futures

def preprocesar_df(df):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        df["palabras_claves"] = list(executor.map(preprocesar_texto, df["discurso"]))

preprocesar_df(df)

df

Fila preprocesada:  16
Fila preprocesada:  17
Fila preprocesada:  18
Fila preprocesada:  19
Fila preprocesada:  20
Fila preprocesada:  21
Fila preprocesada:  22
Fila preprocesada:  23
Fila preprocesada:  24
Fila preprocesada:  25
Fila preprocesada:  26
Fila preprocesada:  27
Fila preprocesada:  28
Fila preprocesada:  29
Fila preprocesada:  30
Fila preprocesada:  31
Fila preprocesada:  32
Fila preprocesada:  33
Fila preprocesada:  34
Fila preprocesada:  35
Fila preprocesada:  36
Fila preprocesada:  37
Fila preprocesada:  38
Fila preprocesada:  39
Fila preprocesada:  40
Fila preprocesada:  41
Fila preprocesada:  42
Fila preprocesada:  43
Fila preprocesada:  44
Fila preprocesada:  45
Fila preprocesada:  46
Fila preprocesada:  47
Fila preprocesada:  48
Fila preprocesada:  49
Fila preprocesada:  50
Fila preprocesada:  51
Fila preprocesada:  52
Fila preprocesada:  53
Fila preprocesada:  54
Fila preprocesada:  55
Fila preprocesada:  56
Fila preprocesada:  56
Fila preprocesada:  58
Fila prepro

Unnamed: 0,orador,fecha,discurso,palabras_claves
0,Alberto Fernández,05/10/2023,de hablar. Yo les propongo un poquito reflexi...,insulting live drunk famous Honestly full bett...
1,Alberto Fernández,30/08/2023,Buen día a todos y todas. ¿Cómo les va? ¿Cómo ...,difficult many closed closed much much much be...
2,Alberto Fernández,06/10/2023,"Palabras del presidente de la Nación, Alberto ...",new first good welcome joy particular characte...
3,Alberto Fernández,05/10/2023,Buenos a todos y a todas. Gracias a quienes no...,experienced ethical good bad mental young whol...
4,Alberto Fernández,30/08/2023,Hay dos cosas que quería contarte que pasaron ...,important first closed hard important economic...
...,...,...,...,...
1293,Alberto Fernández,29/08/2023,"Ayer, el ministro Sergio Massa te contó la ser...",single less little better difficult many socia...
1294,Alberto Fernández,19/09/2023,Buenas tardes Señor Presidente. El mundo está ...,Forced completely greater win hate relevant po...
1295,Alberto Fernández,29/08/2023,"MARCÓ DEL PONT. - A nosotros nos parece que, e...",important Economic Social first common common ...
1296,Alberto Fernández,25/09/2023,"Es un momento para reflexionar, un poco, sobre...",little difficult marked first economic able im...


In [21]:
'''
Archivo obtenido de:
https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions 
(anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). 
The annotations were manually done by crowdsourcing.
'''

NRC_LEXICON = 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

emolex = {}
with open(NRC_LEXICON, 'r') as f:
    for line in f:
        word, emotion, flag = line.strip().split()
        if word not in emolex:
            emolex[word] = {}
        emolex[word][emotion] = int(flag)

def calcular_emociones(texto_preprocesado):
    emotions = {emotion: 0 for emotion in emolex[list(emolex.keys())[0]].keys()}
    for word in texto_preprocesado.split():
        if emolex.get(word.lower()):
            for emotion in emolex[word.lower()]:
                if emolex[word.lower()][emotion] == 1:
                    emotions[emotion] += emolex[word.lower()][emotion]
    return emotions

In [22]:
def asignar_emociones(df):
    for index, row in df.iterrows():
        emociones_dic = calcular_emociones(row["palabras_claves"])
        for emocion, valor in emociones_dic.items():
            df.loc[index, emocion] = valor

asignar_emociones(df)

In [9]:
df

Unnamed: 0,orador,fecha,discurso,palabras_claves,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,Alberto Fernández,05/10/2023,de hablar. Yo les propongo un poquito reflexi...,insulting live drunk famous Honestly full bett...,6.0,6.0,7.0,12.0,13.0,21.0,50.0,6.0,3.0,35.0
1,Alberto Fernández,30/08/2023,Buen día a todos y todas. ¿Cómo les va? ¿Cómo ...,difficult many closed closed much much much be...,2.0,9.0,2.0,5.0,14.0,3.0,17.0,1.0,4.0,12.0
2,Alberto Fernández,06/10/2023,"Palabras del presidente de la Nación, Alberto ...",new first good welcome joy particular characte...,0.0,1.0,0.0,0.0,3.0,0.0,3.0,0.0,1.0,1.0
3,Alberto Fernández,05/10/2023,Buenos a todos y a todas. Gracias a quienes no...,experienced ethical good bad mental young whol...,2.0,5.0,4.0,3.0,4.0,6.0,15.0,3.0,2.0,9.0
4,Alberto Fernández,30/08/2023,Hay dos cosas que quería contarte que pasaron ...,important first closed hard important economic...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1293,Alberto Fernández,29/08/2023,"Ayer, el ministro Sergio Massa te contó la ser...",single less little better difficult many socia...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,Alberto Fernández,19/09/2023,Buenas tardes Señor Presidente. El mundo está ...,Forced completely greater win hate relevant po...,4.0,5.0,3.0,6.0,6.0,9.0,19.0,4.0,3.0,10.0
1295,Alberto Fernández,29/08/2023,"MARCÓ DEL PONT. - A nosotros nos parece que, e...",important Economic Social first common common ...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
1296,Alberto Fernández,25/09/2023,"Es un momento para reflexionar, un poco, sobre...",little difficult marked first economic able im...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0


In [10]:
df.to_csv('discursos_emociones.csv', index=False)

In [23]:
columnas_seleccionadas = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']
df_seleccionado = df[columnas_seleccionadas]

# Calcular el máximo de las columnas seleccionadas
maximos = df_seleccionado.max()

# Calcular el promedio de las columnas seleccionadas
promedios = df_seleccionado.mean()

# Calcular la moda de las columnas seleccionadas
modas = df_seleccionado.mode().iloc[0]

# Imprimir los resultados
print("Máximos:")
print(maximos)
print("\nPromedios:")
print(promedios)
print("\nModas:")
print(modas)

Máximos:
anger            18.0
anticipation     32.0
disgust          17.0
fear             26.0
joy              41.0
negative         39.0
positive        121.0
sadness          19.0
surprise         18.0
trust            75.0
dtype: float64

Promedios:
anger            1.266564
anticipation     5.461479
disgust          1.320493
fear             2.458398
joy              8.119414
negative         3.313559
positive        14.932203
sadness          1.597072
surprise         3.673344
trust            9.796610
dtype: float64

Modas:
anger            0.0
anticipation     3.0
disgust          0.0
fear             0.0
joy              6.0
negative         0.0
positive        10.0
sadness          0.0
surprise         2.0
trust            7.0
Name: 0, dtype: float64
