In [1]:
! pip install nltk textblob deep_translator pandas scrapy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from deep_translator import GoogleTranslator
from textblob import TextBlob

# Download the NLTK resources this notebook needs: the tokenizer models used
# by word_tokenize and the stop-word lists.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' models; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/md/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/md/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Set of English stop words from NLTK's stopwords corpus; filtrar_texto uses
# it to discard tokens before the polarity check.
stop_words = set(stopwords.words('english'))

def filtrar_texto(texto):
    """Keep only the sentiment-bearing words of an English text.

    The text is tokenized with ``word_tokenize``; a token is kept when it is
    not an English stop word and TextBlob assigns it a non-zero polarity.

    Args:
        texto: English text to filter.

    Returns:
        The surviving tokens, in their original order and casing, joined by
        single spaces (an empty string when nothing survives).
    """
    palabras_filtradas = [
        palabra
        for palabra in word_tokenize(texto)
        # Compare in lowercase: the stop-word set is built from NLTK's
        # lowercase list, so capitalised tokens (e.g. sentence-initial
        # "Very") would otherwise bypass the stop-word check.
        if palabra.lower() not in stop_words
        and TextBlob(palabra).sentiment.polarity != 0
    ]
    return ' '.join(palabras_filtradas)

def dividir_texto(texto, limite_caracteres=4000):
    """Split a text into whitespace-delimited chunks of bounded length.

    Words are packed greedily: a word is appended to the current chunk (after
    a single space) while the chunk stays within ``limite_caracteres``;
    otherwise a new chunk is started. Words are never cut, so a single word
    longer than the limit becomes a chunk of its own that exceeds the limit.

    Args:
        texto: Text to split. Runs of whitespace are collapsed.
        limite_caracteres: Maximum chunk length in characters.

    Returns:
        List of chunks in order; an empty list when ``texto`` contains no
        words (empty or whitespace-only input).
    """
    palabras = texto.split()
    if not palabras:
        # Nothing to split; the previous implementation raised IndexError here.
        return []

    secciones = []
    actual = []      # words of the chunk being built
    longitud = 0     # length of ' '.join(actual)

    for palabra in palabras:
        # A separating space is only needed when the chunk already has a word.
        extra = len(palabra) + (1 if actual else 0)
        if actual and longitud + extra > limite_caracteres:
            secciones.append(' '.join(actual))
            actual = [palabra]
            longitud = len(palabra)
        else:
            actual.append(palabra)
            longitud += extra

    secciones.append(' '.join(actual))
    return secciones


def preprocesar_texto(discurso):
    """Translate a speech to English and keep only its sentiment-bearing words.

    The speech is split with ``dividir_texto`` so each request stays under the
    translator's size limit, every chunk is translated to English with
    ``GoogleTranslator`` (source language auto-detected), and the joined
    translation is passed through ``filtrar_texto``.

    Args:
        discurso: Speech text in any language. Non-string values (such as the
            NaN pandas uses for missing CSV cells) and blank strings are
            treated as empty.

    Returns:
        Space-separated sentiment-bearing English words; ``''`` for empty or
        missing input.
    """
    # Guard here so missing/blank rows do not reach the splitter or the
    # translation service.
    if not isinstance(discurso, str) or not discurso.strip():
        return ''

    # One translator instance is enough; it does not depend on the chunk.
    traductor = GoogleTranslator(source='auto', target='en')

    traducciones = []
    for seccion in dividir_texto(discurso):
        traduccion = traductor.translate(seccion)
        # Skip empty/None results instead of failing on string concatenation.
        if traduccion:
            traducciones.append(traduccion)

    return filtrar_texto(' '.join(traducciones))

In [None]:
import pandas as pd
# Load the speeches and store, for each one, the keywords produced by
# preprocesar_texto in a new "palabras_claves" column.
df = pd.read_csv("discursos.csv")
df["palabras_claves"] = df["discurso"].map(preprocesar_texto)

In [None]:
# Show a bounded, richly rendered preview instead of printing the whole frame.
df.head()

In [None]:
# Lexicon file obtained from:
# https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
#
# The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions
# (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive).
# The annotations were manually done by crowdsourcing.

NRC_LEXICON = 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

# emolex maps word -> {emotion label: 0/1 flag}.
# emotions holds one counter per emotion label seen in the file, so the
# counting loop below can never hit a label that has no counter.
emolex = {}
emotions = {}
with open(NRC_LEXICON, 'r', encoding='utf-8') as f:
    for line in f:
        campos = line.split()
        # Skip blank or malformed lines instead of failing on tuple unpacking.
        if len(campos) != 3:
            continue
        word, emotion, flag = campos
        emolex.setdefault(word, {})[emotion] = int(flag)
        emotions.setdefault(emotion, 0)

# NOTE(review): texto_preprocesado is not defined in any visible cell of this
# notebook; presumably it is the output of preprocesar_texto for one speech
# (or the joined "palabras_claves" column) -- define it before running this cell.
for word in texto_preprocesado.split():
    # Try the token as written first, then its lowercase form, so capitalised
    # tokens (e.g. sentence-initial words) can still match lowercase entries.
    asociaciones = emolex.get(word) or emolex.get(word.lower(), {})
    for emotion, flag in asociaciones.items():
        if flag == 1:
            emotions[emotion] += 1

print("Emociones:", emotions)