# Entrenamiento y uso del modelo LDA
En este notebook se cubriran los siguientes temas:
- Bigramas, trigramas y palabras comunes.
- Eliminación de palabras irrelevantes y/o poco comunes.
- Entrenamiento del modelo LDA.
- Visualización.
- Utilización del modelo LDA.

# Importando las librerías necesarias.

In [5]:
import numpy as np
import pandas as pd
import csv
import glob
import json

#Gensim

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel

#spacy Lemmatization processes

import nltk
import spacy
from nltk.corpus import stopwords

#Visualization


import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim

Aquí empezamos a preparar los datos para el entrenamiento del modelo.

In [26]:
def load_data(file):
    csv.field_size_limit(1000000)
    with open (file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        data = [row for row in reader]
    return data

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data,f,indent=4 )
        

In [27]:
stopwords = stopwords.words("spanish")

In [28]:
print(stopwords)

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 'e

In [29]:
data = load_data("clean_corpus.csv")

In [30]:
words = []
for row in data:
    words.append(row["text"])

In [31]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(words)

print (data_words[0][0:20])

['buen', 'dia', 'terminar', 'reunion', 'seguridad', 'informo', 'obtener', 'informacion', 'delito', 'cometido', 'establecer', 'existir', 'sistema', 'informacion', 'reporte', 'incidencia', 'delictivo', 'diariamente', 'sistema', 'existir']


## Bigramas, trigramas y palabras comunes

In [32]:
#Bigramas y trigramas
bigrams_phrases = gensim.models.Phrases(data, min_count = 5, threshold = 50)
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data_words], threshold=50)

bigram=gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return(bigram[doc] for doc in texts)

def make_trigrams(texts):
    return(trigram[bigram[doc]] for doc in texts)

data_bigrams =  list(make_bigrams(data_words))
data_bigrams_trigrams = list(make_trigrams(data_bigrams))


In [33]:
print(data_bigrams_trigrams[0][0:20])

['buen', 'dia', 'terminar', 'reunion', 'seguridad', 'informo', 'obtener', 'informacion', 'delito_cometido', 'establecer', 'existir', 'sistema', 'informacion', 'reporte', 'incidencia_delictivo', 'diariamente', 'sistema', 'existir', 'estadistico', 'tener']


## Eliminación de palabras irrelevantes y/o poco comunes.

Utilizando la técnica TF-IDF (Term Frecuency-Inverse Document Frecuency) utilizando los bigramas y trigramas generados anteriormente, después entrenamos el modelo TF-IDF utilizando este mismo corpus. Definimos un umbral para usarlo como filtro de palabras para eliminar las que tengan una menor puntuación a esta y las que no aparecen en el propio modelo.

In [34]:
#TF-IDF Removal

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]


tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []

for i in range(0,len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i]=new_bow


## Entrenamiento del modelo LDA.

In [37]:
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, 
                                            id2word = id2word, 
                                            num_topics= 14, 
                                            random_state = 100, 
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")

## Visualización

In [38]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = "mmds", R = 30)
vis



In [39]:
lda_model.save('my_lda_model')

## Utilización del modelo LDA.

Una vez entrenado el modelo este se carga para su utilización.

In [6]:
# Load the trained LDA model
lda_model = gensim.models.ldamodel.LdaModel.load('my_lda_model')

# Load the new data
new_data = pd.read_csv('clean_corpus.csv')

# Load the SpaCy model for tokenization
nlp = spacy.load('es_core_news_sm')

In [7]:
topics = lda_model.show_topics(num_topics=15, num_words=10)
for topic in topics:
    print(f"Topic {topic[0]}: {topic[1]}")

Topic 0: 0.033*"aeropuerto_internacional" + 0.026*"minuto" + 0.025*"aifa" + 0.018*"pasajero" + 0.018*"acarreado" + 0.015*"futbol" + 0.015*"club" + 0.012*"terminal" + 0.012*"conectividad" + 0.009*"conectar"
Topic 1: 0.027*"inteligencia" + 0.024*"cartel" + 0.022*"feminicidio" + 0.021*"estadounidense" + 0.011*"fentanilo" + 0.008*"droga" + 0.008*"bloqueado" + 0.007*"aseguramiento" + 0.006*"influyente" + 0.006*"entramado"
Topic 2: 0.011*"nuevo_leon" + 0.008*"tamaulipa" + 0.008*"productor" + 0.008*"epidemia" + 0.008*"coronavirus" + 0.006*"evento" + 0.006*"recaudacion" + 0.005*"virus" + 0.005*"transmision" + 0.005*"enfermedad"
Topic 3: 0.027*"ducto" + 0.013*"zacateco" + 0.013*"barril" + 0.013*"refineria" + 0.008*"pipar" + 0.007*"hidalgo" + 0.007*"robo_combustible" + 0.006*"culpable" + 0.005*"forense" + 0.005*"crudo"
Topic 4: 0.016*"credito" + 0.014*"salario_minimo" + 0.009*"octubre" + 0.009*"descenso" + 0.008*"ejido" + 0.007*"cofepris" + 0.007*"ruben" + 0.006*"narrativo" + 0.006*"hospitalizac

Nombramos cada uno de los topicos acorde al tema según nuestras palabras asociadas a este, creamos una bolsa de palabras e inferimos los topicos de la bolsa de palabras utilizando el modelo LDA anteriormente entrenado con todo el corpus.

In [8]:
# Create a bag of words representation of the preprocessed new corpus
new_corpus = [doc.split() for doc in new_data['text']]
new_bow_corpus = [lda_model.id2word.doc2bow(doc) for doc in new_corpus]

# Infer topics for the new corpus using the trained LDA model
new_topics = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in new_bow_corpus]

topics_dict = {
    1: "Aeropuertos y vuelos",
    2: "Ley y crimen",
    3: "La pandemia del COVID-19 y su impacto",
    4: "La industria petrolera",
    5: "Economía",
    6: "Tren Maya",
    7: "Salud y medicina",
    8: "Reformas políticas",
    9: "Fronteras e inmigración",
    10: "Energía y minería",
    11: "Vacunación",
    12: "Problemas sociales y justicia penal",
    13: "Deportes y turismo",
    14: "Relaciones políticas entre México y Perú"
}

Obtenemos las probabilidades de los 14 topicos que nos dio en cada semana, para después guardar los resultados en un dataframe

In [9]:
# Obtener las probabilidades de los 14 tópicos para cada documento
topic_probs = []
for topics in new_topics:
    probs = [topic[1] for topic in topics]
    topic_probs.append(probs)

# Crear el dataframe de probabilidades de los 14 tópicos
df_probs = pd.DataFrame(topic_probs, columns=topics_dict.values())
df_probs.insert(0, "week", new_data["week"])

In [10]:
df_probs

Unnamed: 0,week,Aeropuertos y vuelos,Ley y crimen,La pandemia del COVID-19 y su impacto,La industria petrolera,Economía,Tren Maya,Salud y medicina,Reformas políticas,Fronteras e inmigración,Energía y minería,Vacunación,Problemas sociales y justicia penal,Deportes y turismo,Relaciones políticas entre México y Perú
0,2018-12-03,0.028691,0.015211,0.015710,0.145398,0.276190,0.032176,0.017541,0.197006,0.010322,0.026354,0.138923,0.044397,0.033343,0.018738
1,2018-12-10,0.525716,0.041977,0.023965,0.096246,0.030492,0.014790,0.009365,0.036887,0.005134,0.053385,0.101162,0.039852,0.020649,0.000382
2,2018-12-24,0.025931,0.011382,0.017638,0.578971,0.033275,0.032946,0.010113,0.071404,0.008078,0.011284,0.076060,0.062460,0.031380,0.029079
3,2018-12-31,0.017002,0.021354,0.031591,0.182798,0.089951,0.038740,0.015976,0.078147,0.015553,0.016371,0.116781,0.329740,0.031036,0.014960
4,2019-01-07,0.030686,0.021270,0.020566,0.574797,0.060565,0.027770,0.015156,0.061348,0.014206,0.023336,0.063180,0.042350,0.023754,0.021016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,2023-02-13,0.039516,0.025875,0.040953,0.058573,0.068454,0.192499,0.023955,0.199830,0.030543,0.023150,0.153272,0.050515,0.062046,0.030819
215,2023-02-20,0.136937,0.030411,0.048812,0.084357,0.055069,0.171948,0.019902,0.121892,0.018386,0.027539,0.118835,0.053104,0.072960,0.039848
216,2023-02-27,0.036833,0.028563,0.051232,0.079745,0.084962,0.154437,0.037653,0.157018,0.023832,0.026523,0.180176,0.041791,0.060989,0.036247
217,2023-03-06,0.020691,0.138175,0.048526,0.073987,0.068486,0.135085,0.028012,0.135631,0.021693,0.059122,0.127059,0.049951,0.060722,0.032861


Creamos un segundo dataframe donde guardemos el tópico y su lista de palabras asociada.

In [11]:
topic_words = []
for topic_num, topic_words_str in lda_model.show_topics():
    words = topic_words_str.split('+')
    word_list = []
    for word in words:
        weight, word = word.split('*')
        word = word.strip().replace('"', '')
        weight = float(weight.strip())
        word_list.append(f"{word}:{weight:.4f}")
    topic_word = {
        'palabra_asociada': topics_dict[topic_num + 1],
        'lista_palabras': word_list
    }
    topic_words.append(topic_word)

# Crear el dataframe de palabras asociadas y lista de palabras para cada tópico
df_words = pd.DataFrame(topic_words)

In [12]:
df_words

Unnamed: 0,palabra_asociada,lista_palabras
0,Fronteras e inmigración,"[centroamerico:0.0100, espanol:0.0080, cancill..."
1,Salud y medicina,"[medicamento:0.0250, avion:0.0150, hospital:0...."
2,La pandemia del COVID-19 y su impacto,"[nuevo_leon:0.0110, tamaulipa:0.0080, producto..."
3,Ley y crimen,"[inteligencia:0.0270, cartel:0.0240, feminicid..."
4,Aeropuertos y vuelos,"[aeropuerto_internacional:0.0330, minuto:0.026..."
5,Relaciones políticas entre México y Perú,"[peru:0.0600, asilo:0.0210, noviembre:0.0200, ..."
6,Problemas sociales y justicia penal,"[penal:0.0160, medicina_tradicional:0.0160, is..."
7,Tren Maya,"[tramo:0.0350, maya:0.0340, fentanilo:0.0250, ..."
8,Reformas políticas,"[bidir:0.0240, marchar:0.0180, marcha:0.0150, ..."
9,Vacunación,"[vacunacion:0.0470, dosis:0.0400, vacunar:0.03..."


Guardamos estos dos ultimos dataframes creados para su futura utilización

In [64]:
df_probs.to_csv('probs_topics.csv', index=False)
df_words.to_csv('words_topics.csv', index=False)