# Understand the different clusters generated

In [1]:
# Load the cluster label assigned to each instance (labels take the values 0-3).
# NOTE(review): the file name suggests these labels come from a K-Means run; confirm
# against the notebook/script that produced the .npy file.
# NOTE(review): the absolute path ties this cell to one environment.
import numpy as np
clusters_labels = np.load('/home/jovyan/work/RestMex23/Rest_Mex-2023_Track_Clustering/labels_spacy/labels_Kmeans_neig10_comp5.npy')

In [8]:
# Load the preprocessed dataset (per the original note: punctuation removed, decoded,
# lemmatized, etc.) and print its first rows as a quick sanity check.
# The CSV carries an "Unnamed: 0" column and an "ID" column next to the "News" text.
# NOTE(review): the absolute path ties this cell to one environment.
import pandas as pd
df = pd.read_csv('/home/jovyan/work/RestMex23/Rest_Mex-2023_Track_Clustering/df_spacymd.csv')
print(df.head())

   Unnamed: 0  ID                                               News
0           0   0  28 marzo 2022 firma alianza estrategico uanl o...
1           1   1  sección amenazar tiroteo escuela san nicolas n...
2           2   2  amenazar tiroteo secundario nl movilizar autor...
3           3   3  amenazar tiroteo alarmar papas secundario 17 s...
4           4   4  tendencia mercado foto shutterstock lunes 28 m...


In [9]:
# Keep only the text column: "ID" and "Unnamed: 0" are not used by the analysis below.
# errors="ignore" makes the cell idempotent; because `df` is overwritten here, a second
# run would otherwise raise KeyError on the already-removed columns.
df = df.drop(columns=["ID", "Unnamed: 0"], errors="ignore")

In [18]:
# Work on a copy of `df` whose "News" entries are all coerced to str
# (non-string values such as missing entries become their str() form).
docs_df = df.assign(News=df["News"].map(str))

In [19]:
# Display the working copy as the cell output (pandas shows a truncated view).
docs_df

Unnamed: 0,News
0,28 marzo 2022 firma alianza estrategico uanl o...
1,sección amenazar tiroteo escuela san nicolas n...
2,amenazar tiroteo secundario nl movilizar autor...
3,amenazar tiroteo alarmar papas secundario 17 s...
4,tendencia mercado foto shutterstock lunes 28 m...
...,...
114545,acerca cooki sitio utilizamos cooki personaliz...
114546,construir duna costero previo autorizacion sec...
114547,historia leonel reina interpretado ricardo aba...
114548,cantante infantil tatián encabezar programa mo...


In [26]:
# Build one row per document: its text ("Doc"), the inferred cluster label ("Topic")
# and a positional identifier ("Doc_ID").
docs_df = (
    pd.DataFrame(df['News'].tolist(), columns=["Doc"])
    .assign(
        Topic=clusters_labels,
        Doc_ID=lambda frame: range(len(frame)),
    )
)

In [29]:
# Display the frame: one row per document with its text (Doc), cluster label (Topic)
# and positional id (Doc_ID).
docs_df

Unnamed: 0,Doc,Topic,Doc_ID
0,28 marzo 2022 firma alianza estrategico uanl o...,2,0
1,sección amenazar tiroteo escuela san nicolas n...,3,1
2,amenazar tiroteo secundario nl movilizar autor...,3,2
3,amenazar tiroteo alarmar papas secundario 17 s...,3,3
4,tendencia mercado foto shutterstock lunes 28 m...,2,4
...,...,...,...
114545,acerca cooki sitio utilizamos cooki personaliz...,1,114545
114546,construir duna costero previo autorizacion sec...,2,114546
114547,historia leonel reina interpretado ricardo aba...,0,114547
114548,cantante infantil tatián encabezar programa mo...,0,114548


# TF-IDF model

In [34]:
import pandas as pd

# One row per document: stringified text, cluster label and positional id.
docs_df = (
    pd.DataFrame(df['News'].map(str).tolist(), columns=["Doc"])
    .assign(
        Topic=clusters_labels,
        Doc_ID=lambda frame: range(len(frame)),
    )
)

# Concatenate all documents of a topic into one space-separated string per topic.
docs_per_topic = (
    docs_df
    .groupby(['Topic'], as_index = False)
    .agg({'Doc': ' '.join})
)

In [38]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1), stop_words="english"):
    """Compute class-based TF-IDF scores for a collection of per-topic documents.

    Every entry of ``documents`` is treated as one class (here: all texts of a
    topic joined into a single string). Term counts are normalised by the total
    token count of their document and weighted by ``log(m / total_term_count)``.

    Parameters
    ----------
    documents : iterable of str
        One string per topic.
    m : int
        Numerator of the IDF ratio; the caller in this notebook passes the
        number of labelled instances.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.
    stop_words : default "english"
        Forwarded to ``CountVectorizer``. The default is kept for backward
        compatibility. The corpus in this notebook is Spanish, so the English
        list does not remove Spanish function words; pass a list of Spanish
        stop words to filter them.

    Returns
    -------
    tf_idf : numpy.ndarray of shape (n_terms, n_documents)
        Score of every vocabulary term for every document.
    count : CountVectorizer
        The vectorizer fitted on ``documents`` (gives access to the vocabulary).
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(documents)
    t = count.transform(documents).toarray()

    # Total number of counted tokens in each document.
    w = t.sum(axis=1)

    # Term frequency per document. A document with no counted tokens would give
    # 0/0 = NaN and corrupt any later ranking, so its frequencies are left at 0.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)

    # Every vocabulary term occurs at least once in `documents`, so sum_t > 0.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
# Score the per-topic concatenated documents; m is the number of labelled instances
# (the length of clusters_labels).
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(clusters_labels))

In [39]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Parameters
    ----------
    tf_idf : array of shape (n_terms, n_topics)
        Score matrix; column ``i`` belongs to the ``i``-th entry of
        ``docs_per_topic.Topic``.
    count : object exposing ``get_feature_names_out()``
        Supplies the vocabulary, indexed like the rows of ``tf_idf``.
    docs_per_topic : object with a ``Topic`` attribute
        Topic labels, in the same order as the columns of ``tf_idf``.
    n : int, default 20
        Number of words kept per topic.

    Returns
    -------
    dict
        Maps each topic label to a list of ``(word, score)`` tuples ordered
        from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T

    # argsort is ascending, so the last n columns hold the best-scoring terms.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        ranked = [(vocabulary[col], scores_by_topic[row][col]) for col in best_columns[row]]
        ranked.reverse()  # highest score first
        top_n_words[topic] = ranked
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic, largest topic first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Topic`` column and a ``Doc`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic`` and ``Size`` (number of non-null ``Doc`` entries per
        topic), sorted by ``Size`` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])["Doc"].count()
    sizes = docs_per_label.reset_index()
    sizes = sizes.rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Rank the 20 highest-scoring words of every topic.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

# Count documents per topic, then preview the largest topics as the cell output.
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

Unnamed: 0,Topic,Size
3,3,45091
2,2,31441
0,0,22023
1,1,15995


# Most common words per cluster

In [41]:
# Show the top-scoring words (with their scores) for every cluster.
top_n_words

{0: [('escrito', 0.0031776270268811893),
  ('por', 0.002866297305800841),
  ('nota', 0.002648821119025581),
  ('publicado', 0.0025479616410024808),
  ('notificación', 0.002536468272579652),
  ('actriz', 0.002300699795688645),
  ('cantante', 0.0020015843253430553),
  ('instagram', 0.0020013393383514506),
  ('compartir', 0.0019925841766464196),
  ('video', 0.0019819187018973853),
  ('actor', 0.0019750202003005262),
  ('minuto', 0.0019421704896314669),
  ('debate', 0.0019297807394859342),
  ('amor', 0.0019113414518992893),
  ('historia', 0.0018335172537408575),
  ('yo', 0.0018207845473598547),
  ('azteco', 0.0018197315453195987),
  ('mundo', 0.0017779640640272305),
  ('azteca', 0.0017763536850371946),
  ('siempre', 0.0017717556396960382)],
 1: [('cooki', 0.004481418497399702),
  ('personalizar', 0.0033974771976245134),
  ('utilizamos', 0.0033953436208281193),
  ('sol', 0.0033913288924845753),
  ('optimizar', 0.003382895656647558),
  ('clic', 0.003370072845716296),
  ('acordar', 0.00315591