# Punto 1
## Automatic identification of “trending/viral” topics around COVID in time in the World using deep learning architectures. Type: Different approaches.

In [1]:
import pickle as pk
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words

## Files used
The datasets and the embeddings generated with a pretrained Sentence-BERT model.

In [None]:
# Root folder for every data file. Leave empty to use paths relative to the notebook,
# or switch to the commented value when the data lives under a mounted drive.
#prefix = './drive/MyDrive/'
prefix = ''

# Tweet datasets (one JSON object per line), stored in one folder per language.
# The Spanish and Italian folders were previously swapped, unlike the English path.
spanish_path = prefix + 'datasets/spanish/spanish_out_hash.json'
italian_path = prefix + 'datasets/italian/italian_out_hash.json'
english_path = prefix + 'datasets/english/english_out_hash.json'

# Pickled sentence embeddings, one file per language.
italian_embeddings_path = prefix + 'models_hpc/embeddings_ita_hash.pickle'
spanish_embeddings_path = prefix + 'models_hpc/embeddings_es_hash.pickle'
english_embeddings_path = prefix + 'models_hpc/embeddings_en_hash.pickle'


In [3]:
'''
------------------------ENGLISH---------------------------

------------------------ENGLISH---------------------------

------------------------ENGLISH---------------------------
'''

'\n------------------------ENGLISH---------------------------\n\n------------------------ENGLISH---------------------------\n\n------------------------ENGLISH---------------------------\n'

## English

In [4]:
# Read the English tweets; the file stores one JSON object per line.
# The context manager closes the handle as soon as every line has been parsed.
with open(english_path, 'r') as file:
    data = [json.loads(line) for line in file]
text_raw_df = pd.json_normalize(data)

print(text_raw_df.shape)
text_raw_df.head(10)

(10000, 4)


Unnamed: 0,id,publication_date,source,text
0,1295929115770593287,1597809000.0,twitter,Info Source:
1,1296738518216011777,1598002000.0,twitter,#PostponeJEE_NEETSept #ProtestAgainstExamsInCO...
2,1252450676015198210,1587442000.0,twitter,Coronavirus-spreader Chris Cuomo got a lecture...
3,1380684968880406528,1618016000.0,twitter,Any military member that refuses to get vaccin...
4,1368958702150156290,1615220000.0,twitter,#Covid19 is staying around for a while. your ...
5,1317169175203401735,1602873000.0,twitter,LIES!!! LIES!!!
6,1288154256449708032,1595955000.0,twitter,"@GregMannarino Deborah BirxWhite House ""Expert"""
7,1283867253222502400,1594933000.0,twitter,Kayleigh McEnany: ‘Science Should Not Stand in...
8,1286909343909240832,1595658000.0,twitter,Amazing effort from the guys! Please donate if...
9,1235895985009811461,1583496000.0,twitter,@SulaiOdus They said it was suspended due to c...


In [5]:
'''
Turn text to Numpy Array
'''
# Select the tweet bodies, keep their underlying array, and peek at the first entry.
texts_column = text_raw_df['text']
raw_texts = texts_column.values
raw_texts[0]

'Info Source: '

In [6]:
# Load the precomputed sentence embeddings for the English tweets.
# NOTE(review): pickle.load can run arbitrary code; only open pickle files from a trusted source.
with open(english_embeddings_path, "rb") as embeddings_file:
    embeddings = pk.load(embeddings_file)

### UMAP para reducir la dimensionalidad de los embeddings
#### Esto permite una mejor clasificación para el clustering

In [None]:
import umap.umap_ as umap

# Project the sentence embeddings down to 5 dimensions before clustering.
# random_state pins UMAP's stochastic optimisation so that a re-run yields the same
# projection and therefore the same clusters; without it every execution differs.
umap_embeddings = umap.UMAP(n_neighbors=30,
                            n_components=5,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

### HDBSCAN para el clustering

In [None]:
import hdbscan

# Cluster the UMAP-reduced vectors; a cluster needs at least 60 members.
# Points that end up with label -1 are handled as outliers by the later cells.
cluster = hdbscan.HDBSCAN(
    min_cluster_size=60,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster = cluster.fit(umap_embeddings)

### Visualización de los clusters

In [None]:
import matplotlib.pyplot as plt

# Prepare data: a separate 2-D projection used only for plotting; the colours come from
# the labels of the fitted `cluster` object. random_state keeps the picture stable across re-runs.
umap_data = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.0, metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: label -1 points in grey, every other point coloured by its label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(points, ax=ax, label='Cluster label')
# Trailing semicolon keeps the return value of ax.set out of the cell output.
ax.set(title='English tweets: HDBSCAN clusters in 2-D UMAP space', xlabel='UMAP 1', ylabel='UMAP 2');

### c-TF-IDF

In [None]:
# One row per tweet, tagged with its cluster label and a running document id.
docs_df = pd.DataFrame(raw_texts, columns=["Doc"]).assign(
    Topic=cluster.labels_,
    Doc_ID=range(len(raw_texts)),
)
# Join all tweets that share a label into a single space-separated document per topic.
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [None]:
def c_tf_idf(documents, m, ngram_range=(1, 1), language='english'):
    """Compute class-based TF-IDF scores, treating each merged topic document as one class.

    Args:
        documents: sequence of strings, one per topic (all texts of that topic joined together).
        m: total number of individual texts before merging; numerator of the IDF term.
        ngram_range: (min_n, max_n) forwarded to CountVectorizer.
        language: language name forwarded to get_stop_words to select the stop-word list.

    Returns:
        (tf_idf, count): tf_idf is an array of shape (n_terms, n_topics); count is the
        fitted CountVectorizer that holds the vocabulary.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words=get_stop_words(language)).fit(documents)
    t = count.transform(documents).toarray()  # (n_topics, n_terms) raw term counts
    w = t.sum(axis=1)                         # number of counted terms per topic
    # A topic whose text holds no vocabulary term has w == 0. Divide by 1 there so its
    # scores are 0 instead of NaN (argsort places NaN last, i.e. among the "top" words).
    w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)                     # frequency of each term over all topics
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(raw_texts), language='english')

In [None]:

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return, for every topic label, its n highest-scoring terms in descending score order.

    Args:
        tf_idf: 2-D score array indexed as [term, topic].
        count: fitted vectorizer that exposes the vocabulary in tf_idf row order.
        docs_per_topic: DataFrame with a `Topic` column; its row order gives the topic
            label of each tf_idf column.
        n: number of terms to keep per topic.

    Returns:
        dict mapping topic label -> list of (term, score) tuples, best term first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back to the old name so older installations keep working.
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n positions of each row are the best n terms.
    top_indices = scores_by_topic.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i][::-1]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic and list the topics from largest to smallest.

    Args:
        df: DataFrame with a `Topic` column and a `Doc` column.

    Returns:
        DataFrame with columns `Topic` and `Size`, sorted by `Size` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index()
    sizes = sizes.rename(columns={"Topic": "Topic", "Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Top 20 terms per topic, then the topic sizes; show the ten largest topics.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

In [None]:
# Print the ten best terms of each topic, largest topic first; label -1 is skipped.
for topic in topic_sizes['Topic']:
    if topic != -1:
        best_terms = [entry[0] for entry in top_n_words[topic][:10]]
        print(str(best_terms) + '\n')



In [None]:
'''
------------------------SPANISH---------------------------

------------------------SPANISH---------------------------

------------------------SPANISH---------------------------
'''

## SPANISH

In [None]:
# Read the Spanish tweets; the file stores one JSON object per line.
# The context manager closes the handle as soon as every line has been parsed.
with open(spanish_path, 'r') as file:
    data = [json.loads(line) for line in file]
text_raw_df = pd.json_normalize(data)

print(text_raw_df.shape)
text_raw_df.head(10)

In [None]:
'''
Turn text to Numpy Array
'''
# Select the tweet bodies, keep their underlying array, and peek at the first entry.
texts_column = text_raw_df['text']
raw_texts = texts_column.values
raw_texts[0]

In [None]:
# Load the precomputed sentence embeddings for the Spanish tweets.
# NOTE(review): pickle.load can run arbitrary code; only open pickle files from a trusted source.
with open(spanish_embeddings_path, "rb") as embeddings_file:
    embeddings = pk.load(embeddings_file)

### UMAP para reducir la dimensionalidad de los embeddings
#### Esto permite una mejor clasificación para el clustering

In [None]:
import umap.umap_ as umap

# Project the sentence embeddings down to 5 dimensions before clustering.
# random_state pins UMAP's stochastic optimisation so that a re-run yields the same
# projection and therefore the same clusters; without it every execution differs.
umap_embeddings = umap.UMAP(n_neighbors=30,
                            n_components=5,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

### HDBSCAN para el clustering

In [None]:
import hdbscan

# Cluster the UMAP-reduced vectors; a cluster needs at least 60 members.
# Points that end up with label -1 are handled as outliers by the later cells.
cluster = hdbscan.HDBSCAN(
    min_cluster_size=60,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster = cluster.fit(umap_embeddings)

### Visualización de los clusters

In [None]:
import matplotlib.pyplot as plt

# Prepare data: a separate 2-D projection used only for plotting; the colours come from
# the labels of the fitted `cluster` object. random_state keeps the picture stable across re-runs.
umap_data = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.0, metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: label -1 points in grey, every other point coloured by its label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(points, ax=ax, label='Cluster label')
# Trailing semicolon keeps the return value of ax.set out of the cell output.
ax.set(title='Spanish tweets: HDBSCAN clusters in 2-D UMAP space', xlabel='UMAP 1', ylabel='UMAP 2');

### c-TF-IDF

In [None]:
# One row per tweet, tagged with its cluster label and a running document id.
docs_df = pd.DataFrame(raw_texts, columns=["Doc"]).assign(
    Topic=cluster.labels_,
    Doc_ID=range(len(raw_texts)),
)
# Join all tweets that share a label into a single space-separated document per topic.
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [None]:
def c_tf_idf(documents, m, ngram_range=(1, 1), language='spanish'):
    """Compute class-based TF-IDF scores, treating each merged topic document as one class.

    Args:
        documents: sequence of strings, one per topic (all texts of that topic joined together).
        m: total number of individual texts before merging; numerator of the IDF term.
        ngram_range: (min_n, max_n) forwarded to CountVectorizer.
        language: language name forwarded to get_stop_words to select the stop-word list.

    Returns:
        (tf_idf, count): tf_idf is an array of shape (n_terms, n_topics); count is the
        fitted CountVectorizer that holds the vocabulary.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words=get_stop_words(language)).fit(documents)
    t = count.transform(documents).toarray()  # (n_topics, n_terms) raw term counts
    w = t.sum(axis=1)                         # number of counted terms per topic
    # A topic whose text holds no vocabulary term has w == 0. Divide by 1 there so its
    # scores are 0 instead of NaN (argsort places NaN last, i.e. among the "top" words).
    w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)                     # frequency of each term over all topics
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(raw_texts), language='spanish')

In [None]:

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return, for every topic label, its n highest-scoring terms in descending score order.

    Args:
        tf_idf: 2-D score array indexed as [term, topic].
        count: fitted vectorizer that exposes the vocabulary in tf_idf row order.
        docs_per_topic: DataFrame with a `Topic` column; its row order gives the topic
            label of each tf_idf column.
        n: number of terms to keep per topic.

    Returns:
        dict mapping topic label -> list of (term, score) tuples, best term first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back to the old name so older installations keep working.
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n positions of each row are the best n terms.
    top_indices = scores_by_topic.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i][::-1]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic and list the topics from largest to smallest.

    Args:
        df: DataFrame with a `Topic` column and a `Doc` column.

    Returns:
        DataFrame with columns `Topic` and `Size`, sorted by `Size` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index()
    sizes = sizes.rename(columns={"Topic": "Topic", "Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Top 20 terms per topic, then the topic sizes; show the ten largest topics.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

In [None]:
# Print the ten best terms of each topic, largest topic first; label -1 is skipped.
for topic in topic_sizes['Topic']:
    if topic != -1:
        best_terms = [entry[0] for entry in top_n_words[topic][:10]]
        print(str(best_terms) + '\n')


In [None]:
'''
------------------------ITALIAN---------------------------

------------------------ITALIAN---------------------------

------------------------ITALIAN---------------------------
'''

## ITALIAN

In [None]:
# Read the Italian tweets; the file stores one JSON object per line.
# The context manager closes the handle as soon as every line has been parsed.
with open(italian_path, 'r') as file:
    data = [json.loads(line) for line in file]
text_raw_df = pd.json_normalize(data)

print(text_raw_df.shape)
text_raw_df.head(10)

In [None]:
'''
Turn text to Numpy Array
'''
# Select the tweet bodies, keep their underlying array, and peek at the first entry.
texts_column = text_raw_df['text']
raw_texts = texts_column.values
raw_texts[0]

In [None]:
# Load the precomputed sentence embeddings for the Italian tweets.
# NOTE(review): pickle.load can run arbitrary code; only open pickle files from a trusted source.
with open(italian_embeddings_path, "rb") as embeddings_file:
    embeddings = pk.load(embeddings_file)

### UMAP para reducir la dimensionalidad de los embeddings
#### Esto permite una mejor clasificación para el clustering

In [None]:
import umap.umap_ as umap

# Project the sentence embeddings down to 5 dimensions before clustering.
# random_state pins UMAP's stochastic optimisation so that a re-run yields the same
# projection and therefore the same clusters; without it every execution differs.
umap_embeddings = umap.UMAP(n_neighbors=30,
                            n_components=5,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

### HDBSCAN para el clustering

In [None]:
import hdbscan

# Cluster the UMAP-reduced vectors; a cluster needs at least 60 members.
# Points that end up with label -1 are handled as outliers by the later cells.
cluster = hdbscan.HDBSCAN(
    min_cluster_size=60,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster = cluster.fit(umap_embeddings)

### Visualización de los clusters

In [None]:
import matplotlib.pyplot as plt

# Prepare data: a separate 2-D projection used only for plotting; the colours come from
# the labels of the fitted `cluster` object. random_state keeps the picture stable across re-runs.
umap_data = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.0, metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: label -1 points in grey, every other point coloured by its label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(points, ax=ax, label='Cluster label')
# Trailing semicolon keeps the return value of ax.set out of the cell output.
ax.set(title='Italian tweets: HDBSCAN clusters in 2-D UMAP space', xlabel='UMAP 1', ylabel='UMAP 2');

### c-TF-IDF

In [None]:
# One row per tweet, tagged with its cluster label and a running document id.
docs_df = pd.DataFrame(raw_texts, columns=["Doc"]).assign(
    Topic=cluster.labels_,
    Doc_ID=range(len(raw_texts)),
)
# Join all tweets that share a label into a single space-separated document per topic.
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [None]:
def c_tf_idf(documents, m, ngram_range=(1, 1), language='italian'):
    """Compute class-based TF-IDF scores, treating each merged topic document as one class.

    Args:
        documents: sequence of strings, one per topic (all texts of that topic joined together).
        m: total number of individual texts before merging; numerator of the IDF term.
        ngram_range: (min_n, max_n) forwarded to CountVectorizer.
        language: language name forwarded to get_stop_words to select the stop-word list.

    Returns:
        (tf_idf, count): tf_idf is an array of shape (n_terms, n_topics); count is the
        fitted CountVectorizer that holds the vocabulary.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words=get_stop_words(language)).fit(documents)
    t = count.transform(documents).toarray()  # (n_topics, n_terms) raw term counts
    w = t.sum(axis=1)                         # number of counted terms per topic
    # A topic whose text holds no vocabulary term has w == 0. Divide by 1 there so its
    # scores are 0 instead of NaN (argsort places NaN last, i.e. among the "top" words).
    w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)                     # frequency of each term over all topics
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(raw_texts), language='italian')

In [None]:

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return, for every topic label, its n highest-scoring terms in descending score order.

    Args:
        tf_idf: 2-D score array indexed as [term, topic].
        count: fitted vectorizer that exposes the vocabulary in tf_idf row order.
        docs_per_topic: DataFrame with a `Topic` column; its row order gives the topic
            label of each tf_idf column.
        n: number of terms to keep per topic.

    Returns:
        dict mapping topic label -> list of (term, score) tuples, best term first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back to the old name so older installations keep working.
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n positions of each row are the best n terms.
    top_indices = scores_by_topic.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i][::-1]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic and list the topics from largest to smallest.

    Args:
        df: DataFrame with a `Topic` column and a `Doc` column.

    Returns:
        DataFrame with columns `Topic` and `Size`, sorted by `Size` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index()
    sizes = sizes.rename(columns={"Topic": "Topic", "Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Top 20 terms per topic, then the topic sizes; show the ten largest topics.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

In [None]:
# Print the ten best terms of each topic, largest topic first; label -1 is skipped.
for topic in topic_sizes['Topic']:
    if topic != -1:
        best_terms = [entry[0] for entry in top_n_words[topic][:10]]
        print(str(best_terms) + '\n')
