# Punto 1 - Italian
## Automatic identification of “trending/viral” topics around COVID in time in the World using deep learning architectures. Type: Different approaches.

In [1]:
import pickle as pk
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
import umap.umap_ as umap


## Files used
The datasets and the embeddings generated with a pretrained Sentence-BERT model.

In [2]:
# Common root for every input file. Leave it empty to resolve paths relative to
# the notebook; the commented value looks like a mounted Google Drive folder
# (presumably for Colab runs — confirm before switching).
#prefix = './drive/MyDrive/'
prefix = ''

# Italian dataset, read below as one JSON document per line.
italian_path = f'{prefix}datasets/italian/italian_out_hash_bien.json'
# Pickled embeddings of the texts (generated with a pretrained Sentence-BERT model).
italian_embeddings_path = f'{prefix}models_hpc/embeddings_ita_v2.pickle'
# Pickled UMAP embeddings (presumably the reduced version of the ones above).
umap_italian_embeddings_path = f'{prefix}models_hpc/umap_ita_v2.pickle'

In [3]:
# Read the JSON-lines dump: every non-empty line of the file is one JSON document.
# - The context manager closes the handle (the original left it open).
# - The explicit encoding keeps the accented Italian text independent of the
#   platform's default codec (assumes the dump is UTF-8, as JSON normally is).
# - Blank lines are skipped so a trailing newline cannot crash json.loads.
with open(italian_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Flatten the parsed records into a table with one row per document.
text_raw_df = pd.json_normalize(data)

print(text_raw_df.shape)
text_raw_df.head(10)

(814485, 4)


Unnamed: 0,id,publication_date,source,text
0,1324811856859533313,1604695000.0,twitter,Il prof. #Galli è molto preoccupato da quanto ...
1,1288746494368124928,1596096000.0,twitter,Il COVID-19 coincide con un momento cruciale n...
2,1326191597785059329,1605024000.0,twitter,Muoviamoci #vaccinoCovid #coronavirus #COVID19...
3,1275134048722132993,1592851000.0,twitter,@mosllerdd @Cartabellotta Si sarebbero potute ...
4,1243118079091118080,1585217000.0,twitter,@morzo6 @CNN Purtroppo sì. State a casa. Disin...
5,1267645840455278594,1591065000.0,twitter,"@orticArya * ""il coronavirus non c'è più, ma c..."
6,1233340720460726272,1582886000.0,twitter,"Coronavirus, l’ultima trovata: mascherine grif..."
7,1279712208612667393,1593942000.0,twitter,"""Penne in quarantena"". Tredici #racconti di al..."
8,1271709738301095936,1592034000.0,twitter,"Cina, torna l’incubo Coronavirus: alcuni quart..."
9,1296853994686513152,1598029000.0,twitter,Questo è il punto


In [4]:
# Pull the text of every document out of the DataFrame as a NumPy array,
# then peek at the first entry as a sanity check.
texts_column = text_raw_df['text']
raw_texts = texts_column.values
raw_texts[0]

'Il prof. #Galli è molto preoccupato da quanto sta accadendo a #Milano '

In [6]:
# Load the pre-computed UMAP embeddings from disk.
# The embeddings stored at italian_embeddings_path are not loaded here; they can
# be read the same way if needed:
#     with open(italian_embeddings_path, "rb") as embeddings_file:
#         embeddings = pk.load(embeddings_file)
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
with open(umap_italian_embeddings_path, "rb") as umap_file:
    umap_embeddings = pk.load(umap_file)

### HDBSCAN para el clustering

In [None]:
import hdbscan

# Cluster the UMAP embeddings with HDBSCAN. The fitted object exposes one label
# per input row in `cluster.labels_`; the label -1 is treated as "outlier"
# by the cells below.
clusterer_params = dict(
    min_cluster_size=70,
    min_samples=500,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster = hdbscan.HDBSCAN(**clusterer_params).fit(umap_embeddings)

### Visualización de los clusters

In [None]:
import matplotlib.pyplot as plt

# Project the embeddings down to two dimensions purely for plotting.
# NOTE(review): no random_state is passed to UMAP, so the 2D layout is expected
# to change between runs; set random_state=<int> if the figure must be reproducible.
umap_data = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.0, metric='cosine').fit_transform(umap_embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Split the points: label -1 marks the documents left outside every cluster.
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]

# Draw on the axes created here rather than on pyplot's implicit current axes
# (the original created `ax` but never used it), and label the figure so it can
# be read on its own.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
clustered_points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(clustered_points, ax=ax, label='Cluster')
ax.set_title('Clusters HDBSCAN sobre la proyección UMAP 2D')
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')
plt.show()

### c-TF-IDF

In [18]:
# One row per document: its text, the cluster label it received (used as the
# topic id) and a running document id.
docs_df = pd.DataFrame(raw_texts, columns=["Doc"]).assign(
    Topic=cluster.labels_,
    Doc_ID=range(len(raw_texts)),
)

# Concatenate all documents of a topic into a single space-separated string,
# giving one "mega document" per topic.
docs_per_topic = docs_df.groupby('Topic', as_index=False).agg(Doc=('Doc', ' '.join))

In [19]:
def c_tf_idf(documents, m, ngram_range=(1, 1), language='italian'):
    """Compute class-based TF-IDF scores for a collection of documents.

    Each entry of ``documents`` is treated as one class; in this notebook it is
    the concatenation of all texts that belong to one topic.

    Args:
        documents: Sequence of strings, one per class/topic.
        m: Numerator of the IDF term. The caller in this notebook passes the
            total number of original (un-merged) documents.
        ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``.
        language: Language name forwarded to ``get_stop_words``. Defaults to
            ``'italian'``, which was previously hard-coded.

    Returns:
        A tuple ``(tf_idf, count)`` where ``tf_idf`` is a dense array with one
        row per vocabulary term and one column per entry of ``documents``, and
        ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words=get_stop_words(language)).fit(documents)
    # Dense term counts, one row per document and one column per vocabulary term.
    t = count.transform(documents).toarray()
    # Total number of counted tokens in each document.
    w = t.sum(axis=1)
    # A document made only of stop words has w == 0. Its counts are all zero, so
    # dividing by 1 instead yields a term frequency of 0 rather than NaN.
    safe_w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, safe_w)
    # Frequency of each term across all documents; never zero for a fitted term.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

# Score the vocabulary against the per-topic merged documents; m is the number
# of original documents before merging.
topic_documents = docs_per_topic["Doc"].values
tf_idf, count = c_tf_idf(topic_documents, m=len(raw_texts))

In [20]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Args:
        tf_idf: 2D array of scores with one row per vocabulary term and one
            column per topic, in the same order as the rows of ``docs_per_topic``.
        count: Fitted vectorizer that provides the vocabulary through
            ``get_feature_names_out()`` (or the legacy ``get_feature_names()``).
        docs_per_topic: DataFrame with a ``Topic`` column holding the topic labels.
        n: Number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples
        sorted from highest to lowest score.
    """
    # get_feature_names() was removed from scikit-learn's vectorizers in favour
    # of get_feature_names_out(); prefer the new name, fall back to the old one.
    if hasattr(count, "get_feature_names_out"):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)

    # One row per topic, one column per vocabulary term.
    scores_per_topic = tf_idf.T
    # argsort is ascending: keep the last n columns and flip them so that the
    # best-scoring term comes first.
    top_indices = scores_per_topic.argsort()[:, -n:][:, ::-1]

    top_n_words = {}
    for row, label in enumerate(labels):
        top_n_words[label] = [(words[j], scores_per_topic[row][j]) for j in top_indices[row]]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of each topic, largest topic first.

    Args:
        df: DataFrame with a ``Topic`` column and a ``Doc`` column.

    Returns:
        DataFrame with columns ``Topic`` and ``Size`` (number of non-null
        ``Doc`` values per topic), sorted by ``Size`` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    # Turn the topic index back into a column and name the counts "Size".
    sizes = docs_per_label.reset_index(name="Size")
    return sizes.sort_values("Size", ascending=False)

# Rank the vocabulary of every topic (20 words each) and count how many
# documents each topic holds; show the ten largest topics.
top_n_words = extract_top_n_words_per_topic(
    tf_idf=tf_idf,
    count=count,
    docs_per_topic=docs_per_topic,
    n=20,
)
topic_sizes = extract_topic_sizes(df=docs_df)
topic_sizes.head(10)

Unnamed: 0,Topic,Size
0,-1,557212
104,103,11999
333,332,11279
279,278,7076
210,209,6888
360,359,6745
367,366,5424
370,369,5203
160,159,5065
68,67,4926


In [None]:
# Number of rows in topic_sizes, i.e. distinct topic labels (the outlier
# label -1 counts as one of them when present).
topic_sizes.shape[0]

In [11]:
# Print the ten best-scoring words of every topic, in the order given by
# topic_sizes; the outlier label -1 is skipped.
for topic in topic_sizes['Topic']:
    if topic != -1:
        top_words = [word for word, _score in top_n_words[topic][:10]]
        print(f'Topic #{topic}:\n')
        print(f'{top_words}\n')


Topic #154:

['cina', 'cinese', 'cinesi', 'coronavirus', 'pechino', 'wuhan', 'china', 'italia', 'virus', 'oms']

Topic #221:

['scuole', 'scuola', 'chiuse', 'coronavirus', 'studenti', 'azzolina', 'chiusura', 'università', 'lezioni', 'classe']

Topic #391:

['calcio', 'serie', 'sport', 'coronavirus', 'campionato', 'seriea', 'positivo', 'inter', 'giocatori', 'porte']

Topic #408:

['vaccino', 'vaccini', 'covid19', 'covid', '19', 'anti', 'vaccine', 'vaccinazione', 'vaccinoanticovid', 'vaccinocovid']

Topic #410:

['coronavirus', 'conte', 'salvini', 'governo', 'emergenza', 'misure', 'italia', 'decreto', 'via', 'speranza']

Topic #356:

['covid19', '19', 'covid', 'solo', 'cosa', 'covid_19', 'fatto', 'esiste', 'prima', 'quando']

Topic #461:

['coronavirus', 'matteosalvinimi', 'solo', 'prima', 'fatto', 'poi', 'già', 'sempre', 'fa', 'ora']

Topic #459:

['morti', 'nuovi', 'casi', 'bollettino', 'italia', 'oggi', '24', 'contagi', 'ultime', 'coronavirus']

Topic #132:

['trump', 'usa', 'biden', 

372

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime  # no longer used in this cell; kept in case other cells rely on the name

# Convert the epoch-second timestamps in one vectorised call. Unlike the
# previous row-by-row datetime.fromtimestamp, the result does not depend on the
# local timezone of the machine (values are interpreted as UTC), so the monthly
# buckets are the same wherever the notebook runs.
text_raw_df['dates'] = pd.to_datetime(text_raw_df['publication_date'], unit='s')
text_raw_df['Topic'] = cluster.labels_

# Number of documents per (topic, year, month). This does not depend on the
# topic being plotted, so it is computed once instead of once per iteration.
monthly_counts = text_raw_df.groupby(
    [text_raw_df['Topic'], text_raw_df['dates'].dt.year, text_raw_df['dates'].dt.month]
)['id'].count()

# One bar chart per topic label in topic_sizes (the outlier label -1 included).
for num_topic in topic_sizes['Topic']:
    fig, ax = plt.subplots()
    # Selecting on the first index level leaves a (year, month)-indexed series.
    monthly_counts.loc[num_topic].plot(kind="bar", ax=ax)
    # Pass the flag positionally: the `b=` keyword was removed from newer
    # Matplotlib releases (renamed to `visible=`).
    ax.grid(True, axis='both')
    # The dataset is Italian; the previous title said "en inglés".
    ax.set_title('Distribución del Topic ' + str(num_topic) + ' en los documentos en italiano')
    ax.set_xlabel('Mes y año de los tweets')
    ax.set_ylabel('Número de tweets del topic')
    plt.show()
    # Release the figure so hundreds of topics do not pile up in memory.
    plt.close(fig)

