In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import util

In [3]:
# One pickle per sentence encoder (the encoder is named in the file name).
# Exactly one is loaded; the others can be swapped in by changing the path:
#   'en-emd-paraphrase-distilroberta-base-v2.pkl'
#   'en-emd-paraphrase-mpnet-base-v2.pkl'
#   'en-emd-paraphrase-MiniLM-L6-v2.pkl'
#   'en-emd-nli-mpnet-base-v2.pkl'
#   'en-emd-nli-roberta-base-v2.pkl'
#   'en-emd-digitalepidemiologylab-covid-twitter-bert-v2.pkl'
#   'en-emd-cardiffnlp-twitter-roberta-base.pkl'
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
embeddings_pickle = 'en-emd-facebook-laser.pkl'
df_news_tweets = pd.read_pickle(embeddings_pickle)

In [4]:
import os
# Enumerate GPUs in PCI bus order and expose only device "1" to CUDA.
# Both variables are set in a single call; values are unchanged.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [4]:
# Cluster assignments to summarise.
# Alternative clustering result: 'Agglomerative_CosTh85.pkl'
cluster_labels_pickle = 'Leiden_CosTh85.pkl'
df_cluster_labels = pd.read_pickle(cluster_labels_pickle)

In [6]:
# Split the labelled frame into one sub-frame per cluster, keyed by label.
# NOTE(review): keys run 0..k-1 where k is the number of distinct labels, so
# this assumes `com_label` holds contiguous zero-based integer ids — confirm.
dict_com = {
    label: df_cluster_labels.loc[df_cluster_labels['com_label'] == label]
    for label in range(len(set(df_cluster_labels['com_label'])))
}

In [7]:
# Sanity check: number of per-cluster frames held in dict_com.
len(dict_com)

705

### Remove duplicate and near-duplicate tweets by setting a larger similarity threshold

In [10]:
from sklearn.cluster import AgglomerativeClustering

In [11]:
def agglom_cluster(corpus, embeddings, threshold):
    """Group near-identical items with average-linkage agglomerative clustering.

    Pairwise cosine similarities of ``embeddings`` are turned into distances
    (``1 - similarity``) and clustered without a fixed cluster count; clusters
    whose average linkage distance is at or above ``threshold`` are not merged.

    Args:
        corpus: Texts aligned with ``embeddings``. Accepted for backward
            compatibility only; the result is built from positions, not text.
        embeddings: Sequence of embedding vectors accepted by ``util.cos_sim``.
        threshold: Cosine-distance cut-off forwarded as ``distance_threshold``.

    Returns:
        A list of ``(cluster_id, [positions])`` tuples ordered from the largest
        cluster to the smallest. Positions index into ``embeddings`` and are in
        ascending order within each cluster.
    """
    n_items = len(embeddings)
    # AgglomerativeClustering rejects inputs with fewer than two samples, so
    # empty and single-item inputs are answered directly.
    if n_items == 0:
        return []
    if n_items == 1:
        return [(0, [0])]

    cos_mat = util.cos_sim(embeddings, embeddings)
    distances = 1 - cos_mat.detach().cpu().numpy()

    model_kwargs = dict(n_clusters=None, distance_threshold=threshold, linkage='average')
    try:
        # scikit-learn >= 1.2 calls this parameter `metric`; `affinity` was removed in 1.4.
        clustering_model = AgglomerativeClustering(metric='precomputed', **model_kwargs)
    except TypeError:
        # Older scikit-learn releases only know the `affinity` keyword.
        clustering_model = AgglomerativeClustering(affinity='precomputed', **model_kwargs)
    clustering_model.fit(distances)

    clustered_ids = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        clustered_ids.setdefault(cluster_id, []).append(sentence_id)

    # sorted() is stable, so equally sized clusters keep first-appearance order.
    communities = sorted(clustered_ids.items(), key=lambda item: len(item[1]), reverse=True)

    return communities

In [13]:
# Concatenate tweet texts and news summaries for each cluster.
# Cosine distance below which tweets are treated as near-duplicates
# (0.1 corresponds to a cosine similarity of 0.9).
DEDUP_DISTANCE_THRESHOLD = 0.1

total_tweets_summaries = []
total_news_ids = []
total_news_summaries = []
total_com_ids = []

for com in range(len(dict_com)):
    new_com = dict_com[com]
    total_com_ids.append(com)

    # drop_duplicates() keeps first-appearance order. set() ordering of strings
    # changes between interpreter runs, which made the joined ids/summaries
    # (and anything scored against them) irreproducible.
    total_news_ids.append(','.join(new_com['news_id'].drop_duplicates()))
    total_news_summaries.append('\n'.join(new_com['news_summary'].drop_duplicates()))

    sub_communities = agglom_cluster(
        new_com['clean_text'].tolist(),
        new_com['tweet_embeddings'].tolist(),
        threshold=DEDUP_DISTANCE_THRESHOLD,
    )

    # Keep one representative per near-duplicate group: its first member.
    # Positional lookup is used so a non-unique index cannot pull in extra rows.
    clean_texts = new_com['clean_text']
    representatives = [clean_texts.iloc[positions[0]] for _, positions in sub_communities]

    total_tweets_summaries.append('\n'.join(representatives))

### Abstractive summarization
Models: Bart, T5

In [8]:
from transformers import pipeline

In [9]:
# Summarization pipeline placed on device index 0.
# Alternative checkpoint: 't5-large'
summarizer = pipeline(
    'summarization',
    model='facebook/bart-large-cnn',
    device=0,
)

In [14]:
# Summarise each cluster's representative tweets in fixed-size batches.
# Every batch, including the final partial one, is produced by the same loop,
# so the result no longer depends on a loop variable surviving after the loop
# (which broke when there were 15 or fewer clusters).
SUMMARY_BATCH_SIZE = 15

total_summaries = []  # NOTE(review): never populated in the visible cells; kept in case other cells reference it
total = []
for start in range(0, len(total_tweets_summaries), SUMMARY_BATCH_SIZE):
    batch = total_tweets_summaries[start:start + SUMMARY_BATCH_SIZE]
    sumry = summarizer(batch, truncation=True, min_length=10, max_length=140)
    total += [s['summary_text'] for s in sumry]

Your max_length is set to 140, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 103. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 59. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 140, but you input_length is only 99. You might consi

In [15]:
# Sanity check: number of generated summaries collected in `total`.
len(total)

705

In [16]:
from nltk.tokenize import sent_tokenize

In [17]:
# Put each sentence of a generated summary on its own line so the layout
# matches the newline-separated news summaries.
total_sumry = []
for para in total:
    sentences = sent_tokenize(para)
    total_sumry.append('\n'.join(sentences))

In [18]:
# Gather the per-cluster lists into one frame (one row per cluster).
dict_summary = dict(
    news_ids=total_news_ids,
    news_summaries=total_news_summaries,
    tweets_summaries=total_sumry,
    tweets=total_tweets_summaries,
)
df_summary = pd.DataFrame(data=dict_summary)

In [19]:
# Persist the summary frame.
# Alternative output names for the other model/clustering combinations:
#   't5_leiden.pkl', 'bart_agglomerative.pkl', 't5_agglomerative.pkl'
summary_pickle = 'bart_leiden.pkl'
df_summary.to_pickle(summary_pickle)

### Calculate ROUGE score for summarization evaluation

In [20]:
from datasets import load_metric
# Load both evaluation metrics by name, in the same order as before.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm before upgrading.
rouge, bertscore = (load_metric(metric_name) for metric_name in ('rouge', 'bertscore'))

In [21]:
# Score the generated tweet summaries against the news summaries with ROUGE.
rouge_predictions = df_summary['tweets_summaries'].tolist()
rouge_references = df_summary['news_summaries'].tolist()
rouge_results = rouge.compute(predictions=rouge_predictions, references=rouge_references)

In [22]:
# Display the aggregated ROUGE scores returned by rouge.compute.
rouge_results

{'rouge1': AggregateScore(low=Score(precision=0.43368526759846837, recall=0.6325468508516967, fmeasure=0.47947813401556355), mid=Score(precision=0.4530424028216105, recall=0.6558864013238046, fmeasure=0.49935404988195053), high=Score(precision=0.47315352025898383, recall=0.6803422645131049, fmeasure=0.520343374349764)),
 'rouge2': AggregateScore(low=Score(precision=0.3181877067227712, recall=0.46972141044030025, fmeasure=0.35453257492847434), mid=Score(precision=0.3422487225221194, recall=0.49900197543518265, fmeasure=0.380818591030837), high=Score(precision=0.366161424297369, recall=0.5276762253629017, fmeasure=0.40542384776988283)),
 'rougeL': AggregateScore(low=Score(precision=0.39932435241284453, recall=0.5819279741294705, fmeasure=0.44282582296705614), mid=Score(precision=0.4210870556787015, recall=0.6065152622973269, fmeasure=0.4646634433377768), high=Score(precision=0.443205122204518, recall=0.6304697352802568, fmeasure=0.48590849803516917)),
 'rougeLsum': AggregateScore(low=Sco

In [23]:
# Score the same prediction/reference pairs with BERTScore (English model).
bert_predictions = df_summary['tweets_summaries'].tolist()
bert_references = df_summary['news_summaries'].tolist()
bert_results = bertscore.compute(
    predictions=bert_predictions,
    references=bert_references,
    lang='en',
)

In [24]:
# Mean BERTScore F1 across all prediction/reference pairs.
np.mean(bert_results['f1'])

0.9076200063346971