In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from time import time
from itertools import combinations
import peakmetrics_utilities
from peakmetrics_utilities import random_search, generate_clusters

In [None]:
# `util.pytorch_cos_sim` is used further down to score pairs of embeddings.
from sentence_transformers import util
# Patch pandas with tqdm's progress-bar variants of apply/map (e.g. `progress_apply`).
tqdm.pandas()

In [None]:
def _load_pickled(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code -- only point this at trusted files.
news = _load_pickled('news.pickle')
social = _load_pickled('social.pickle')
blog = _load_pickled('blog.pickle')

In [None]:
# Generate record indices for use by the Louvain community detection step:
# every frame gets a fresh 0..n-1 index, mirrored in a leading 'index' column
# that the edge-building cells below use as node ids.
def _with_positional_index(frame):
    """Return a copy of `frame` with a 0..n-1 index mirrored in an 'index' column.

    Any pre-existing 'index' column is discarded first and the old index is
    dropped rather than re-inserted, so re-running this cell yields the same
    frame instead of accumulating 'level_0' columns (and eventually raising).
    """
    return (
        frame.drop(columns='index', errors='ignore')
        .reset_index(drop=True)
        .reset_index()
    )


blog = _with_positional_index(blog)
news = _with_positional_index(news)
social = _with_positional_index(social)

### Blog data Louvain community detection

In [None]:
# Build one row per unordered pair of blog records, carrying both embeddings.
# Embeddings are fetched from a node-id -> embedding mapping built once, rather
# than materialising a full DataFrame row for every pair.
blog_embedding_by_node=dict(zip(blog['index'],blog['minilm_embeddings']))
blog_edges=list(combinations(blog['index'].to_list(),2))
blog_edges=[(a,b,blog_embedding_by_node[a],blog_embedding_by_node[b]) for a,b in tqdm(blog_edges, position=0)]
blog_edges=pd.DataFrame(blog_edges,columns=['node_1','node_2','vectors_1','vectors_2'])
# Cosine similarity of each pair. `progress_apply` is registered by the
# tqdm.pandas() call in the import cell, so no extra accessor package has to
# be imported for this to work on a fresh kernel.
blog_edges['proximity']=blog_edges.progress_apply(lambda x: util.pytorch_cos_sim(x['vectors_1'],x['vectors_2'])[0][0].item(), axis=1)

tik=time()
blog_G=peakmetrics_utilities.find_louvain_communities(blog_edges)
tok=time()

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(tok-tik)
# NOTE(review): this assignment aligns on the frame index, so it presumes
# blog_G is indexed by the same 0..n-1 node ids as `blog` -- confirm against
# find_louvain_communities.
blog['community']=blog_G['community']
# Constant placeholder value for every blog record.
# NOTE(review): the news/social frames store their labels under 'clusters'
# (plural); confirm which column name downstream consumers expect.
blog['cluster']=-2
with open('blog_clustered.pickle','wb') as f:
    pickle.dump(blog, f)

### News and Social media clustering --- negative records only

#### News media

In [None]:
# Keep only negative-sentiment news and renumber it, so that the 'index'
# column holds each record's 0..n-1 position within news_neg.
# NOTE(review): when `news` already carries an 'index' column, pandas names the
# column inserted by the first reset_index 'level_0' and it stays in the
# frame -- confirm that leftover column is wanted.
news_neg=(
    news.loc[news['sentiment_label']=='negative']
    .reset_index()
    .drop(columns='index')
    .reset_index()
)

In [None]:
# Build one row per unordered pair of negative-news records.
# Node ids are positions inside news_neg, so BOTH embeddings must be read from
# news_neg; reading the second one from the unfiltered `news` frame would pair
# the id with a different record's vector.
news_embedding_by_node=dict(zip(news_neg['index'],news_neg['minilm_embeddings']))
news_edges=list(combinations(news_neg['index'].to_list(),2))
news_edges=[(a,b,news_embedding_by_node[a],news_embedding_by_node[b]) for a,b in tqdm(news_edges, position=0)]
news_edges=pd.DataFrame(news_edges,columns=['node_1','node_2','vectors_1','vectors_2'])
# Cosine similarity of each pair. `progress_apply` is registered by the
# tqdm.pandas() call in the import cell, so no extra accessor package has to
# be imported for this to work on a fresh kernel.
news_edges['proximity']=news_edges.progress_apply(lambda x: util.pytorch_cos_sim(x['vectors_1'],x['vectors_2'])[0][0].item(), axis=1)

In [None]:
# Run Louvain community detection on the negative-news edge list and time it.
started=time()
news_neg_G=peakmetrics_utilities.find_louvain_communities(news_edges)
elapsed=time()-started

print(elapsed)
# Relative size of each detected community.
community_shares=news_neg_G['community'].value_counts(normalize=True)
print(community_shares)
# NOTE(review): aligns on the frame index; presumes news_neg_G is indexed by
# the same node ids as news_neg -- confirm against find_louvain_communities.
news_neg['community']=news_neg_G['community']

In [None]:
# Search space passed to random_search for both the news and social runs.
# Presumably the ranges are sampled per trial and random_state is a fixed
# seed -- confirm against random_search.
space=dict(
    n_neighbors=range(5,100),
    n_components=range(3,15),
    min_cluster_size=range(5,50),
    random_state=42,
)

In [None]:
# Collect the per-record embeddings into a single numpy array
# (presumably shape (n_records, embedding_dim) -- confirm).
news_neg_embeds=np.array(list(map(np.array, news_neg['minilm_embeddings'])))
# Third argument is 100 here; presumably the number of random trials -- confirm
# against random_search.
news_neg_random_use=random_search(news_neg_embeds, space, 100)

In [None]:
# Display the first 20 entries of the news random-search results.
news_neg_random_use.head(20)

In [None]:
# Fixed clustering settings for negative news (presumably picked from the
# random-search results shown above -- confirm).
news_neg_cluster_params=dict(
    n_neighbors=31,
    n_components=11,
    min_cluster_size=8,
    random_state=42,
)
news_neg_cluster_labels=generate_clusters(news_neg_embeds, **news_neg_cluster_params)
# Store the per-record labels exposed by the returned object.
news_neg['clusters']=news_neg_cluster_labels.labels_
# Share of records in each of the 20 largest clusters.
news_neg_cluster_shares=news_neg['clusters'].value_counts(normalize=True).head(20)
print(news_neg_cluster_shares)

In [None]:
# Persist the clustered negative-news frame. `pickle` is already imported in
# the notebook's first cell, so it is not re-imported here.
with open('news_neg_clustered.pickle','wb') as out_file:
    pickle.dump(news_neg, out_file)

#### Social media

In [None]:
# Keep only negative-sentiment social posts and renumber them, so that the
# 'index' column holds each record's 0..n-1 position within social_neg.
social_neg=social[social['sentiment_label']=='negative']
social_neg=social_neg.reset_index()
del(social_neg['index'])
social_neg=social_neg.reset_index()

# Build one row per unordered pair of negative social records.
# Node ids are positions inside social_neg, so the embeddings must be read from
# social_neg as well; looking them up in the unfiltered `social` frame would
# pair each id with a different record's vector.
social_embedding_by_node=dict(zip(social_neg['index'],social_neg['minilm_embeddings']))
social_edges=list(combinations(social_neg['index'].to_list(),2))
social_edges=[(a,b,social_embedding_by_node[a],social_embedding_by_node[b]) for a,b in tqdm(social_edges, position=0)]
social_edges=pd.DataFrame(social_edges,columns=['node_1','node_2','vectors_1','vectors_2'])
# Cosine similarity of each pair. `progress_apply` is registered by the
# tqdm.pandas() call in the import cell, so no extra accessor package has to
# be imported for this to work on a fresh kernel.
social_edges['proximity']=social_edges.progress_apply(lambda x: util.pytorch_cos_sim(x['vectors_1'],x['vectors_2'])[0][0].item(), axis=1)
tik=time()
social_neg_G=peakmetrics_utilities.find_louvain_communities(social_edges)
tok=time()

print(tok-tik)
# NOTE(review): aligns on the frame index; presumes social_neg_G is indexed by
# the same node ids as social_neg -- confirm against find_louvain_communities.
social_neg['community']=social_neg_G['community']
print(social_neg_G['community'].value_counts(normalize=True))

In [None]:
# Collect the per-record embeddings into a single numpy array
# (presumably shape (n_records, embedding_dim) -- confirm).
social_neg_embeds=np.array(list(map(np.array, social_neg['minilm_embeddings'])))
# Third argument is 50 here; presumably the number of random trials -- confirm
# against random_search.
social_neg_random_use=random_search(social_neg_embeds, space, 50)

In [None]:
# Display the first 20 entries of the social random-search results.
social_neg_random_use.head(20)

In [None]:
# Fixed clustering settings for negative social posts (presumably picked from
# the random-search results shown above -- confirm).
social_neg_cluster_params=dict(
    n_neighbors=54,
    n_components=13,
    min_cluster_size=27,
    random_state=42,
)
social_neg_cluster_labels=generate_clusters(social_neg_embeds, **social_neg_cluster_params)
# Store the per-record labels exposed by the returned object.
social_neg['clusters']=social_neg_cluster_labels.labels_

In [None]:
# Persist the clustered negative-social frame. `pickle` is already imported in
# the notebook's first cell, so it is not re-imported here.
with open('social_neg_clustered.pickle','wb') as out_file:
    pickle.dump(social_neg, out_file)