In [1]:
from sentence_transformers import SentenceTransformer
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import community as community_louvain  # Louvain algorithm
import matplotlib.pyplot as plt
import community as community_louvain
from collections import defaultdict
import pandas as pd

In [2]:
# Pre-trained sentence-embedding model, fetched by name (Hugging Face model id).
# The recorded run in this notebook shows it producing 384-dimensional vectors.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [3]:
# Read the pickled network from disk and collect its nodes (the hashtags).
import pickle

# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this project / a trusted source.
with open('../ready_networks/cooc_filtered.pkl', 'rb') as network_file:
    G = pickle.load(network_file)

# One entry per node, in the graph's own iteration order.
hashtags = [node for node in G.nodes]

In [5]:
# Encode every hashtag into a dense vector with the sentence-transformer model.
embeddings = model.encode(sentences=hashtags)

# Report (number of hashtags, embedding dimension). The recorded run below
# shows 60609 hashtags, each represented by a 384-dimensional vector.
embedding_shape = embeddings.shape
print(embedding_shape)

(60609, 384)


In [None]:
# Pairwise similarity between all hashtag embeddings, computed with the
# model's own similarity function. The result is a square tensor with one
# row/column per hashtag (see the printed output: symmetric, 1.0 on the
# diagonal).
# NOTE(review): presumably cosine similarity (the model's default) — confirm.
# `similarities` is the variable the histogram cell further down reads.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.4957, 0.1060,  ..., 0.1704, 0.2799, 0.1584],
        [0.4957, 1.0000, 0.0737,  ..., 0.2421, 0.1139, 0.1472],
        [0.1060, 0.0737, 1.0000,  ..., 0.3066, 0.3126, 0.2193],
        ...,
        [0.1704, 0.2421, 0.3066,  ..., 1.0000, 0.2415, 0.2199],
        [0.2799, 0.1139, 0.3126,  ..., 0.2415, 1.0000, 0.1440],
        [0.1584, 0.1472, 0.2193,  ..., 0.2199, 0.1440, 1.0000]])


: 

In [None]:
# Cosine similarity between every pair of embedding rows (scikit-learn),
# giving a dense square matrix with one row/column per hashtag. This is the
# matrix the graph-building cell thresholds to create edges.
# NOTE(review): this looks like a second copy of what `similarities` already
# holds from the previous cell (if that is also cosine similarity); with the
# ~60k hashtags of the recorded run each copy is a ~60k x ~60k dense matrix,
# so consider keeping only one of them.
similarity_matrix = cosine_similarity(embeddings)

In [None]:
# Plot a histogram of the similarity values to pick the cut-off used when
# turning similarities into graph edges.
# (`plt` is already imported in the first cell, so it is not re-imported here.)

# `similarities` is the square hashtag-by-hashtag similarity matrix; flattening
# it means every entry is counted, including the diagonal and both (i, j) and
# (j, i) of each pair.
fig, ax = plt.subplots()
ax.hist(similarities.flatten(), bins=100)
ax.set_title('Distribution of Cosine Similarities')
plt.show()

In [None]:
# Build the semantic-similarity graph: one node per hashtag, and an undirected
# weighted edge between two hashtags whenever their cosine similarity exceeds
# the threshold.
#
# NOTE: this rebinds `G`, replacing the co-occurrence network loaded earlier;
# `hashtags` was already extracted from it, so nothing below needs the old graph.

# Minimum similarity (exclusive) for two hashtags to be connected.
# Chosen by eye from the similarity histogram.
SIMILARITY_THRESHOLD = 0.2

G = nx.Graph()
n_hashtags = len(hashtags)
for i in range(n_hashtags):
    source = hashtags[i]
    # Add the node explicitly so hashtags without any edge are still present.
    G.add_node(source)

    # Only look to the right of the diagonal: each unordered pair is visited
    # once and self-similarity is skipped.
    row_tail = similarity_matrix[i, i + 1:]

    # Vectorised threshold test for the whole row instead of a Python-level
    # comparison per pair (the pair count grows quadratically with the number
    # of hashtags).
    hits = np.flatnonzero(row_tail > SIMILARITY_THRESHOLD)

    # Edges are added in ascending column order, matching the node/edge
    # insertion order of a plain nested loop. `.tolist()` yields plain Python
    # ints/floats, which iterate faster and pickle portably.
    G.add_weighted_edges_from(
        (source, hashtags[i + 1 + offset], weight)
        for offset, weight in zip(hits.tolist(), row_tail[hits].tolist())
    )

In [None]:
# Cluster the similarity graph with the Louvain algorithm, using the edge
# attribute 'weight' (the cosine similarity) as the edge weight.
# `best_partition` returns a dict mapping each node to its community id.
#
# Louvain processes nodes in a randomised order, so the seed is fixed to make
# the partition reproducible across kernel restarts.
LOUVAIN_SEED = 42
partition = community_louvain.best_partition(
    G, weight='weight', random_state=LOUVAIN_SEED
)

In [None]:
# Persist the similarity graph as a pickle in the current working directory.
# NOTE(review): only `G` is written here; the Louvain `partition` computed
# above is not saved by this cell — confirm that is intended.
OUTPUT_PATH = 'S_R_cooc.pkl'
with open(OUTPUT_PATH, 'wb') as out_file:
    pickle.dump(G, out_file)