In [1]:
import pickle
from collections import defaultdict
from itertools import combinations

import networkx as nx
import numpy as np
import pandas as pd

In [3]:
# Input locations, relative to the notebook's working directory.
LIKES_CSV_PATH = 'data/liked_videos.csv'
VIDEO_CREATORS_JSON_PATH = '../shared-folder-gald/data/video-creators.json'

# Liked-videos table (columns used below: username, video_id, creator).
likes = pd.read_csv(LIKES_CSV_PATH)

# Video metadata (columns used below: id, hashtag_names).
videos = pd.read_json(VIDEO_CREATORS_JSON_PATH)

In [13]:
# Preview the raw likes table.
likes

Unnamed: 0,username,video_id,creator
0,bbcnews,7421594306210696481,taliamaizels
1,bbcnews,7454572661281262880,bbcnews
2,bbcnews,7444593083552976161,bbcnews
3,bbcnews,7381432178942414112,mahmood
4,bbcnews,7467351767571778858,msnbc
...,...,...,...
17546,maikengreimel,7467861092786638102,janes.wonda
17547,maikengreimel,7465696976609266966,desperatebookwives
17548,maikengreimel,7466856348878212374,ninagreimel
17549,maikengreimel,7451570066488921366,sandrasbooks


In [4]:
# Attach each liked video's hashtag list, then keep only the likes for which
# hashtags are known.
video_hashtags = videos[['id', 'hashtag_names']]

likes_with_hashtags = likes.merge(
    video_hashtags,
    how='left',
    left_on='video_id',
    right_on='id',
)

# 'id' duplicates 'video_id' after the merge and 'creator' is not used below.
# Both are dropped before dropna() so that only gaps in the remaining columns
# remove a row.
likes_vid_hash = likes_with_hashtags.drop(columns=['id', 'creator']).dropna()


In [40]:
# Preview the likes that have a hashtag list attached.
likes_vid_hash

Unnamed: 0,username,video_id,hashtag_names
16,nowthisearth,7233493968586198318,"[nature, cool, wow, asmr, antarctica]"
20,nowthisearth,7245393461862468907,"[uae, oil, fossilfuels, cop28]"
21,nowthisearth,7234983316443778346,"[honey, bees, pov, worldbeeday]"
22,nowthisearth,7231664936122862890,"[glass, earth, recycle, ecofriendly, upcycle]"
23,nowthisearth,7230893194458778926,"[texas, shooting, allen]"
...,...,...,...
15554,lrcphl,7218211357592702213,"[uknature, ecotok, climateuk, climateactionuk]"
15597,alyssa_says_marigolds,7190827949141134597,"[black, diversity, foryou, equality, blm, ally..."
16570,wxether,7233786615645179182,"[severeweather, fypシ, fypシ゚viral, stayprepared..."
16580,wokestwoke1312,7192894502690966785,"[woke, politics, fyp, satire, sjw, democrat, r..."


In [41]:
# Size of the hashtag-annotated likes table along its three dimensions.
unique_hashtag_total = likes_vid_hash['hashtag_names'].explode().nunique()
unique_video_total = likes_vid_hash['video_id'].nunique()
unique_username_total = likes_vid_hash['username'].nunique()

print('Number of unique hashtags: ', unique_hashtag_total)
print('Number of unique videos: ', unique_video_total)
print('Number of unique usernames: ', unique_username_total)

Number of unique hashtags:  475
Number of unique videos:  62
Number of unique usernames:  22


In [44]:
# Build one hashtag co-occurrence network per user.
#
# For every user, a bipartite video-hashtag graph is assembled from the videos
# they liked and then projected onto the hashtag side. With ratio=False the
# projected edge weight is the number of that user's liked videos that carry
# both hashtags.

# Get unique usernames (kept as a notebook-level name)
usernames = likes_vid_hash['username'].unique()

# Dictionary to store graphs for each user: {username: hashtag projection}
user_networks = {}

# A single groupby pass replaces re-filtering the whole frame once per user.
# sort=False keeps users in order of first appearance, i.e. the same order as
# `usernames`.
for user, user_data in likes_vid_hash.groupby('username', sort=False):
    # Create a bipartite graph: videos on side 0, hashtags on side 1
    B = nx.Graph()
    for video, video_hashtags in zip(user_data['video_id'], user_data['hashtag_names']):
        B.add_node(video, bipartite=0)  # Video node
        B.add_nodes_from(video_hashtags, bipartite=1)  # Hashtag nodes
        B.add_edges_from((video, hashtag) for hashtag in video_hashtags)

    # Extract hashtag nodes
    hashtag_nodes = {node for node, data in B.nodes(data=True) if data['bipartite'] == 1}

    # Project onto the hashtag layer using weighted projection and store it
    user_networks[user] = nx.bipartite.weighted_projected_graph(B, hashtag_nodes, ratio=False)


In [58]:
# Merge the per-user hashtag networks into a single graph by summing, for each
# hashtag pair, the edge weights found across users.
# NOTE(review): only edges are copied, so a hashtag that is isolated in every
# user network does not appear in the aggregate - confirm that is intended.
aggregated_graph = nx.Graph()
for projection in user_networks.values():
    for tag_a, tag_b, attrs in projection.edges(data=True):
        contribution = attrs['weight']
        if not aggregated_graph.has_edge(tag_a, tag_b):
            # First time this pair is seen: create the edge.
            aggregated_graph.add_edge(tag_a, tag_b, weight=contribution)
            continue
        # Pair already present: accumulate this user's weight.
        aggregated_graph[tag_a][tag_b]['weight'] += contribution

In [63]:
# Persist the aggregated hashtag graph as a pickle next to the notebook.
with open("likes_g_aggregated.pkl", "wb") as pickle_file:
    pickle.dump(aggregated_graph, pickle_file)

## Creating the og likes network

User -> full list of hashtags from liked videos

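Edge list: |username|hashtag|weight| - directed (user -> hashtag), where weight is the number of times the hashtag occurs across the user's liked videos. Note that the graph built below is an undirected `nx.Graph`.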

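Project onto hashtags: two hashtags are linked when at least one user has both. The edge weight is the average, over the users they share, of that user's weight for the first hashtag plus their weight for the second.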

In [5]:
# One row per (like, hashtag): unpack each video's hashtag list.
df_exploded = likes_vid_hash.explode("hashtag_names")

# Number of rows per (username, hashtag) pair, i.e. how often each hashtag
# occurs across a user's liked videos.
per_user_hashtag_sizes = df_exploded.groupby(["username", "hashtag_names"]).size()
hashtag_counts = per_user_hashtag_sizes.reset_index(name="count")

In [6]:
# Undirected graph that will hold the user-hashtag "likes" network.
LG = nx.Graph()

In [7]:
# Link every user to each hashtag found on their liked videos; the edge weight
# is the per-user occurrence count from hashtag_counts.
# NOTE(review): usernames and hashtags are inserted as plain node keys of the
# same graph, so a string that is both a username and a hashtag would collapse
# into a single node - confirm this cannot happen in the data.
LG.add_weighted_edges_from(
    (record['username'], record['hashtag_names'], record['count'])
    for _, record in hashtag_counts.iterrows()
)

In [None]:
# Create a projection onto hashtags.
#
# Two hashtags are linked when at least one user has both among their liked
# videos. The edge weight is the mean, over those shared users, of
# (user's count for h1 + user's count for h2).
#
# Pairs are enumerated per user instead of testing every hashtag against every
# other one: each unordered pair is computed once, and only pairs that really
# share a user are visited. Counts are read from hashtag_counts rather than via
# LG.neighbors() because LG keeps usernames and hashtags in one node namespace,
# where a string that is both would be ambiguous.
hashtags = hashtag_counts['hashtag_names'].unique()

# {username: {hashtag: count}}
tag_counts_by_user = defaultdict(dict)
for username, tag, count in zip(
    hashtag_counts['username'],
    hashtag_counts['hashtag_names'],
    hashtag_counts['count'],
):
    tag_counts_by_user[username][tag] = count

# {(h1, h2): [sum of (count_h1 + count_h2) over shared users, number of shared users]}
pair_totals = defaultdict(lambda: [0, 0])
for tag_counts in tag_counts_by_user.values():
    for (h1, c1), (h2, c2) in combinations(tag_counts.items(), 2):
        # Canonical key so (h1, h2) and (h2, h1) accumulate into the same entry.
        totals = pair_totals[(h1, h2) if h1 <= h2 else (h2, h1)]
        totals[0] += c1 + c2
        totals[1] += 1

hashtag_projection = nx.Graph()
for (h1, h2), (weight_sum, shared_user_count) in pair_totals.items():
    # Average the summed weights over the users the two hashtags share.
    hashtag_projection.add_edge(h1, h2, weight=weight_sum / shared_user_count)


4.0
4.0
11.0
11.0
14.0
15.0
12.0
12.0
12.0
11.0
12.0
2.0
2.0
5.0
6.0
3.0
3.0
3.0
2.0
3.0
5.0
6.0
3.0
3.0
3.0
2.0
3.0
9.0
6.0
6.0
6.0
5.0
6.0
7.0
7.0
7.0
6.0
7.0
4.0
4.0
3.0
4.0
4.0
3.0
4.0
3.0
4.0
3.0
2.0
4.0
3.0
3.0
3.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
3.0
2.0
3.0
2.0
2.0
3.0
2.0
2.0
2.0
2.0
2.0
4.0
2.0
3.0
3.0
4.0
3.0
3.0
3.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
3.0
2.0
3.0
2.0
2.0
3.0
2.0
2.0
2.0
2.0
2.0
4.0
2.0
3.0
3.0
5.0
5.0
5.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0