In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from collections import defaultdict

In [47]:
# Input locations, relative to the notebook's working directory.
likes_csv_path = '../data/liked_videos.csv'
videos_json_path = '../../shared-folder-gald/data/video-creators.json'

# `likes`: one row per (username, liked video_id, creator).
likes = pd.read_csv(likes_csv_path)
# `videos`: per-video metadata, including `id` and `hashtag_names`.
videos = pd.read_json(videos_json_path)

In [16]:
# Preview the first rows of the video metadata to confirm the columns used later (`id`, `hashtag_names`) are present.
videos.head()

Unnamed: 0,username,video_description,region_code,share_count,hashtag_names,id,like_count,music_id,view_count,voice_to_text,comment_count,create_time,cluster,cluster-label,playlist_id,effect_ids
0,the.animal.holocaust,Replying to @ov10bronco #dominicizrealmyers #v...,US,4,"[vegan, yes, bbq, meat, carnivore, govegan, ve...",7274212644502998314,40,7.274213e+18,1054,"You say, why torture yourself? Because all we ...",181.0,2023-09-02 13:00:37,10,Healthy Cooking,,
1,bakemehealthylove,How to make: 1. Get Your Mix: Grab our Oatmeal...,US,4,"[wafflemix, waffleday, plantbased, strawberryw...",7271102720256314666,65,7.217848e+18,1087,Happy National Waffle Day we're celebrating wi...,0.0,2023-08-25 03:52:14,10,Healthy Cooking,,
2,livinapril7,Looking for a healthy #plantbased #protein pac...,US,0,"[food, healthy, lunch, health, salad, tasty, o...",7272105378920353054,2,6.705026e+18,271,,0.0,2023-08-27 20:43:20,10,Healthy Cooking,,
3,goodvibessocietyofficial,"Pull up to Sprouts, grab those good vibes bott...",US,0,"[drinks, weekendvibes, sprouts, plantbased, su...",7271427012529524011,31,7.271427e+18,372,,2.0,2023-08-26 00:50:41,-1,Outliers,,
4,settonfarms,"At Setton Farms, we are committed to sustainab...",US,1,"[nuts, recipe, farm, farmlife, harvest, pistac...",7273980908565433646,120,7.133309e+18,7031,,2.0,2023-09-01 22:01:08,8,Gardening,,


In [15]:
# Display the likes table (username, video_id, creator); pandas elides the middle rows of a long frame.
likes

Unnamed: 0,username,video_id,creator
0,bbcnews,7421594306210696481,taliamaizels
1,bbcnews,7454572661281262880,bbcnews
2,bbcnews,7444593083552976161,bbcnews
3,bbcnews,7381432178942414112,mahmood
4,bbcnews,7467351767571778858,msnbc
...,...,...,...
21203,financialeyes,7489948830285221142,jaymeerees
21204,financialeyes,7489181937920265494,mantisbeats
21205,financialeyes,7488308106288483606,maryamsuuii
21206,financialeyes,7489824307871452418,lewisbaaron


In [50]:
# Attach each liked video's hashtag list to the like that references it.
#
# The lookup is reduced to one row per video `id` first: the left merge is
# meant to be many-to-one, and a video appearing twice in the metadata would
# otherwise duplicate its like rows and inflate the hashtag counts that are
# derived from this frame.
video_hashtags = videos[['id', 'hashtag_names']].drop_duplicates(subset='id')

likes_vid_hash = (
    likes
    .merge(video_hashtags, left_on='video_id', right_on='id', how='left')
    # `id` duplicates `video_id` after the merge; `creator` is not used downstream.
    .drop(columns=['id', 'creator'])
    # Likes whose video is missing from the metadata have NaN `hashtag_names`
    # after the left merge and are dropped here.
    .dropna()
)

In [53]:
# Keep only likes whose video carries at least one hashtag (drop empty lists).
has_hashtags = likes_vid_hash['hashtag_names'].map(lambda tags: tags != [])
likes_vid_hash = likes_vid_hash[has_hashtags]


In [54]:
# Display the filtered likes: one row per like, with the hashtag list of the liked video.
likes_vid_hash

Unnamed: 0,username,video_id,hashtag_names
16,nowthisearth,7233493968586198318,"[nature, cool, wow, asmr, antarctica]"
20,nowthisearth,7245393461862468907,"[uae, oil, fossilfuels, cop28]"
21,nowthisearth,7234983316443778346,"[honey, bees, pov, worldbeeday]"
22,nowthisearth,7231664936122862890,"[glass, earth, recycle, ecofriendly, upcycle]"
23,nowthisearth,7230893194458778926,"[texas, shooting, allen]"
...,...,...,...
16586,wokestwoke1312,7195867746087865602,"[woke, troll, lgbtq, blm, bait, trolling, fyp,..."
20499,justice197300,7167293926745885957,"[climatejustice, lossanddamage, cop27]"
20506,justice197300,7167853807715061038,"[wolves, climatechange, alandislands, aland, c..."
20561,bigcatrescue,7259798722421280042,"[bigcatrescue, pallascats, conservationheroes,..."


In [55]:
# Size of the matched data: distinct hashtags, liked videos and liking users.
unique_counts = (
    ('Number of unique hashtags: ', likes_vid_hash['hashtag_names'].explode().nunique()),
    ('Number of unique videos: ', likes_vid_hash['video_id'].nunique()),
    ('Number of unique usernames: ', likes_vid_hash['username'].nunique()),
)
for label, value in unique_counts:
    print(label, value)

Number of unique hashtags:  496
Number of unique videos:  64
Number of unique usernames:  24


## Creating the og likes network

User -> full list of hashtags from liked videos

|username|hashtag|weight| - directed

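Then project the user–hashtag network onto hashtags (two hashtags are linked when a user liked videos carrying both).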

In [9]:
# One row per (like, hashtag): each hashtag list is unrolled into separate rows.
df_exploded = likes_vid_hash.explode("hashtag_names")

# Number of rows per (username, hashtag) pair, i.e. how many of the user's
# liked videos carry that hashtag; this becomes the edge weight.
hashtag_counts = (
    df_exploded
    .groupby(["username", "hashtag_names"])
    .size()
    .rename("count")
    .reset_index()
)

In [67]:
# Summary of the user–hashtag edge list: size, how many edges have weight 1, and the largest weight.
is_single_like = hashtag_counts['count'].eq(1)
edge_stats = (
    ('Number of edges: ', hashtag_counts.shape[0]),
    ('Number of weights = 1: ', int(is_single_like.sum())),
    ('Max weight: ', hashtag_counts['count'].max()),
)
for label, value in edge_stats:
    print(label, value)

Number of edges:  535
Number of weights = 1:  464
Max weight:  10


In [10]:
# Undirected graph that will hold the user–hashtag edges. Usernames and hashtags
# are added as nodes under their raw values, so they share one node namespace.
LG = nx.Graph()

In [11]:
# Add one user–hashtag edge per row of `hashtag_counts`, with the count stored
# under the `weight` edge attribute.
user_hashtag_edges = hashtag_counts[['username', 'hashtag_names', 'count']].itertuples(index=False, name=None)
LG.add_weighted_edges_from(user_hashtag_edges, weight='weight')

In [63]:
# Project the user–hashtag network onto hashtags.
#
# Two hashtags are linked when at least one user has liked videos carrying both.
# The edge weight is the sum, over those shared users, of the user's count for
# each of the two hashtags, divided by the number of shared users.
#
# The per-hashtag user lookup is built from `hashtag_counts` instead of
# `LG.neighbors(...)`. LG keeps usernames and hashtags in a single node
# namespace, so a name that is both a username and a hashtag (`bigcatrescue`
# is one) becomes a single node whose neighbours mix users and hashtags,
# which can leak non-users into the "common users" intersection.
hashtags = hashtag_counts['hashtag_names'].unique()

# hashtag -> {username: count of that user's liked videos carrying the hashtag}
users_by_hashtag = defaultdict(dict)
for username, hashtag, count in hashtag_counts[['username', 'hashtag_names', 'count']].itertuples(index=False, name=None):
    users_by_hashtag[hashtag][username] = count

hashtag_projection = nx.Graph()

# The weight is symmetric in (h1, h2), so each unordered pair is visited once.
# Iterating in `hashtags` order keeps node and edge insertion order the same as
# a full ordered-pair sweep would produce.
for i, h1 in enumerate(hashtags):
    h1_users = users_by_hashtag[h1]
    for h2 in hashtags[i + 1:]:
        h2_users = users_by_hashtag[h2]

        # Users who liked videos with both hashtags
        common_users = h1_users.keys() & h2_users.keys()
        if not common_users:
            continue

        # Sum both hashtag counts for every shared user, then average per user
        weight_sum = sum(h1_users[u] + h2_users[u] for u in common_users)
        weight_avg = weight_sum / len(common_users)

        hashtag_projection.add_edge(h1, h2, weight=weight_avg)

# Print the weight of every edge in the projected network.
# NOTE(review): this emits one line per edge; consider a summary
# (e.g. a describe() of the weights) if the output becomes unwieldy.
for _, _, weight in hashtag_projection.edges(data='weight'):
    print(weight)


4.0
4.0
11.0
11.0
14.0
15.0
12.0
12.0
12.0
11.0
12.0
2.0
2.0
5.0
6.0
3.0
3.0
3.0
2.0
3.0
5.0
6.0
3.0
3.0
3.0
2.0
3.0
9.0
6.0
6.0
6.0
5.0
6.0
7.0
7.0
7.0
6.0
7.0
4.0
4.0
3.0
4.0
4.0
3.0
4.0
3.0
4.0
3.0
2.0
4.0
3.0
3.0
3.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
3.0
2.0
3.0
2.0
2.0
3.0
2.0
2.0
2.0
2.0
2.0
4.0
2.0
3.0
3.0
4.0
3.0
3.0
3.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
3.0
2.0
3.0
2.0
2.0
3.0
2.0
2.0
2.0
2.0
2.0
4.0
2.0
3.0
3.0
5.0
5.0
5.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0
4.0

In [69]:
# Persist the hashtag projection graph next to the notebook for later reuse.
network_path = 'likes_network.pkl'
with open(network_path, 'wb') as out_file:
    pickle.dump(hashtag_projection, out_file)