In [38]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from collections import defaultdict

In [3]:
likes = pd.read_csv('data/liked_videos.csv')
videos = pd.read_json('../shared-folder-gald/data/video-creators.json')

In [39]:
likes_vid_hash = likes.merge(
    videos[['id', 'hashtag_names']], 
    left_on='video_id', 
    right_on='id', 
    how='left'
).drop(columns=['id', 'creator']).dropna()


In [40]:
likes_vid_hash

Unnamed: 0,username,video_id,hashtag_names
16,nowthisearth,7233493968586198318,"[nature, cool, wow, asmr, antarctica]"
20,nowthisearth,7245393461862468907,"[uae, oil, fossilfuels, cop28]"
21,nowthisearth,7234983316443778346,"[honey, bees, pov, worldbeeday]"
22,nowthisearth,7231664936122862890,"[glass, earth, recycle, ecofriendly, upcycle]"
23,nowthisearth,7230893194458778926,"[texas, shooting, allen]"
...,...,...,...
15554,lrcphl,7218211357592702213,"[uknature, ecotok, climateuk, climateactionuk]"
15597,alyssa_says_marigolds,7190827949141134597,"[black, diversity, foryou, equality, blm, ally..."
16570,wxether,7233786615645179182,"[severeweather, fypシ, fypシ゚viral, stayprepared..."
16580,wokestwoke1312,7192894502690966785,"[woke, politics, fyp, satire, sjw, democrat, r..."


In [41]:
print('Number of unique hashtags: ',likes_vid_hash['hashtag_names'].explode().nunique())
print('Number of unique videos: ',likes_vid_hash['video_id'].nunique())
print('Number of unique usernames: ',likes_vid_hash['username'].nunique())

Number of unique hashtags:  475
Number of unique videos:  62
Number of unique usernames:  22


In [44]:
# Get unique usernames
usernames = likes_vid_hash['username'].unique()

# Dictionary to store graphs for each user
user_networks = {}

for user in usernames:
    # Filter data for the user
    user_data = likes_vid_hash[likes_vid_hash['username'] == user]
    
    # Create a bipartite graph
    B = nx.Graph()
    for _, row in user_data.iterrows():
        video = row['video_id']
        hashtags = row['hashtag_names']
        
        # Add edges between video and its hashtags
        B.add_nodes_from([video], bipartite=0)  # Video nodes
        B.add_nodes_from(hashtags, bipartite=1)  # Hashtag nodes
        B.add_edges_from((video, hashtag) for hashtag in hashtags)
    
    # Extract hashtag nodes
    hashtags = {node for node, data in B.nodes(data=True) if data['bipartite'] == 1}
    
    # Project onto the hashtag layer using weighted projection
    hashtag_projection = nx.bipartite.weighted_projected_graph(B, hashtags, ratio=False)
    
    # Store the network
    user_networks[user] = hashtag_projection


In [58]:
# Aggregate all user networks into one graph
aggregated_graph = nx.Graph()
for user_graph in user_networks.values():
    for u, v, data in user_graph.edges(data=True):
        if aggregated_graph.has_edge(u, v):
            aggregated_graph[u][v]['weight'] += data['weight']
        else:
            aggregated_graph.add_edge(u, v, weight=data['weight'])

In [63]:
with open("likes_g_aggregated.pkl", "wb") as f:
    pickle.dump(aggregated_graph, f)

## Creating the og likes network

In [None]:
import random

all_hashtags = videos['hashtag_names'].explode().to_list()
usernames = videos['username'].explode().to_list()[0:400]
data = {"username": usernames, "hashtags": [random.sample(all_hashtags, 5) for _ in range(400)]}

# Create DataFrame
user_hash = pd.DataFrame(data)

# Display first few rows
user_hash.head()

Unnamed: 0,username,hashtags
0,the.animal.holocaust,"[videostar, foundinoxfam, atlanticstaionatl, t..."
1,bakemehealthylove,"[question, norfolksouthern, sustainableskincar..."
2,livinapril7,"[dinner, mpa, artwork, lighter, news]"
3,goodvibessocietyofficial,"[createexplore, tealeaf, weatherlive, authors,..."
4,settonfarms,"[closetdiy, chores, earnfromhome, jailtiktok, ..."


In [25]:
vid_hash = videos[['username', 'id', 'hashtag_names']]
vid_hash.head()

Unnamed: 0,username,id,hashtag_names
0,the.animal.holocaust,7274212644502998314,"[vegan, yes, bbq, meat, carnivore, govegan, ve..."
1,bakemehealthylove,7271102720256314666,"[wafflemix, waffleday, plantbased, strawberryw..."
2,livinapril7,7272105378920353054,"[food, healthy, lunch, health, salad, tasty, o..."
3,goodvibessocietyofficial,7271427012529524011,"[drinks, weekendvibes, sprouts, plantbased, su..."
4,settonfarms,7273980908565433646,"[nuts, recipe, farm, farmlife, harvest, pistac..."


In [36]:
all_videos = videos['id'].explode().to_list()
usernames = videos['username'].explode().to_list()[0:400]
data = {"username": [random.choice(usernames) for _ in range(1000)], "id": [random.choice(all_videos) for _ in range(1000)]}

# Create DataFrame
user_likes = pd.DataFrame(data)

# Display first few rows
user_likes.head()

Unnamed: 0,username,id
0,ixfny,7227006791274597659
1,randolph.sketch,7239098207022665002
2,maribliss_essentials,7193538320876653870
3,pacethepance,7211630941167521067
4,lorenzojohn6,7171981384800570667


In [42]:
likes.rename(columns={'video_id': 'id'}, inplace=True)

In [None]:


# Merge user_hash with user_likes to get hashtags for each user
merged = pd.merge(likes, user_hash, on='username', how='left')

# Now we need to gather the source (user) hashtags and target (video) hashtags
edges = []

# Iterate through each user and the videos they liked
for _, row in merged.iterrows():
    user_hashtags = row['hashtags']  # List of user hashtags
    video_id = row['id']

    # Find the video hashtags corresponding to this video_id
    video_hashtags_list = vid_hash[vid_hash['id'] == video_id]['hashtag_names'].values

    # Check if video_hashtags_list is empty
    if len(video_hashtags_list) == 0:
        continue  # Skip this iteration if no matching video is found

    video_hashtags = video_hashtags_list[0]  # Extract the hashtags
    
    # Create edges between each user hashtag and each video hashtag
    for user_hashtag in user_hashtags:
        for video_hashtag in video_hashtags:
            edges.append((user_hashtag, video_hashtag))

# Create a directed weighted graph
G = nx.DiGraph()

# Add edges with weights (count occurrences)
edge_weights = defaultdict(int)
for edge in edges:
    edge_weights[edge] += 1

# Add edges to the graph
for edge, weight in edge_weights.items():
    G.add_edge(edge[0], edge[1], weight=weight)

# Now you can visualize or analyze the graph
# Example: View the edges with weights
for u, v, data in G.edges(data=True):
    print(f"Source: {u}, Target: {v}, Weight: {data['weight']}")


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()