In [2]:
import json
import pandas as pd
import requests
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [3]:
duet_stitch = pd.read_csv('../data/duet_stitch_uniques.csv')

In [4]:
duet_stitch.head()

Unnamed: 0.1,Unnamed: 0,username,video_description,id,creator
0,0,da2thwillsetu3,#duetwith@cjdeeznutzmaffia#intersectionalfemin...,7175840146766433538,cjdeeznutzmaffia
1,1,becomingchristine,#duetwith@karaaah#politics#jamie#raskin#evange...,7176140216506305834,karaaah
2,2,twinscott,#stitchwith@venetialamannam&sarepullingoutoffa...,7158808636972698885,venetialamanna
3,3,beardsndgears,#stitchwith@soundbite_3.0nothingmakessenseanym...,7157889471503928618,gma
4,4,thestreetswannaknow,#greenscreen#streetswannaknow#is#a#millionarie...,7183689694155263278,streetswannaknow


In [5]:
videos = pd.read_json('../../shared-folder-gald/data/video-creators.json')

In [6]:
videos.head()

Unnamed: 0,username,video_description,region_code,share_count,hashtag_names,id,like_count,music_id,view_count,voice_to_text,comment_count,create_time,cluster,cluster-label,playlist_id,effect_ids
0,the.animal.holocaust,Replying to @ov10bronco #dominicizrealmyers #v...,US,4,"[vegan, yes, bbq, meat, carnivore, govegan, ve...",7274212644502998314,40,7.274213e+18,1054,"You say, why torture yourself? Because all we ...",181.0,2023-09-02 13:00:37,10,Healthy Cooking,,
1,bakemehealthylove,How to make: 1. Get Your Mix: Grab our Oatmeal...,US,4,"[wafflemix, waffleday, plantbased, strawberryw...",7271102720256314666,65,7.217848e+18,1087,Happy National Waffle Day we're celebrating wi...,0.0,2023-08-25 03:52:14,10,Healthy Cooking,,
2,livinapril7,Looking for a healthy #plantbased #protein pac...,US,0,"[food, healthy, lunch, health, salad, tasty, o...",7272105378920353054,2,6.705026e+18,271,,0.0,2023-08-27 20:43:20,10,Healthy Cooking,,
3,goodvibessocietyofficial,"Pull up to Sprouts, grab those good vibes bott...",US,0,"[drinks, weekendvibes, sprouts, plantbased, su...",7271427012529524011,31,7.271427e+18,372,,2.0,2023-08-26 00:50:41,-1,Outliers,,
4,settonfarms,"At Setton Farms, we are committed to sustainab...",US,1,"[nuts, recipe, farm, farmlife, harvest, pistac...",7273980908565433646,120,7.133309e+18,7031,,2.0,2023-09-01 22:01:08,8,Gardening,,


In [7]:
hashtags_to_remove = set([
    'hashtag', 'fyp', 'foryou', 'foryoupage', 'fypシ', 'viral', 'love',
    'trending', 'tiktok', 'funny', 'fypage', 'capcut', 'duet', 'news',
    'foryourpage', 'fy', 'fypシ゚viral', 'follow', 'viralvideo', 'like',
    'trend', 'stitch', 'video', 'lol', 'instagram', 'asmr', 'explorepage',
    'instagood', 'viraltiktok', 'youtube', 'share', 'new', '2023', 'reels',
    'followme', 'vlog', 'satisfying', 'viralvideos', 'wow', 'funnyvideos',
    'repost', 'relatable', 'followforfollowback', 'breakingnews', 'storytime',
    'tiktokfamous', 'greenscreenvideo', 'for', 'foru', 'tiktoktrend', 'goviral',
    'bhfyp', 'viralpost', 'f', 'tiktoker', 'fypp', 'fyppppppppppppppppppppppp',
    'tiktokviral'
])

# Function to remove unwanted hashtags
def clean_hashtags(hashtags):
    if isinstance(hashtags, list):  # Ensure it's a list
        return [tag for tag in hashtags if tag not in hashtags_to_remove]
    return hashtags  # Return as-is if not a list

# Apply the cleaning function
videos['hashtag_names'] = videos['hashtag_names'].apply(clean_hashtags)

# Display the cleaned dataframe
print(videos[['hashtag_names']].head())

                                       hashtag_names
0  [vegan, yes, bbq, meat, carnivore, govegan, ve...
1  [wafflemix, waffleday, plantbased, strawberryw...
2  [food, healthy, lunch, health, salad, tasty, o...
3  [drinks, weekendvibes, sprouts, plantbased, su...
4  [nuts, recipe, farm, farmlife, harvest, pistac...


In [9]:
# videos dictionary 
user_videos = {}
for index, row in videos.iterrows():
    user_videos.setdefault(row['username'], []).append(row['id'])

filter the videos by users that exist in the stitch/duet database 


In [10]:
usernames_unique = pd.unique(duet_stitch[['username', 'creator']].values.ravel()).tolist()
usernames_unique

['da2thwillsetu3',
 'cjdeeznutzmaffia',
 'becomingchristine',
 'karaaah',
 'twinscott',
 'venetialamanna',
 'beardsndgears',
 'gma',
 'thestreetswannaknow',
 'streetswannaknow',
 'mattsrefillery',
 'thebigfavorite',
 'info_4_you',
 'global.crisis',
 'drygoodsrefillery',
 'tiny_waste',
 'rawdoggin_life',
 'vicenews',
 'dw_planeta',
 'ecofran',
 'redcrown32',
 'stoniepresents',
 'sageadvice22',
 'cardinalandpine',
 'lynnrazz2',
 'whatsupcv',
 'pixilatepixies',
 'nowthis',
 'preciousjoyjones3',
 'independent',
 'brittainspears1983',
 'mrpoliticalclips',
 'counterpointpolitics',
 'julianadeliberais',
 'k3lli4nne',
 'a.riverrat',
 'pique_action',
 'blerdronner',
 'nexamp',
 'luis_islandboi',
 'xrnyc',
 'padi',
 'paigebrownunicorn',
 'weather',
 'somewhatsustainable',
 'ecoshika',
 'radd7773nab',
 'nativeamericanlegend',
 'wearepuzzles',
 'genzforchange',
 'ryangle3000',
 'alternativelivingspaces',
 'bori3xx',
 'ilo',
 'trashcaulin',
 'oneoceanresearch',
 'a_thing_with_feathers',
 'goodgoodg

In [11]:
# Initialize an empty dictionary to store user -> hashtags
user_hashtags = {}

for user, videos in videos.groupby('username'):  # No need for iterrows()
    all_hashtags = []
    
    for _, row in videos.iterrows():  # Now iterating rows correctly
        all_hashtags.extend(row['hashtag_names'])  # Add hashtags from each row
    
    # Store the concatenated hashtags in the dictionary
    user_hashtags[user] = all_hashtags


In [12]:
# Convert usernames_unique to a set for faster lookups (O(1) instead of O(n))
usernames_unique_set = set(usernames_unique)

# Filter dictionary: Keep only users in usernames_unique
filtered_user_hashtags = {user: videos for user, videos in user_hashtags.items() if user in usernames_unique_set}

In [13]:
#hashtags to  text format
user_hashtag_text = {user: " ".join(hashtags) for user, hashtags in filtered_user_hashtags.items()}
print(user_hashtag_text)



In [14]:
from collections import Counter

# Flatten all hashtags and count occurrences
hashtag_counts = Counter(tag for tags in user_hashtag_text.values() for tag in tags.split())

# Keep hashtags appearing at least 3 times
frequent_hashtags = {tag for tag, count in hashtag_counts.items() if count >= 3}

# Filter hashtags in user_hashtag_text
filtered_user_hashtag_text = {
    user: " ".join([tag for tag in hashtags.split() if tag in frequent_hashtags])
    for user, hashtags in user_hashtag_text.items()
}


In [15]:
# Step 2: Compute TF-IDF
vectorizer = TfidfVectorizer(max_features=100000)
tfidf_matrix = vectorizer.fit_transform(filtered_user_hashtag_text.values())

In [16]:
# Get feature names (hashtags)
feature_names = vectorizer.get_feature_names_out()

# Step 3: Extract Top Hashtags Per User
user_tfidf_scores = {}

for i, user in enumerate(filtered_user_hashtag_text.keys()):
    # Get TF-IDF scores for this user
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    
    # Rank hashtags by score
    top_indices = tfidf_scores.argsort()[::-1]  # Sort in descending order
    top_hashtags = [feature_names[idx] for idx in top_indices[:]]  # Get top 5 hashtags
    
    # Store in dictionary
    user_tfidf_scores[user] = top_hashtags


In [17]:
# Step 4: Append to Original Dictionary
user_hashtags = defaultdict(list)
for user, hashtags in user_tfidf_scores.items():
    user_hashtags[user].extend(hashtags)

# Convert to normal dictionary (optional)
user_hashtags = dict(user_hashtags)


In [18]:
# Verify the number of relationships
print(f"Total number of relationships: {len(duet_stitch)}")

Total number of relationships: 730


creating pickle for duet_stitch hashtags called : duet_stitch_hashtags.csv

In [None]:
import pickle

with open('duet_stitch_hashtags.csv', 'wb') as f:
    pickle.dump(user_hashtags, f)

In [None]:
with open('duet_stitch_hashtags.csv', 'rb') as f:  # 'rb' = read binary
    user_hashtags = pickle.load(f)

In [21]:
edges = defaultdict(int)

In [22]:
# Build the network
for _, row in duet_stitch.iterrows():
    user1 = row['username']
    user2 = row['creator']

    hashtags_user1 = user_hashtags.get(user1, [])
    hashtags_user2 = user_hashtags.get(user2, [])

    for h1 in hashtags_user1:
        for h2 in hashtags_user2:
            edges[(h1, h2)] += 1

# Create directed graph
G = nx.DiGraph()
for (h1, h2), weight in edges.items():
    G.add_edge(h1, h2, weight=weight)

# Print sample edges
print("Some edges in the directed network with weights:")
for edge in list(G.edges(data=True))[:10]:
    print(edge)

print(f"Total number of edges: {len(G.edges())}")

KeyboardInterrupt: 

In [None]:
nx.write_graphml(G, "duet_stitch_graph.graphml")

In [None]:
import pickle

with open("duet_stitch_graph.pkl", "wb") as f:
    pickle.dump(G, f)

In [None]:
import csv

edges_with_weights = G.edges(data=True)

# Open a CSV file to write
with open("ds_edgelist_unipartite.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Username", "Creator", "Weight"])  # Column headers
    
    # Write the edges and weights
    for u, v, weight in edges_with_weights:
        writer.writerow([u, v, weight['weight']])