### Creating Network of hashtags based on followers relationships 


Steps:
1. directed follower network from user to user 
2. get videos of each user 
exclude if user has no hashtags in their videos 
3. use tf-idf on hashtags from all user videos and exctract the most significant ones 
function = hashtags of user 
4. create directed network of hashtags 
5. direction from hashtags of the follower to hashtags of the folowed 


In [4]:
import json
import pandas as pd
import requests
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
follows = pd.read_csv('../shared-folder-gald/data/follow-link.csv')

In [3]:
videos = pd.read_json('../shared-folder-gald/data/video-creators.json')

In [4]:
videos.head()

Unnamed: 0,username,video_description,region_code,share_count,hashtag_names,id,like_count,music_id,view_count,voice_to_text,comment_count,create_time,cluster,cluster-label,playlist_id,effect_ids
0,the.animal.holocaust,Replying to @ov10bronco #dominicizrealmyers #v...,US,4,"[vegan, yes, bbq, meat, carnivore, govegan, ve...",7274212644502998314,40,7.274213e+18,1054,"You say, why torture yourself? Because all we ...",181.0,2023-09-02 13:00:37,10,Healthy Cooking,,
1,bakemehealthylove,How to make: 1. Get Your Mix: Grab our Oatmeal...,US,4,"[wafflemix, waffleday, plantbased, strawberryw...",7271102720256314666,65,7.217848e+18,1087,Happy National Waffle Day we're celebrating wi...,0.0,2023-08-25 03:52:14,10,Healthy Cooking,,
2,livinapril7,Looking for a healthy #plantbased #protein pac...,US,0,"[food, healthy, lunch, health, salad, tasty, o...",7272105378920353054,2,6.705026e+18,271,,0.0,2023-08-27 20:43:20,10,Healthy Cooking,,
3,goodvibessocietyofficial,"Pull up to Sprouts, grab those good vibes bott...",US,0,"[drinks, weekendvibes, sprouts, plantbased, su...",7271427012529524011,31,7.271427e+18,372,,2.0,2023-08-26 00:50:41,-1,Outliers,,
4,settonfarms,"At Setton Farms, we are committed to sustainab...",US,1,"[nuts, recipe, farm, farmlife, harvest, pistac...",7273980908565433646,120,7.133309e+18,7031,,2.0,2023-09-01 22:01:08,8,Gardening,,


In [5]:
hashtags_to_remove = set([
    'hashtag', 'fyp', 'foryou', 'foryoupage', 'fypシ', 'viral', 'love',
    'trending', 'tiktok', 'funny', 'fypage', 'capcut', 'duet', 'news',
    'foryourpage', 'fy', 'fypシ゚viral', 'follow', 'viralvideo', 'like',
    'trend', 'stitch', 'video', 'lol', 'instagram', 'asmr', 'explorepage',
    'instagood', 'viraltiktok', 'youtube', 'share', 'new', '2023', 'reels',
    'followme', 'vlog', 'satisfying', 'viralvideos', 'wow', 'funnyvideos',
    'repost', 'relatable', 'followforfollowback', 'breakingnews', 'storytime',
    'tiktokfamous', 'greenscreenvideo', 'for', 'foru', 'tiktoktrend', 'goviral',
    'bhfyp', 'viralpost', 'f', 'tiktoker', 'fypp', 'fyppppppppppppppppppppppp',
    'tiktokviral'
])

# Function to remove unwanted hashtags
def clean_hashtags(hashtags):
    if isinstance(hashtags, list):  # Ensure it's a list
        return [tag for tag in hashtags if tag not in hashtags_to_remove]
    return hashtags  # Return as-is if not a list

# Apply the cleaning function
videos['hashtag_names'] = videos['hashtag_names'].apply(clean_hashtags)

# Display the cleaned dataframe
print(videos[['hashtag_names']].head())

                                       hashtag_names
0  [vegan, yes, bbq, meat, carnivore, govegan, ve...
1  [wafflemix, waffleday, plantbased, strawberryw...
2  [food, healthy, lunch, health, salad, tasty, o...
3  [drinks, weekendvibes, sprouts, plantbased, su...
4  [nuts, recipe, farm, farmlife, harvest, pistac...


In [6]:
user_videos = {}
for index, row in videos.iterrows():
    user_videos.setdefault(row['username'], []).append(row['id'])

filter the videos by users that exist in the follower database 


In [7]:
usernames_unique = pd.unique(follows[['source', 'target']].values.ravel()).tolist()
usernames_unique

['designs_from_time',
 'cosmicsummit',
 'conspiracyhubog',
 'guildworld',
 'venetialamanna',
 'i_d',
 'texasbeeworks',
 'dcopperman',
 'healthyholistichomes',
 'maxlamanna',
 'thatcurlytopp',
 'zainab.slow.fashion',
 'politicsjoe',
 'tedtoks',
 'bbcnews',
 'lois1xblue',
 'vicenews',
 'andreacheong_',
 'nssmagazine',
 'iobservefashion',
 'nowthisearth',
 'billnye',
 'worldeconomicforum',
 'gardenmarcus',
 'pelacase',
 'okczoo',
 'alexisnikole',
 'greenpeace_international',
 'onetreeplanted',
 'wwf',
 'newscientist',
 'kirasabin',
 'havrestudio',
 'earthrise.studio',
 'cristinamantas',
 'farmer_nick',
 'flourishingmother',
 'joycelynlongdon',
 'iamtabithabrown',
 'herbifoods',
 'minimalistbaker',
 'browngirlgreen',
 'pxgon',
 'sustainthemag',
 'aditimayer',
 'nataliatrevinoamaro',
 'swansonsfabrics',
 'stopbigoil',
 'ethiqueworld',
 'xiyebastida',
 'siranda_manchez',
 'domipalmer',
 'maggie_zhou',
 'ouryouth4theclimate',
 'ecofran',
 'thehellajam',
 'judybaogarden',
 'goingzerowaste_',
 

Get dictionary of hahtags per user

Using TF-IDF (Term Frequency-Inverse Document Frequency) to determine the most important hashtags for each user based on their videos.

Plan:
1. Convert user_videos into a format suitable for TfidfVectorizer.
2. Compute TF-IDF scores for hashtags within each user’s videos.
Compute TF-IDF: Treat each user as a "document" and their hashtags as "terms".
3. Select top hashtags per user based on importance.
4. Append those hashtags to your dictionary.

In [8]:
# Initialize an empty dictionary to store user -> hashtags
user_hashtags = {}

for user, videos in videos.groupby('username'):  # No need for iterrows()
    all_hashtags = []
    
    for _, row in videos.iterrows():  # Now iterating rows correctly
        all_hashtags.extend(row['hashtag_names'])  # Add hashtags from each row
    
    # Store the concatenated hashtags in the dictionary
    user_hashtags[user] = all_hashtags


In [9]:
# Convert usernames_unique to a set for faster lookups (O(1) instead of O(n))
usernames_unique_set = set(usernames_unique)

# Filter dictionary: Keep only users in usernames_unique
filtered_user_hashtags = {user: videos for user, videos in user_hashtags.items() if user in usernames_unique_set}

In [10]:
#number of creators is smaller after filtering by the follow list
print(len(list(filtered_user_hashtags.keys())))

13867


In [None]:
user_hashtag_text = {user: " ".join(hashtags) for user, hashtags in filtered_user_hashtags.items()}
print(user_hashtag_text)

In [None]:
from collections import Counter

# Flatten all hashtags and count occurrences
hashtag_counts = Counter(tag for tags in user_hashtag_text.values() for tag in tags.split())

# Keep hashtags appearing at least 3 times
frequent_hashtags = {tag for tag, count in hashtag_counts.items() if count >= 3}

# Filter hashtags in user_hashtag_text
filtered_user_hashtag_text = {
    user: " ".join([tag for tag in hashtags.split() if tag in frequent_hashtags])
    for user, hashtags in user_hashtag_text.items()
}


In [None]:
# Step 2: Compute TF-IDF
vectorizer = TfidfVectorizer(max_features=100000)
tfidf_matrix = vectorizer.fit_transform(filtered_user_hashtag_text.values())

In [None]:
# Get feature names (hashtags)
feature_names = vectorizer.get_feature_names_out()

# Step 3: Extract Top Hashtags Per User
user_tfidf_scores = {}

for i, user in enumerate(filtered_user_hashtag_text.keys()):
    # Get TF-IDF scores for this user
    tfidf_scores = tfidf_matrix[i].toarray().flatten()
    
    # Rank hashtags by score
    top_indices = tfidf_scores.argsort()[::-1]  # Sort in descending order
    top_hashtags = [feature_names[idx] for idx in top_indices[:]]  # Get top 5 hashtags
    
    # Store in dictionary
    user_tfidf_scores[user] = top_hashtags


: 

In [None]:
# Step 4: Append to Original Dictionary
user_hashtags = defaultdict(list)
for user, hashtags in user_tfidf_scores.items():
    user_hashtags[user].extend(hashtags)

# Convert to normal dictionary (optional)
user_hashtags = dict(user_hashtags)

# Output Example
print(user_hashtags)

In [None]:
import pickle

with open('user_hashtags.csv', 'wb') as f:
    pickle.dump(user_hashtags, f)

In [11]:
with open('user_hashtags.csv', 'rb') as f:  # 'rb' = read binary
    user_hashtags = pickle.load(f)

In [15]:
user_hashtags.keys()

dict_keys(['._._.jaysonkiogima_._._', '.alessandrarosa', '.bcole', '.cass.ervative5.0', '.e.liza.beth', '.giddyupboutique', '.ilovejellyfishh', '.jeune.espion', '.roadshow', '.salty.nomad', '.tarasina', '.utopiaisnow', '.webs.girl', '007outofcover', '0530jenserrano', '0_aviation_7', '0_r_i_o_n7', '0ddlyt3rr1fy1ng', '0nly_pans', '0p0lystyrene', '10gsocial', '10newsfirst', '1173182q55', '11thhourmsnbc', '1234elena214', '1234glitterstore', '126media', '12may17', '12tides', '1360truth', '13hollywood520', '15601470nru', '171ststreetgames', '1775.flag.this', '180shift', '180studios', '1890_homestead', '1972pandoy', '1984destiny1', '19980124zff', '19bigg84', '19xxcovey', '1damir_kee', '1hotels', '1lbclogo', '1loki3', '1maytweather', '1mooreshop', '1newsnz', '1rechelle', '1sgcal70', '1stblackmanukvillagetour', '1stdonoharm', '1wishywashyvegan', '2.0thrifts', '2000chevytahoe', '2000s.bbys', '2021sam2', '2024usa', '2050magazin', '20_years_ago_', '20shedidit21', '213raider76', '222ryles', '222vir