## Initial hashtag analysis
Tayae Rogers, 3/12/2024

In [78]:
import os
import csv
from collections import defaultdict
import pandas as pd
import re
import json

In [5]:
def list_of_hashtags(csv_in):
    """
    Takes the hashtag CSV (our shared Google Sheets file) and returns a list of hashtags.

    csv_in: path to the CSV file WITHOUT the ".csv" extension (it is appended here).

    Returns a list holding the first cell of every row, in file order. Rows that are
    blank, or whose first cell is empty/whitespace-only, are skipped.

    Raises FileNotFoundError if "<csv_in>.csv" does not exist.
    """
    csv_path = f"{csv_in}.csv"
    # newline='' is what the csv module asks for, so quoted fields that contain
    # line breaks are parsed correctly.
    with open(csv_path, mode='r', newline='') as csv_file:
        return [
            row[0]
            for row in csv.reader(csv_file)
            # A blank line parses to [] (row[0] would raise IndexError) and a padding
            # row such as ",," parses to ['', '', '']; neither carries a hashtag.
            if row and row[0].strip()
        ]

# Load the hashtags from "hashtags_03112024.csv" (list_of_hashtags appends the ".csv"
# suffix); the relative path is resolved against the current working directory.
hashtag_list = list_of_hashtags('hashtags_03112024')

In [29]:
def get_hashtags(phrase):
    """
    Takes a string phrase and returns a list of the hashtags it contains.

    The phrase is lower-cased first, so the returned hashtags are lower-case and
    do not include the leading '#'. Anything that is not a string (e.g. the NaN
    pandas uses for a missing description) yields an empty list.
    """
    if not isinstance(phrase, str):
        return []
    # A hashtag runs until the next whitespace OR the next '#', so tags written
    # back-to-back ("#fyp#news") are returned separately instead of as "fyp#news".
    return re.findall(r'#([^\s#]+)', phrase.lower())

In [82]:
def news_post_abundance(path, hashtags=None):
    """
    Scans every metadata CSV in a directory and collects the posts that use a news hashtag.

    path: directory containing the metadata CSV files. Each file must have the columns
        "video_id", "author_username", "video_description" and "suggested_words".
        Entries that do not end in ".csv" are ignored.
    hashtags: optional iterable of hashtags (without '#') that mark a post as news.
        Defaults to the notebook-level `hashtag_list`.

    Returns a tuple (stats_df, all_news):
        stats_df: one row per CSV file with "file_name", "full_length" (rows in the
            file), "news_length" (rows with at least one news hashtag) and "prop_news"
            (news_length / full_length, NaN when the file has no rows).
        all_news: the matching posts from all files, with the hashtags parsed from
            "video_description" by get_hashtags and the originating "file_name".
            Rows keep the index they had in their source file.
    """
    news_columns = ["video_id", "author_username", "video_description", "hashtags", "suggested_words", "file_name"]
    stats_columns = ["file_name", "full_length", "news_length", "prop_news"]

    # A set gives constant-time membership checks for every post.
    news_tags = set(hashtag_list if hashtags is None else hashtags)

    stats_rows = []
    news_frames = []

    # os.listdir order is arbitrary; sorting keeps the output reproducible between runs.
    for file in sorted(os.listdir(path)):
        if not file.lower().endswith(".csv"):
            continue

        meta_csv = pd.read_csv(os.path.join(path, file))
        meta_csv["hashtags"] = meta_csv["video_description"].apply(get_hashtags)

        meta_sub_df = meta_csv[["video_id", "author_username", "video_description", "hashtags", "suggested_words"]].copy()
        meta_sub_df["file_name"] = str(file)

        # Explicit boolean mask with dtype=bool so that a zero-row file still filters
        # rows (an empty object-dtype key would be read as a column selection).
        is_news = pd.Series(
            [isinstance(tags, list) and not news_tags.isdisjoint(tags) for tags in meta_sub_df["hashtags"]],
            index=meta_sub_df.index,
            dtype=bool,
        )
        news_df = meta_sub_df.loc[is_news]

        all_len = meta_csv.shape[0]
        news_len = news_df.shape[0]
        # The proportion is undefined for a file without rows; report NaN rather than crash.
        prop_news = news_len / all_len if all_len else float("nan")
        stats_rows.append([str(file), all_len, news_len, prop_news])

        if not news_df.empty:
            news_frames.append(news_df)

    # Build both frames once instead of growing them inside the loop.
    stats_df = pd.DataFrame(stats_rows, columns=stats_columns)
    all_news = pd.concat(news_frames) if news_frames else pd.DataFrame(columns=news_columns)

    return (stats_df, all_news)

In [83]:
# Repository root = two directory levels above the notebook's working directory.
# os.path.dirname/os.path.join are used instead of splitting and joining on '/',
# so the paths are also built correctly where the separator is not '/'.
path = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
# Directory holding the metadata CSV files to scan.
curr_path = os.path.join(path, "pre-processing", "metadata-csv")
curr_path

'/Users/tayaerogers/Documents/MEDSL/GitHub/CS315-Project-2/pre-processing/metadata-csv'

In [86]:
# news_post_abundance returns (stats_df, all_news); only the matched-posts frame is kept here.
news_by_hashtag = news_post_abundance(curr_path)[1]
# Flatten the per-post hashtag lists into one flat list, one entry per hashtag occurrence.
more_hashtags = news_by_hashtag["hashtags"].explode().to_list()

In [88]:
# Display the posts that have at least one hashtag from hashtag_list.
news_by_hashtag

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,file_name
242,7322519232544115998,jamesgetspolitical,Jan. 10th - a school #leftist #greenscreenvide...,"[leftist, greenscreenvideo, politics, usa, fyp...","y.s. barber, Gaza Is A Prison",pyktok-metadata-EM.csv
255,7321857300258508074,yayakityara,For educational purposes only. #news,[news],"Educational Content, Educational Videos, educa...",pyktok-metadata-EM.csv
362,7322260297367014661,trtworld,As celebrities took to the red carpet for the ...,"[gaza, palestine, goldenglobe]","golden globes, Red Carpet, golden globe outfit...",pyktok-metadata-EM.csv
433,7322120678109498657,historywithhenrietta,#fyp #olgaofkiev #foryoupage #historytime #his...,"[fyp, olgaofkiev, foryoupage, historytime, his...","joan of arc, history, History TikTok, History ...",pyktok-metadata-EM.csv
484,7299973627393363246,nycaoc,#aoc #alexandriaocasiocortez #nycaoc #aoc2028 ...,"[aoc, alexandriaocasiocortez, nycaoc, aoc2028,...",,pyktok-metadata-EM.csv
487,7318114819620474145,regularfaucettap,I wonder who is illegally annexing Palestinian...,"[leftist, politics]","regularfaucettap, bella hadid christmas tree, ...",pyktok-metadata-EM.csv
569,7318437057016155434,thatgoodnewsgirl,You can have Icelandic horses write your out o...,"[icelandichorse, horsesoftiktok, horse, horset...","iceland, outhorse email, Icelandic Horses, Hor...",pyktok-metadata-EM.csv
617,7311683075119926560,goalglobal,The moment French sixth division team US Revel...,"[football, soccer]",,pyktok-metadata-EM.csv
1096,7322138334816521503,jphamskitchen,Recipe ⬇️ Chà Bông | Ruốc ~ Pork Meat Floss Sp...,"[recipe, ngonqua, chabong, ruoc, porkfloss, ho...",,pyktok-metadata-EM.csv
1147,7321440701831466272,prodbyknm,them mitre balls were dangerous business 😭🙏🏾 #...,"[fyp, relateable, football, primaryschool, sch...","mitre ball, mitre football ball, football, Foo...",pyktok-metadata-EM.csv


In [80]:
# Save the flattened hashtag list as JSON under <path>/analysis/hashtag_initial/.
# Mode 'w' overwrites any existing file; open() does not create the directory.
with open(f"{path}/analysis/hashtag_initial/more_hashtags.json", 'w') as outf:
    json.dump(more_hashtags, outf)

In [90]:
# Export the matched posts; index=False leaves the DataFrame index out of the file.
# NOTE(review): the "hashtags" column holds Python lists, so it is presumably written
# as their string form and would need parsing when the CSV is read back - confirm.
news_by_hashtag.to_csv(f"{path}/analysis/hashtag_initial/news_by_hashtag.csv", index=False)  