## Initial account analysis
Tayae Rogers, 3/12/2024

In [3]:
import os
import csv
from collections import defaultdict
import pandas as pd
import re
import json

In [4]:
def list_of_news_accs(csv_in):
    """
    Takes news acc CSV (our shared Google Sheets file) and returns a list of news accounts
    """
    file = f"{csv_in}.csv"
    news_accs = []
    with open(file, mode ='r')as file:
        csvFile = csv.reader(file)
        for lines in csvFile:
            news_accs.append(lines[0])
    return news_accs

In [5]:
wd = "/Users/tayaerogers/Documents/MEDSL/GitHub/CS315-Project-2" # put personal wd ending in "/CS315-Project-2" here
account_wd = wd + "/analysis/news_accs/"
news_accs = list_of_news_accs(os.path.join(account_wd, 'news_accounts_03112024'))
news_accs

['nytimes',
 'washingtonpost',
 'cnn',
 'pbsnews',
 'abcnews',
 'msnbc',
 'cbsnews',
 'briantylercohen',
 'cbseveningnews',
 'abcworldnews',
 'nbcnews',
 'yahoonews',
 '7newsq',
 '',
 'skynews',
 'bbc',
 'forbes',
 'usatoday',
 'gbnews',
 'varietymagazine',
 'sophiasmithgaler',
 'maxfostercnn',
 'nytopinion',
 '60minutes',
 'cbsmornings',
 'cbssundaymorning',
 'vicenews',
 'abcnewslive',
 'gma',
 'newsweek',
 'underthedesknews',
 'theconversation',
 'nowthisimpact ',
 'dailymail',
 'davebondy',
 'aljazeeraenglish',
 'vicenewsdocs',
 'c4news',
 'itvnews',
 'niickjackson',
 'abc7la',
 'reuters',
 'npr',
 'nowthis',
 'brutamerica',
 'thetelegraph',
 'vox',
 'buzzfeednews',
 'taylorlorenz',
 'wallstreetjournal',
 'latimes',
 'natgeo',
 'vicderbyshire',
 'pinknews',
 'eveningstandard',
 'cleoabram',
 'sandragathmann',
 'tldrnews',
 'morningbrew',
 'semafor',
 'thepocketreport',
 'theeconomist',
 'thegarbagequeen',
 'nbcnightlynews',
 'benjaminzamoratven',
 'thenewsmovement',
 'elliecolts',


## Return dataframe with posts whose poster matches ours news account list

In [5]:
def get_hashtags(phrase):
    """"
    Takes a string phase and returns a list of hashtags
    """
    if isinstance(phrase,str):
        return re.findall(r'#(\S+)', phrase.lower())
    else:
        return []

In [6]:
def news_poster_abundance(path):
    all_news = pd.DataFrame(columns = ["video_id", "author_username", "video_description", "hashtags", "suggested_words", "file_name"])
    stats_df = pd.DataFrame(columns = ["file_name", "full_length", "news_length", "prop_news"])
    prefixed = [filename for filename in os.listdir(path) if filename.startswith("Sec2Gr3")]
    
    for file in prefixed:
        meta_csv = pd.read_csv(os.path.join(path,file))
        meta_csv["hashtags"] = meta_csv['video_description'].apply(get_hashtags) # doing this just in case we want the hashtag data later

        meta_sub_df = meta_csv[["video_id", "author_username", "video_description", "hashtags", "suggested_words"]].copy()
        meta_sub_df["file_name"] = str(file)
        news_df = meta_sub_df[meta_sub_df["author_username"].isin(news_accs)]

        allLen = meta_csv.shape[0]
        newsLen = news_df.shape[0]
        propNews = newsLen/allLen
        stats_df.loc[len(stats_df.index)] = [str(file), allLen, newsLen, propNews]  

        all_news = pd.concat([all_news, news_df])
    
    return (stats_df, all_news)

In [7]:
curr_path = f"{wd}/pre-processing/metadata-csv"
curr_path

'/Users/tayaerogers/Documents/MEDSL/GitHub/CS315-Project-2/pre-processing/metadata-csv'

In [8]:
news_by_poster = news_poster_abundance(curr_path)[1]
news_by_poster

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,file_name
341,6988185257426128133,gma,Morgan Wallen is speaking out to address using...,"[news, morganwallen]","morgan wallen racist video, morgan wallen, mor...",Sec2Gr3_77777.csv
204,7283179469001461035,nbcnews,"After wrapping ""#Scandal,"" #KerryWashington's ...","[scandal,"", kerrywashington's, dna]","shera kerry washington, kerry washington somal...",Sec2Gr3_77217.csv
353,6948100100992322821,cbsnews,A baby kangaroo is rescued from its mother’s p...,"[news, australia]","kangaroo pouch, Kangaroo, baby kangaroo, Anima...",Sec2Gr3_77217.csv
610,7307673039922105643,wired,"Elmo is here to set the record straight, once ...","[elmo., sesamestreet, newyorker, newyorkaccent...","elmo, elmo funniest moments, elmo balsamicvine...",Sec2Gr3_77217.csv
1343,7306609553305652510,nbcnews,Former President #JimmyCarter and former first...,"[jimmycarter, rosalynncarter]",,Sec2Gr3_77217.csv
...,...,...,...,...,...,...
8228,7288868567003614510,todayshow,#taylorswift has arrived at the #tserastourmov...,"[taylorswift, tserastourmovie]",,Sec2Gr3_74721.csv
8283,7267091322559843616,bbcnews,One Hawaii resident says some tourists are car...,"[lahaina, maui, hawaii, hawaiifire, wildfire, ...","hawaii tourists, maui tourists, hawaii, Lahain...",Sec2Gr3_74721.csv
8337,7287837532161625377,middleeasteye,Palestinian ambassador to the UK blasts the BB...,"[bbc, israel, palestinian, ambassador, gaza, l...",,Sec2Gr3_74721.csv
8891,7285777286316510470,ctvnews,If you’re stuck in traffic on Hwy. 400 Tuesday...,"[breakingnews, truck, highway, ontario, gta, t...",,Sec2Gr3_74721.csv


In [9]:
news_by_poster.to_csv(f"{wd}/analysis/news_accs/news_by_poster.csv", index=False)  