In [1]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'FoodJustice*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)



exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')




############
## Reverse filtering
############

data.preprocessed <- data.preprocessed %>% filter(
    grepl('Food|Nutrition|Meal|meals|nutritional|diet|dietary', content, ignore.case=TRUE)
)

Loading required package: pacman

[1mRows: [22m[34m20276[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (2): title, noise_category

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m14973[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain

In [2]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "Bail reform",
        "Join us at 7:00PM",
        "@Freakonomics",
        "@JInterlandi",
        "criminal justice",
        "application deadline",
        "#WorldSleepDay",
        "#reproductive",
        "One can only hope.",
        "Memoir of a Race Traitor",
        "#WarCrimes",
        "#GetMePPE",
        "#KHive",
        "policing",
        "desensitized to death",
        "#China",
        "Reproductive Health Care",
        "Nigeria",
        "Alteon Health",
        "International Women’s Day",
        "warning about for WEEKS",
        "reproductive freedom",
        "@MAPublicHealth",
        "rich & famous",
        "NYPD",
        "#BailReform",
        "#bailreform",
        "childbirth",
        "@EarthDayNewYork",
        "you better believe capitalism",
        "Eduardo Bonilla-Silva",
        "#Momnibus",
        "Warren dropped out",
        "Maternal Health",
        "North Carolina",
        "imposter syndrome",
        "voter suppression",
        "suicide",
        "all-encompassing",
        "@rachelpowers",
        "#BlackTwitter",
        "protesters",
        "#AMJoy",
        "Georgetown",
        "#RBG",
        "S.F.",
        "ACP Online",
        "Texas Monthly",
        "Africa",
        "reparations",
        "paid sick leave",
        "diet racists",
        "confederate monuments",
        "cardiologist",
        "Rage Becomes Her",
        "satisfying her husband",
        "Register for free",
        "Watts",
        "food for thought",
        "Brazil",
        "#cva",
        "DM me",
        "Reince Priebus",
        "Albany",
        "breast cancer",
        "diet racism",
        "#AbolishPolice",
        "reliant on China",
        "UK",
        "We must flip the senate",
        "Dolphins",
        "Australia",
        "Manchester",
        "Poughkeepsie",
        "diet white",
        "Justice League",
        "#PuertoRico",
        "Hawaii",
        "Food Tech Sector",
        "Arizona",
        "diet of justice",
        "Philly",
        "racist demon food",
        "soap bars",
        "#premed",
        "Toronto",
        "Vegetable Joe",
        "complications related to pregnancy",
        "#curriculumdesign",
        "Dog food",
        "diet devilz",
        "DC",
        "Detroit",
        "Houston",
        "Louisville",
        "USC",
        "MO",
        "#ProudtobeGIM",
        "Nashville",
        "football season",
        "Oregon",
        "Jacksonville",
        "MA",
        "Holyoke",
        "Wayne Ford"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data.preprocessed %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    2022
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(50) #%>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_food_justice.csv');

#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 13878 
records after preprocessing: 10964 
percent records remaining: 79%

content
<chr>
Trader Joe's backtracks on ditching 'racist' food brands
"RT : Our city needs access to healthy food, now more than ever. Thank you for joining the fight for food justice. We couldn't do it without you. 🙏 #ThisFoodFights"
RT : Jobs have been lost and people are hungry. The NW Bronx Food Justice Project is right there helping them get food. Sign up here for our newsletter to know more: #nwbronxfoodjustice #foodaccess #bronxcommunityresources #foodinsecurity
"RT : Late last week, 87 U.S.-based farmer organizations and allied agroecology, farm, and food justice groups in delivered a solidarity statement in support of the #FarmersProtest. Read more about this incredible act of solidarity via !"
Burning anything is racist—like this racist blackface food image.
"Black & Brown fast-food workers cannot work from home, we aren’t afforded full-time hours in order to take care of our families, and we cannot take sick days. We are calling for economic and racial justice for fast food workers across the nation! #ProtectAllWorkers"
Food is racist now? I’ll add it to the list.
"Reporter: “Um...is it...Um...is it racist to call Chinese Food...Chinese Food?"""
RT : We hate racists and politicians but let food snobs walk amongst us like it’s no big deal.
Hey CNN is Chinese food racist? How about Mexican food?
