In [3]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'FoodPantries*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

[1mRows: [22m[34m12968[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (2): title, noise_category

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m9323[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...
[

In [4]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "Oakland",
        "@LVMH",
        "Spicy tenders",
 "Wholesome-simple-convenience",
        "podcast to pass knowledge",
        "Mr Food Test Kitchen",
"100,000 free face coverings",
"Poughkeepsie",
"Rosewood",
        "Mobi Free True",
        "Chinese Kitchen Garden",
        "#dealoftheday",
        "EBOOK",
"Prime Pantry",
"Greater Boston",
"Bro stfu",
        "Giving Kitchen",
        "Atlanta",
        "Elana's Pantry's",
        "EPUB",
        "Caracas",
        "Mbarara",
"@mamiskitchenio",
        "Honolulu",
        "Eduardo Fernandez",
        "REDONDO BEACH",
        "Allentown",
"NYU Stern 20 years",
"Future of Kitchen Design",
"Gilead",
        "libertarians"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    2034
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(10) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_food_pantries.csv');

#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 111783 
records after preprocessing: 87426 
percent records remaining: 78%

content
<chr>
Spent the day handing out free groceries and pantry items as a volunteer at Soup Kitchen Network. You guys are doing incredibly important work and it was an honor to join you today!
Market Outlook Growing consumer preference towards flavored food has fuelled the demand for coffee emulsion in recent years. The coffee emulsion is the liquefied extracts of coffee beans which provide the aromatic flavor of coffee. The coffee emulsi...
"is on a mission to reduce food insecurity by developing models that distribute culturally sensitive meals to nourish healthy and thriving communities. They currently deliver 51,000 meals to those in need. Read Our Blog"
RT : Thank you Diana Lee & for helping RLC distribute food for those in need! #foodinsecurity #hunger #CoronavirusOutbreak
"As food banks are running out of food, the #MedgarEversCollege #CougarCountry Food Pantry continues to open once a week, helping to feed over 500 students and their families during the #COVID19 pandemic!"
"RT : Thank you to all of our volunteers keeping the community fed during this challenging time! We have food distribution at Bedford Hills Elementary School & on Mondays, Wednesdays & Fridays (please note that this week distribution is on Thurs instead of Fri). 🦊❤️"
"""Love In Action"" Food Distribution via"
"Plagues could be even more lethal before modern medicine and epidemiology.S ince at least the time of ancient Greece and Rome, massive outbreaks of disease have repeatedly ravaged the human race. Until quite recently, epidemics of cholera, smallpox, ..."
"A middle school in Linden, New Jersey has converted its Little Free Libraries into “Little Free Pantries” to help those with food insecurity during the pandemic."
"Back a while, ran a program, partnering with NYC restaurants to distribute unused food to the needy, iirc. 🙏🏽❤🙂"
