In [1]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'FoodPrices*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

Loading required package: pacman

[1mRows: [22m[34m19022[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (2): title, noise_category

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“One or more parsing issues, see `problems()` for details”
[1mRows: [22m[34m9663[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","

In [2]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "seasonings",
        "self care",
        "just got carded",
        "#Hwiyoung's",
        "back to the car",
        "chefs kiss",
        "Klu Klux Klan",
        "Allergy test",
        "Car fumes",
        "pups",
        "too much sauce",
        "man takes car",
        "face masks",
        "relaxing weekend",
        "#OurOceans",
        "Mexico",
        "car food lol",
        "surprise breakfast",
        "fast food career",
        "shame eat",
        "AWOL",
        "quiet but nice",
        "even cooking",
        "brave Americans",
        "walk up",
        "Tattoos",
        "UK's",
        "Quadra Island",
        "bad habit",
        "documentary",
        "Tax the rich",
        "cat",
        "Give a follow",
        "food poisoning",
        "mf's",
        "dog",
        "dogs",
        "farmers",
        "too much sauce",
        "India",
        "singing",
        "sing",
        "on the list",
        "food parties",
        "caramel",
        "daydreaming",
        "food processor",
        "tax cuts",
        "food snob",
        "cow",
        "actually cares",
        "organic industry",
        "mislabeling",
        "barbaric",
        "air fryers",
        "laid to rest",
        "Wash your hands",
        "shoes",
        "Junta's",
        "#Karen",
        "second career",
        "Nigerian",
        "montreal",
        "gaga",
        "Sweepstakes",
        "win big",
        "jewelry",
        "racist insurrection",
        "college people",
        "date",
        "Tell Ya Friends",
        "charge facebook",
        "zoo",
        "Zumba",
        "collectible",
        "Mayo Clinic",
        "Ate...",
        "food dyes",
        "Cyber Truck",
        "Food Preservation Tray",
        "food industry",
        "ex",
        "too much food",
        "gift cards",
        "Massachusetts",
"sacrifice my grandmother",
"$ADRNY",
"COVID-19 testi",
"You asked me to dm you",
"Coronavirüse",
"Zoom shiva",
"Chicago",
"10 YEAR OLD ME",
"NANUET",
"I am not in NY",
"#seattle",
"Instafart",
"#SouthAfrica",
"beauty supply",
"Hastings-on-Hudson",
"Springville",
"China",
"#LakeGeorge",
"Brentwood",
"Nigeria",
"police dick",
"Human Trafficking Awareness Day",
"Paolo Coelho",
"#ChicagoProtests",
"HETERO",
"Switzerland",
"UK",
"Oprah",
"Cape Cod",
"@BoutiqueDesign",
"Yulin",
"Minneapolis",
"Sainsbury’s",
"#emmetttill",
"Alexandria, MN",
"Haemi",
"Failed Startups",
"N.O.R.E.",
"Jersey City",
"los Postmates",
"Aysha",
"£",
"$shop",
"NJ",
"AWS",
"@Airbnb",
"#OleMiss",
"Robinhood",
"#TheBachelorette",
"D.C. Attorney General",
"Najmul",
"SOCIALLY DISTANCED VIGIL",
"#jobseeker",
"Pilar Grace"

        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    99
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(20) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_food_prices.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 78554 
records after preprocessing: 30857 
percent records remaining: 39%

content
<chr>
"RT : And early in the 20th century, countless Americans died of food borne illnesses. We created the FDA and set forth regulations to govern food manufacture, distribution and sale. Today, death from these illnesses is super rare in the US."
"RT : I’ve seen tweets floating around about this, but here’s another reminder to avoid grocery shopping today through April 3rd if you are able to, because EBT cards (formerly food stamps) are reloaded at the beginning of the month. Allow folks with EBT cards to access full shelves!"
2020 Hustles: ✔️ Sports / Trading cards ✔️ Streaming ✔️ Food delivery ✔️ Garage “scaling” 🤔 ___________ What else would you add?
"RT : Food-Delivery Apps Are Charging Businesses Too Much In Fees, Downtown Alliance President Jessica Lappin Says"
I use Instacart and had a delivery person tell me they were shamed for wearing their mask in the car. They don’t want to put it on and take it off 200 times a day with peoples food in the car.
"Moms are health care workers, janitors, grocery store and market cashiers, and delivery service people, child care and home care workers. They are essential to our country and community and must have the support they need! #MothersDay #FrontlineMoms #equalpay"
"RT : Today, we distributed 1,600 bags of food to New Yorkers in the South Bronx to commemorate #HolyThursday. This food was blessed by during the #PalmSunday mass at . joined us during our ""grab and go"" program."
When I moved I thought I would miss my friends and just everything about NY and I literally couldn’t care less about anything but the food right now. I had no idea I gave this much of a fuck about food 😢
"(Disregard punctuation errors. Saving character spaces so you all can save oceans😂). As the ascending food chain gets affected, the near 3 billion of us that rely on the ocean’s eco systems for food and careers start to see evidence of long term damages."
"RT : InStyle spoke to five women on the front lines of the coronavirus pandemic: a nurse, a flight attendant, a home caregiver, a pharmacy technician, and a grocery store cashier. Here are their stories, and how you can help."
