In [3]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'GroceryStoresBrands*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

[1mRows: [22m[34m17983[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (2): title, noise_category

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m11742[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...


In [4]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "this is an example exclusion keyword",
        "exclusion keyword are case sensitive and literal",
        "guns",
        "newest commercials",
        "blood tainted air",
        "Kirk Franklin",
        "Big 10 Championship",
        "Vekilinin",
        "Atlanta supermarket",
        "Trump'a",
        "Saydam",
        "Soruyorum",
        "Yusufsuz",
        "bad day too",
        "In another universe",
        "#mukbangvideo",
        "farm in central Italy",
        "Massachusetts",
        "sacrifice my grandmother",
        "$ADRNY",
        "COVID-19 testi",
        "You asked me to dm you",
        "Coronavirüse",
        "Zoom shiva",
        "Chicago",
        "10 YEAR OLD ME",
        "NANUET",
        "I am not in NY",
        "#seattle",
        "Instafart",
        "#SouthAfrica",
        "beauty supply",
        "Hastings-on-Hudson",
        "Springville",
        "laundromat",
        "China",
        "#LakeGeorge",
        "Brentwood",
        "Nigeria",
        "police dick",
        "Human Trafficking Awareness Day",
        "Paolo Coelho",
        "#ChicagoProtests",
        "HETERO",
        "Switzerland",
        "UK",
        "Oprah",
        "Cape Cod",
        "@BoutiqueDesign",
        "Yulin",
        "Minneapolis",
        "Sainsbury’s",
        "#emmetttill",
        "Alexandria, MN",
        "Haemi",
        "Failed Startups",
        "N.O.R.E.",
        "Jersey City",
        "los Postmates",
        "Aysha",
        "£",
        "$shop",
        "NJ",
        "AWS",
        "@Airbnb",
        "#OleMiss",
        "#jobs",
        "Robinhood",
        "#TheBachelorette",
        "D.C. Attorney General",
        "Miami",
        "Najmul",
        "side hustles",
        "SOCIALLY DISTANCED VIGIL",
        "#jobseeker",
        "California",
        "apply",
        "Pilar Grace",
        "Quick cash",
        "1st class idiots",
        "#Hamilton",
        "fixing computers",
        "ultimate holiday stop",
        "food crops",
        "Prop 22",
        "U.S. Book Trade"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    1500

    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(10) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_grocery_stores_brands.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 81948 
records after preprocessing: 64813 
percent records remaining: 79%

content
<chr>
omg yes I was just in aldi and some lady had LITERALLY 10 bags of family size chips !! why !
Same thing at shoprite
"Stop by the meat market. Cheesy eggs, creamy grits & toast !"
"This CNN piece is hilarious. It says “Instacart changes policy after CNN Business report” — when *everyone* was reporting on this. It also implies criticism of the policy because others have shorter windows, not noting Amazon Prime Now’s is also 24 hours."
"RT : When ride-hailing heavyweights Uber and Lyft and delivery giants Grubhub and Instacart began making shared rides and meals available with a few taps on a smartphone, they transformed the way people work, travel and get food delivered to their homes."
Aldi commits to reducing volume of plastic packaging by 50% by 2025
He's a business owner?! That's great!
Instacart please help your girl 😭
RT : Crooks dug a tunnel into the Shoprite Liquors at Newtown Junction and stole booze worth R300 000 during the night. #Level3
"RT : latest Behind the News, ⁦⁩ breaks into the hidden abode of production in the time of covid-19 w/⁦⁩ & ⁦⁩ on Amazon, Whole Foods, & Instacart Strikes + ⁦⁩ takes on GND mobilization.🔥🔥"
