In [1]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = '2EBT*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

Loading required package: pacman

[1mRows: [22m[34m19714[39m [1mColumns: [22m[34m49[39m
[36m‚îÄ‚îÄ[39m [1mColumn specification[22m [36m‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (2): title, noise_category

[36m‚Ñπ[39m Use `spec()` to retrieve the full column specification for this data.
[36m‚Ñπ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
‚ÄúOne or more parsing issues, see `problems()` for details‚Äù
[1mRows: [22m[34m19399[39m [1mColumns: [22m[34m49[39m
[36m‚îÄ‚îÄ[39m [1mColumn specification[22m [36m‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚

In [4]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "Bitch Yes",
        "SNAP like WTF",
        "Cum pay and play",
        "Meg & Cardi",
        "Ooooooohhhh SNAP!",
        "Show$",
        "The wild SNAP!",
        "#BTS_Dynamite",
        "Block Tiktok unless China",
        "SOONYOUNG",
        "Hot guy in electronics",
        "Amb. Dian Djani",
        "JAYE_HARDBODY",
        "COLD SNAP",
        "Wyoming",
        "Michiganders",
        "#KarmaAndDarwin",
        "#MonicavsBrandy",
        "#Girlgroup",
        "(YOUNG BLOODZ)",
        "virtually slap this asssss",
        "JAZMINE",
        "PREMIUM SNAP",
        "$oorahusa",
        "SNAP LIKE THAT",
        "Sexual eruption",
        "ICY!!!!!! AHHHH",
        "Reef Life",
        "PURAH",
        "Nebraska",
        "Muthafudgin",
        "#iHeartFestival2020",
        "SUBSCRIBE TO",
        "WONWOO",
        "SNAP SHOT",
        "New video",
        "Masks work on Grandma",
        "RAPPER HARUTO",
        "Mom Voice",
        "#personalitykey",
        "Preachüëèüëèüëè",
        "BREATH OF THE WILD",
        "NUGGETS WIN GAME 7",
        "SNAP this HARD",
        "BIG HOMIE ROCKING",
        "DAT MEAN SHE WANT NIGGAS",
        "Just Was Chillen",
        "Missy's the poo, so take a big whif",
        "COMMENT YOUR SNAP",
        "SNAP JUMPS 20%",
        "üëª",
        "ZE ADD FUN FACT",
        "OHHHHH SNAP",
        "SNAP IM WHEEZING",
        "BEND AND SNAP",
        "$FSLY",
        "JALEN HURTS",
        "KIM JISOO",
        "SNAP EMOTE",
        "Less antitrust issues?",
        "safari camp prank.",
        "ABOUT TO FUCKING SNAP",
        "SNAP IT SHUT. LISTEN TO LESBIANS.",
        "Deutsche Bank sees ad",
        "AUTO PASSING",
        "JAZMINE",
        "LEMME TELL U ON SNAP",
        "BALLGAME",
        "CUM",
        "Cody",
        "$oorahusa",
        "OPEN MY SNAP",
        "#BTSARMY",
        "THANOS",
        "slow tf down",
        "Cult of Snap",
        "29 years later",
        "SNAP YA FINGERS",
        "Raffle animation",
        "SNAP AND SUBSCRIBE",
        "CIRCUIT BREAKERS",
        "#1 BEST FRIEND",
        "Ohhh SNAP",
        "DM OR SNAP",
        "Poshmark",
        "wig game up",
        "DO NOT piss me off",
        "SNAP SHOT",
        "ATINYS",
        "BEAR/BODY",
        "SIIIIIS",
        "SNAP THE NECKS",
        "SNAPCHAT",
        "SELENA",
        "RYAN GET SNAP",
        "WEED On SNAP",
        "FB +$5",
        "DONT YOU SNAP",
        "WHYD YOU SNAP",
        "CHAERYEONG",
        "IPRAY",
        "Ooooooohhhh SNAP!",
        "SNAP TOGETHER",
        "Oooo SNAP!",
        "DRINK BEER",
        "KPOP",
        "Choreo",
        "DELETING SNAP",
        "SNAP MEMORIES",
        "SWAN-HANDLED",
        "YOU DIDN‚ÄôT SNAP",
        "SNAP ME",
        "DIDNT SNAP ME",
        "punching bag",
        "perfectionism hits HARD",
        "premium SNAP",
        "$SNAP",
        "SNAPCHAT",
        "Snapchat",
        "Pinterest",
        "oh, SNAP",
        "Morgan Stanley",
        "SNAP fitness",
        "SNAP, CRACKLE, POP",
        "XXX content creator",
        "OOOOOH",
        "SNAP the ball",
        "SNAP EM",
        "BOOM SNAP CLAP",
        "#IndependentBookstoreDay",
        "Oh - SNAP",
        "SOMEONE TO SNAP",
        "Ooooop SNAP",
        "OOOHH SNAP",
        "Minnesota",
        "S&P 500",
        "Ohhhh SNAP",
        "earrape",
        "#KOYA",
        "SNAP CHAT",
        "ADD ME ON SNAP",
        "CHECK SNAP",
        "abortions",
        "ADD NO SNAP",
        "SNIP SNAP",
        "SNAP GC",
        "AT HIS SNAP",
        "auto saves",
        "Acme",
        "Virtual Console",
        "KY",
        "SNAP this hard",
        "CRACKLE",
        "SNAP STORY",
        "PRIVATE SNAP",
        "ON SNAP DO THAT",
        "Michigan",
        "ElectraRx",
        "GINGER SNAP",
        "PA",
        "TSLA",
        "SNAP AT",
        "COMMENT ON SNAP",
        "NJ",
        "sfx",
        "Roe v Wade",
        "ON GOD",
        "THIS ON SNAP",
        "MA",
        "SNAP STREAK",
        "CHECK UR SNAP",
        "CHECK SNAP",
        "CHECK YOUR SNAP",
        "Nevada",
        "earnings gapper",
        "Arkansans",
        "SNAP SO HARD",
        "Resokute",
        "TMZ",
        "TWTR",
        "COLORADO",
        "MN",
        "THE SNAP ü§¨",
        "SNAP LIKE THIS",
        "SNAP BEATS",
        "STILL USES SNAP"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    2022
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(25) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_EBT_2.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 77035 
records after preprocessing: 60755 
percent records remaining: 79%

content
<chr>
"RT : Trump Budget Would Fray Social Safety Net. Trump/GOP believe n Socialism for corporations, rugged individualism for 99%. Proposals for cuts in food stamps, housing & education could exacerbate inequality. Biden‚Äôs platform is progressive. #Biden2020"
What you know about buying hot food with ebt lmaooo you ain‚Äôt ever have ebt ü§£ü§£
"And there's a very new initiative, so just a few grocery stores so far, where every dollar spent on fruit, vegetables or beans with SNAP/EBT is matched:"
imagine having to survive on food stamps and logging on to see people (who are 1 emergency/inconvenience away from your situation) saying that your boss deserves not b- but t-trillions.
BIG SNAP!
"RT : Shout out to Judge Howell for blocking Trump‚Äôs rule that would‚Äôve kicked 700,000 people off food stamps during a global pandemic! Trump shows us again he cares nothing about the least of these. Thank you & for leading the way. We must keep fighting!"
can we address the issue on why folks with EBT cards can't order food from big chains online. People receiving new EBT cards w/no expiration date or cvc #... but yet we are supposed to #STAYATHOME these are people who are disabled more prone to get sick. #Quarantine
"I'm sure you meant well, but if I'll that should happen most of Donald Trump's base would lose their Food Stamps, Welfare Checks, and Free Medical care."
"thanks for allowing SNAP/EBT for groceries with and #AmazonPantry, but could you please add ? When I try to check-out, I do not see my SNAP/EBT as payment option."
"Not only does the HEALS Act not include an increase in funding for SNAP, but it also does not expand Pandemic-EBT benefits, which has been a lifeline for families who rely on free and reduced-price school meals. #PandemicEBT"
