In [1]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'Struggle*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

Loading required package: pacman

[1mRows: [22m[34m18518[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (1): title

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m18675[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url

In [2]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "#FamiliesFirst",
        "Gun violence",
        "sick leave",
        "coronavirus testing",
        "Zimbabwe",
        "baon",
        "#FamiliesFirstBill",
        "Eva Mendes",
        "economic package",
        "mail carrier",
        "Evangelical",
        "Miami",
        "Obamacare",
        "FEMA",
        "Americans",
        "Voter ID",
        "Freekibble",
        "Congress",
        "TARP",
        "supply chains",
        "Hoboken",
        "Michael Bloomberg",
        "Bangkok",
        "Florida",
        "farmers",
        "irene is so rich",
        "Arizona",
        "#MayThe4th",
        "Africa",
        "Ecuador",
        "Toms River",
        "Jungkook",
        "Italians",
        "#HKers",
        "impeachment",
        "unfettered capitalism",
        "Senate",
        "@GovMurphy",
        "California",
        "LA",
        "Lebanon",
        "Mirai Nagasu",
        "Hobby lobby",
        "@BBCIndia",
        "North Carolina",
        "Michael Jackson",
        "NJ",
        "Fort Lauderdale",
        "Dallas",
        "#JoeBidenIsARacist",
        "Show of Lies",
        "Cultural Revolution",
        "LOSER",
        "hobi",
        "LI",
        "#AltRight",
        "#CommercialRealEstate",
        "@bonappetit",
        "@DanCrenshawTX",
        "#Obamagate",
        "#HeroesAct",
        "welcome the stranger",
        "Amazon jobs",
        "parked illegally",
        "#Georgia",
        "Mitch McConnell",
        "progressive or conservative position",
        "Jackson, Miss",
        "own stock",
        "@RonJohnsonWI",
        "Chinese wife",
        "Elmira",
        "Tesla",
        "#bcorporation",
        "Self Anointing",
        "Long Island",
        "Ron Johnson",
        "super bowl",
        "my struggle food",
        "Jim Jordan",
        "#EconJustice4All",
        "#GivingTuesday2020",
        "#VoteBlueToEndThisNightmare",
        "News Corp",
        "Adigrat",
        "shut down the network",
        "Boeing",
        "#PPP",
        "west coast",
        "Police",
        "Sen Kennedy",
        "family recipe",
        "hospitals are full",
        "#TrumpVirus",
        "#Charlotte",
        "pro life",
        "Elliott",
        "golfing",
        "DC",
        "food industry",
        "SF",
        "Cinnamon Toast Crunch",
        "cut restaurants a tax break",
        "PUTIN",
        "fire every time",
        "#Albertans",
        "District Judge Howell",
        "pro-life",
        "hard time with margins",
        "choose one struggle meal",
        "restaurants are struggling",
        "audiobook",
        "Jesus",
        "#Tripoli",
        "#PMQs",
        "#Hartford",
        "GoogleOrg",
        "live off the system",
        "#BetterWay4Buses",
        "Cape Town",
        "Federal judge",
        "Nigerian households",
        "#HerdMentality",
        "Venezuela",
        "#Amherst",
        "companies",
        "Companies",
        "subscribe",
        "GA",
        "UK",
        "JACK FROSS",
        "Bristol",
        "#MorningJoe",
        "Rockland",
        "vacation days",
        "dementia",
        "#COVID_19uk",
        "Missouri",
        "#TrumpGOPGenocide",
        "YouTube",
        "horses",
        "klapped",
        "Coach Thibs",
        "Lindsey Graham",
        "Syrians",
        "Wisconsin",
        "business suport",
        "SPACE FORCE",
        "Space Force",
        "Google",
        "Las Vegas",
        "struggling local businesses",
        "#ukulele",
        "Hackensack",
        "Texas",
        "#Hollywood",
        "New Jersey",
        "sick donkeys",
        "#Minneapolis",
        "#BirdTwitter"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    2022
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(50) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_struggle.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 148345 
records after preprocessing: 48299 
percent records remaining: 33%

content
<chr>
that’s a struggle meal?? i just ate 3 day old kraft mac and cheese
"RT : The families forced to the streets in the middle of the pandemic, the bereaved relatives, the unemployed, the hungry, the brutalized, and the struggling masses are waiting to be organized together to build the power we need. The power of our people is our only constant"
"It’s not elitist when you work in the arts. It’s a full time job. Millions of artists are also struggling for food, work, and healthcare. We are deeply struggling and will be the last to return. It’s unfair of you to not acknowledge that - and disappointing. #artsworkers"
"RT : How New York will pair food banks in need with struggling farms, which have excess product"
"RT : Vote for Kelly Loeffler and David Perdue if you’d like to be homeless, hungry, and sick."
"RT : Ben Roethlisberger is in a great mood this morning, having fun with the media, including some where who are having struggles getting their live feed working on the zoom call."
"I’m living in nyc and I lost my job because of the pandemic going on, I’m struggling to help my mom with food and our rent is back up. We’ve been selling items on eBay and other sites just to try to get some extra money but it’s not helping because the bills just keep piling up"
"RT : When #DonaldTrump walked away from relief negotiations, he abandoned hungry families, shuttered businesses, and struggling students. Our nation’s pain is taking a backseat to his ego."
A wonderful night - Family Feud Night and youth preparing food for Midnight run to feed the homeless in New York City. Lord be praised!
RT : Ya’ll like breakfast food? ... 😕 i struggle.
