In [1]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'Access*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

Loading required package: pacman

[1mRows: [22m[34m2274[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (1): title

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
“One or more parsing issues, see `problems()` for details”
[1mRows: [22m[34m452[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): keyword

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the colu

In [2]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "How to learn Spanish",
    "AnimalCrossing",
    "refugee camp",
    "swore allegiance to ISIS",
        "Berlin",
        "Fintech",
        "Female privilege",
        "Expat",
        "Alibaba",
        "Schenectady",
        "in the Caribbean",
        "Jüsto",
        "WayV",
        "tech firms",
        "Parkdale",
        "Xingsheng Youxuan",
        "initial public offering",
        "WeChat",
        "Latin America",
        "LI",
        "outdoor dining",
        "San Gabriel Valley",
        "ATL",
        "Prenzlauer",
        "Cardi B",
        "White Plains",
        "Denmark",
        "Yuan",
        "Jamaican Man",
        "N.J.",
        "NJ",
        "New Jersey",
        "France",
        "LA",
        "MI",
        "#auspol",
        "IPO",
        "spic",
        "Utah",
        "Petit Bourg",
        "#Poughkeepsie",
        "baby butt wipes",
        "dog",
        "misbehaving",
        "Boulder",
        "SF",
        "Mexican women stocked",
        "Startup",
        "San Leandro",
        "simulation",
        "vitiligo",
        "newsletter",
        "#JerseyCity",
        "palestine",
        "San Francisco",
        "Montana",
        "Australia",
        "Ann Arbor",
        "USED DRESS"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    202
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(50) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_access.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
#######################################
#######################################
#######################################
#######################################

total raw records: 2274 
records after preprocessing: 503 
percent records remaining: 22%

content
<chr>
"Today, continuing the efforts of Consul General to bring aid and relief to our diaspora in the tri-state area, with the help of the organization Templo Emmanuel of East Harlem, we delivered food pantries, thanks to the support of Mexican corporations."
ooooh thats smart—wholesale to DTC. I keep waiting for vertical on demand startups (my homies and i thought about an on demand indian grocery food startup once)
RT : Thank you to for operating your Halal Food Pantry— a critical community resource during #Ramadan amid the #COVID19 pandemic— providing quality meals while helping distribute masks to those who need them.
"1. Kal Suarez An OC I created a year ago for the fun of it. • He/him • (trans) Male • 21 years old • Puerto Rican-Colombian heritage • Works at a grocery store (dreads it, but he needs the money xD)"
"RT : At the start of #Ramadan announces 500,000 free halal meals at DOE sites and food pantries"
"i went out yesterday the second i read it was a possibility! the indian grocery store in my neighborhood is a lifesaver, i was in & out in 15 min with all i needed while fairway and trader joe’s had lines out the door"
"Just gave an end of year donation to Kosher Soup Kitchens, doing the vital work of feeding the needy. Due to the crises this year, demand for their services increased by 500%! You have until midnight to make a tax deductible donation for 2020."
the nectar from the dominican grocery store + tequila >>>
"“Parents at Vaughn Occupational High School are organizing a food pantry, sending out urgent messages in Spanish and Polish, and scrambling to figure out what child care looks like during a 10-day voluntary quarantine.”"
"please some Bangladeshi, indian and pakistani groceries"
