In [3]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = '1SchoolFood*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')


############
## Reverse filtering
############

data.preprocessed <- data.preprocessed %>% filter(
    grepl('food|Nutrition|Meal|meals|nutritional|diet|dietary|lunch|lunches|breakfast|breakfasts|eat|feed|feeds|eats', content, ignore.case=TRUE)
)

[1mRows: [22m[34m12751[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (1): title

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m9700[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (28): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19)

In [4]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################

exclusion_keywords <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
         "EPUB Download",
        "PDF Download Free",
        "Volleyball Stay Low",
        "Coloring Book",
        "EBOOK Download Free",
        "Mobi Free",
        "Valentine's Day Gift",
        "ArgoPrep",
        "Sunday School Killer",
        "free 'Cold War' course",
        "Public School Law",
        "offering free clinic services",
        "Center for Coping's",
        "free course from Yale",
        "#WhatsHappeningInMyanmar",
        "YouthBridge-NY",
        "Leprechauns Don't Play Basketball",
        "Vitali GossJankowski",
        "school social media regulation",
        "Cambridge School Shakespeare",
        "school sports schedule apps",
        "EBOOK Free",
        "#ChineseCommunistParty",
        "Onlyfans",
        "@FullCourtPUMPS",
        "Sewing Machine",
        "Esports Labs",
        "Crush Law School",
        "law school need",
        "Diamond and Pearl",
        "virtual game jam",
        "Green Architecture Student Day",
        "Eddy Learning",
        "GFX products",
        "EPUB Free",
        "I had a dream",
        "Free City sweats",
        "holiday shopping",
        "Future Artists program",
        "BPA Free Water Bottle",
        "NYC Guitar School",
        "exchange for a math job",
        "The Adventures of Charlie",
        "5 FREE online courses",
        "Free online course",
        "free internet",
        "Kindle Download Free",
        "Publishing",
        "£",
        "Organ",
        "torment my",
        "LSAT",
        "police",
        "#DefundNYPD",
        "Our free school",
        "$MCD",
        "AU",
        "#LongIsland",
        "don’t feel like existing",
        "tuition free",
        "New School Free Press",
        "Sarah Jessica Parker",
        "PMQs",
        "Xbox",
        "CT",
        "breakfast club",
        "New Jersey",
        "beyblade",
        "England",
        "cali",
        "vaccinate your child",
        "Martin Avenue School",
        "biological physics",
        "Spotify",
        "Taiwan",
        "#LiberationSchool",
        "cruise ships",
        "airlines",
        "watchmaking",
        "Coaching",
        "free speech",
        "Goff",
        "#Buffalo",
        "requiring vaccinations",
        "tax-free",
        "British",
        "Down Dog",
        "bullying",
        "extra features",
        "Read Free",
        "tuition-free",
        "free educational resources",
        "free learning resources",
        "free society",
        "free country",
        "library card", 
        "Gun free",
        "risk-free",
        "#ConnectivitiesConference",
        "admissions",
        "ancient texts",
        "free scholarship",
        "Austria",
        "that's a joke",
        "Free Willy",
        "Register for free",
        "Feel free",
        "Simpson County",
        "old-school",
        "Rosetta Stone",
        "French",
        "#UFT",
        "Law school",
        "this is free",
        "School-to-Prison",
        "Free Full Access",
        "astrological sign",
        "drug free",
        "It’s FREE on",
        "Missouri",
        "Boris Johnson",
        "streamed for free",
        "Boston",
        "San Diego",
        "ruined my nap",
        "FREE HSE prep",
        "screenshots",
        "free online courses",
        "free styling",
        "autobiographical",
        "#dealoftheday",
        "being free from school",
        "Food and Finance High School",
        "Culver",
        "rent-free",
        "free years of tuition",
        "Smithtown",
        "Monmouth",
        "because it's free",
        "military",
        "Links to FREE",
        "free ride",
        "food for thought",
        "mall food court",
        "Free emails",
        "Demon Slayer",
        "Baldwinsville",
        "must-read",
        "#FREE",
        "MA",
        "back ot back meetings",
        "Columbus",
        "Waterville",
        "GEORGIA",
        "Testing needs to be free",
        "free licenses",
        "free to think",
        "AP",
        "medical school",
        "#PuertoRico",
        "FREE Watch",
        "med school",
        "dental school",
        "the child free",
        "childfree",
        "free rape points",
        "Listen for FREE",
        "free trial",
        "Berlin",
        "homeschool",
        "Free Student Planner",
        "Rome",
        "Tom O'Connell",
        "Alton",
        "free bike racks",
        "#ShakespearesBear",
        "student loan debt",
        "biglaw offer",
        "FREE virtual visits",
        "random flashes",
        "register to fuck me",
        "free educational downloads",
        "50k go to",
        "Black Pre-Law Conference",
        "High School DxD",
        "free access to its online publication",
        "free digital access",
        "#TeacherGifts",
        "All gas, no brake",
        "Arts Justice Division",
        "squads to murder",
        "excelsior scholarship",
        "senile/dementia",
        "Gundam",
        "underlying emotion",
        "how to shoot",
        "free coding lessons",
        "monthly baby chalkboards",
        "2021 NYC Mayor",
        "college textbooks",
        "NBA",
        "Agricultural and Food Business",
        "#EmpowermentSummit",
        "Tucker Carlson",
        "free courses",
        "Bangladesh's",
        "free preparatory classes",
        "worse school than steub",
        "#MLB",
        "Trenton",
        "garner a social media following",
        "Puerto Rican",
        "medical education",
        "UK",
        "African Free School",
        "old school",
        "Ossining",
        "Colorado",
        "Startup School",
        "anyone I ate lunch with",
        "Shanghai",
        "free daily courses",
        "upscale healthcare",
        "you are in a civil war",
        "Virtual Skills Academy",
        "to lunch or wherever",
        "free 12-week program",
        "coding for free",
        "middle school president",
        "Accra",
        "Charles Finney School",
        "eating my lunch in the bathroom",
        "Gingko Bioworks",
        "tuition",
        "hate free",
        "free time",
        "free course",
        "Sunday school",
        "Education is free",
        "Baby Shark",
        "live for free",
        "free software",
        "feel free to",
        "Free story idea",
        "Anfield",
        "free flowing",
        "free copy",
        "lunchroom gossip",
        "schools are free",
        "EU",
        "#NewZealand",
        "free month",
        "SG",
        "FREE resources",
        "Handmade Face Mask",
        "Kindle Free",
        "serim",
        "hot cheetos",
        "#OHIO",
        "Yale",
        "#HoustonStrong",
        "#PalmBeachCounty",
        "esthetics school",
        "Pokemon",
        "free comics",
        "#NowAvailable",
        "Milwaukee",
        "free screening",
        "Breakfast Club",
        "XXL cypher",
        "GRE",
        "Afghanistan",
        "Oberoi",
        "Chapter Leader",
        "abject arrogance",
        "Old School",
        "Orange County",
        "Apple Podcasts",
        "School of Art",
        "parliament",
        "PM",
        "school project",
        "massive fluff",
        "Grammar School",
        "Mazes Activity",
        "Spy School",
        "MATILDA X INCEPTION",
        "CryptoArt",
        "Kindle Free",
        "EPUB Download",
        "PDF Download Free",
        "Volleyball Stay Low",
        "Coloring Book",
        "EBOOK Download Free",
        "Mobi Free",
        "Valentine's Day Gift",
        "ArgoPrep",
        "Sunday School Killer",
        "free 'Cold War' course",
        "Public School Law",
        "offering free clinic services",
        "Center for Coping's",
        "free course from Yale",
        "#WhatsHappeningInMyanmar",
        "YouthBridge-NY",
        "Leprechauns Don't Play Basketball",
        "Vitali GossJankowski",
        "school social media regulation",
        "Cambridge School Shakespeare",
        "school sports schedule apps",
        "EBOOK Free",
        "#ChineseCommunistParty",
        "Onlyfans",
        "@FullCourtPUMPS",
        "Sewing Machine",
        "Esports Labs",
        "Crush Law School",
        "law school need",
        "Diamond and Pearl",
        "virtual game jam",
        "Green Architecture Student Day",
        "Eddy Learning",
        "GFX products",
        "EPUB Free",
        "I had a dream",
        "Free City sweats",
        "holiday shopping",
        "Future Artists program",
        "BPA Free Water Bottle",
        "NYC Guitar School",
        "exchange for a math job",
        "The Adventures of Charlie",
        "FREE online courses",
        "Free online course",
        "lunchbox jokes",
        "print this for free",
        "Teacher Camp",
        "Massachusetts",
        "UK",
        "#NewRochelle",
        "tuition",
        "free speech",
        "Thailand",
        "Duncan Smith",
        "Bethlehem",
        "British government",
        "Colo",
        "WALES",
        "Quantico",
        "GA",
        "Chuck E Cheese",
        "#MA04",
        "lunch time scrimmage",
        "#Kenya",
        "Southern Africa",
        "Livingston County",
        "Puerto Rico",
        "bottom of the food chain",
        "nastalgia",
        "School of Greatness", 
        "free night school",
        "Oxford",
        "Chattanooga",
        "culinary school",
        "Marty",
        "Marcus Rashford",
        "Kindle",
        "Rhinebeck",
        "Old School",
        "old school",
        "Bridgewater-Raritan High School",
        "African governments",
        "Seneca Falls",
        "film school",
        "cat food",
        "#clearthelist",
        "Texas",
        "Texan",
        "Isaiah Washington",
        "Bergen County",
        "law school",
        "LA",
        "Velma",
        "class president",
        "Biz School",
        "Mediterranean",
        "maintain a routine",
        "undrafted free agents",
        "threw my retainer away",
        "buzzes around",
        "Breakfast Club",
        "Cairo",
        "Durham",
        "virtual tours",
        "Malawi",
        "Feel free",
        "Pokémon",
        "sold drugs",
        "FREE college",
        "Entebbe",
        "#twitch",
        "New Jersey",
        "ADOLPH",
        "grad school",
        "Rutland",
        "VA",
        "Adobe Creative Cloud",
        "free money",
        "#etsy",
        "makeup school",
        "breakfast club",
        "Katipunan",
        "One School",
        "yoongi",
        "flo milli",
        "Bay Area",
        "Ireland",
        "Uganda",
        "Rucola Cress",
        "weeb",
        "pupperino",
        "Pokemon",
        "Buffalo",
        "Wrenshall",
        "free periods",
        "Netflix",
        "#mealprep",
        "pay for school",
        "Register today",
        "#Beijing",
        "Regis Falls",
        "buy one get one",
        "available online",
        "Business School",
        "Zimbabwe",
        "Pressure Cooker",
        "Atlanta",
        "Petrol",
        "antifa",
        "Moundsville",
        "Nutley",
        "Buhari",
        "Newmark",
        "#Venezuela",
        "Cinnamon Toast Crunch",
        "business school",
        "courses for free",
        "free guide",
        "Chaeyoung",
        "prestigious private school",
        "Fort Defiance",
        "FREE SCHOOL",
        "Tokyo",
        "#MyTwitterAnniversary",
        "America Scores",
        "TX",
        "school-free holidays",
        "TMZ",
        "ONE School",
        "shanghai",
        "Magnolia",
        "West Virginia",
        "Cognitive Cardio",
        "Lunch Lady series",
        "Philip Giordano",
        "free mural",
        "high food",
        "art school",
        "Marlboro",
        "rom-com",
        "Free Online",
        "Philippines",
        "Breakfast In America",
        "Ramsey",
        "Southampton",
        "Hong Kong",
        "free access",
        "Hicksville",
        "Israeli accent",
        "#BreakfastClub",
        "LRCHS",
        "Gaithersburg",
        "back-to-school deals",
        "Guyana",
        "theater arts degree",
        "free theater camp",
        "Old school",
        "Livingston",
        "American Icon",
        "#momlife",
        "lunch table",
        "baseball cards",
        "knuckle sandwich",
        "Learn How to Code",
        "Galway",
        "Pom Pom Pet",
        "McRib",
        "Essex",
        "U.K.",
        "Lawrence",
        "SF",
        "paterson",
        "meal prepping",
        "Chris Wallace",
        "jline",
        "#Cambodia",
        "FREE Pre-K",
        "nominative determinism",
        "perform in the lunchroom",
        "Divinity School",
        "#tidepods",
        "Pleasant Valley",
        "Syracuse",
        "#CA15",
        "Harvard",
        "smash Bros",
        "brownstone pancake factory",
        "free from school",
        "Clay High School",
        "#FridayMotivation",
        "#OpenTheDoorToHappiness",
        "novio",
        "School of Law",
        "Amtrak",
        "Britain",
        "Eastbay",
        "free filmmaking",
        "Naked Lunch",
        "haikyuu",
        "Somalia",
        "Jorge Guevara Mellado School",
        "go to dunkin",
        "Manchester",
        "writing program",
        "uno out",
        "Kingston",
        "Trump University",
        "#Yemen",
        "Saratoga",
        "online workshops",
        "virtual conference",
        "Miami",
        "DPR",
        "Clatsop",
        "Watergate",
        "FREE course",
        "Renaissance Women",
        "FREE",
        "#Yemen",
        "#ecuador",
        "Wall Street Massacre",
        "NYU",
        "CA",
        "Arizona",
        "Mukbang"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')
)

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords, content),
        !grepl('\\$SNAP', content)
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    50
    ############################
    ############################
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(50) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_school_food_1.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 89172 
records after preprocessing: 62952 
percent records remaining: 71%

content
<chr>
"honestly i don’t know. I think it’s cause i was eating mad when i was at school (halal almost every night, thai food twice a week, etc) but now i’m home and I don’t have access to all that"
RT : What she did to school lunches was the worst crime of all
Yes she is. I wouldn't trust her to carry my kid's lunch let alone tell them when and how they should go back to school. No safety protocols whatsoever hardly. Disgusting
"but you're insinuating that the obama school lunch plan was a waste of time, i dont think thats true. i think ppl's rejection of that is the same reason we need to focus more on health-focused programs. leadership enables ppl's laziness & dependency on govt bc its profitable"
"NYC DOE Division of Early Childhood: Due to expected severe weather conditions, all New York City public school buildings will remain closed tomorrow, Tuesday, February 2, 2021. All instruction will take place remotely. Food distribution sites are closed tomorrow as well."
"Schools provide students w/ everything they need to learn for free, everything that is except for school meals. Learning on an empty belly is hard, so are working to make it easier for schools to feed kids."
"RT : School bus drivers, custodians and food service workers are just some of the essential staff that keep our students safe and healthy. We need Congress to provide funding to keep them on the job if we are going to safely reopen schools. #FundtheFrontLines"
"NYC public school buildings closed Tuesday, 2/2. All students will learn remotely. All Meal Hubs will be closed."
"Who is to say they had not already eaten their ""substantive food"" and were nursing their drinks? This is the same guy who banned semi-automatic weapons... FOR POLICE!!!. You know is a moron, right? Ask people who went to school with him. -JGC"
"Do you know what a dollar is? Have you ever worked and paid all your bills from rent, school, books, food, health care, car insurance, phone bills, etc.? Or did your family give you a hand out for everything and anything? xoxo, DR #drresa"
