In [2]:
#######################################
#######################################
#######################################
#######################################
#######################################
# to run a block of code, click on the cell and press control + enter
# a block of code is still running if there's an asterisk to the left
# run this block fo code JUST ONCE
#######################################
#######################################
#######################################
#######################################

# load and optionally install required packages
if (!require('pacman')) install.packages('pacman')
pacman::p_load(
    dplyr,
    readr,
    scales,
    stringr,
    qdapRegex
)

# initial read in of all data
csv_files <- list.files(path='./data/raw/talkwalker', full.names = T, pattern = 'Comm*')
data_groups <- lapply(csv_files, read_csv)
data <- do.call(rbind, data_groups)

# grab only columns we want and rekey some of them for convenience
data.preprocessed <- data %>%
    select(
        # post data
        content,
        domain_url,
        engagement,
        fluency_level,
        images.url,
        lang,
        matched_profile,
        noise_category,
        parent_url,
        porn_level,
        post_type,
        published,
        reach,
        sentiment,
        tags_internal,
        title,
        url,
        url_views = article_extended_attributes.url_views,
        videos.url,
        word_count,
        username = extra_author_attributes.short_name,
        
        # social media data
        facebook_followers = source_extended_attributes.facebook_followers,
        facebook_likes = article_extended_attributes.facebook_likes,
        facebook_shares = article_extended_attributes.facebook_shares,
        twitter_followers = source_extended_attributes.twitter_followers,
        twitter_likes = article_extended_attributes.twitter_likes,
        twitter_retweets = article_extended_attributes.twitter_retweets,
        twitter_shares = article_extended_attributes.twitter_shares,
        instagram_followers = source_extended_attributes.instagram_followers,
        instagram_likes = article_extended_attributes.instagram_likes,
        
        # demographic data
        author_name = extra_author_attributes.name,
        author_birthday = extra_author_attributes.birthdate.date,
        author_birthday_resolution = extra_author_attributes.birthdate.resolution,
        author_gender = extra_author_attributes.gender,
        author_short_name = extra_author_attributes.short_name,
        author_url = extra_author_attributes.url,
        author_description = extra_author_attributes.description,

        # geographic data
        
        author_continent = extra_author_attributes.world_data.continent,
        author_country = extra_author_attributes.world_data.country,
        author_country_code = extra_author_attributes.world_data.country_code,
        author_region = extra_author_attributes.world_data.region,
        author_city = extra_author_attributes.world_data.city,
        article_city = extra_article_attributes.world_data.city,
        article_latitude = extra_article_attributes.world_data.latitude,
        article_longitide = extra_article_attributes.world_data.longitude,
        source_continent = extra_source_attributes.world_data.continent,
        source_country = extra_source_attributes.world_data.country,
        source_country_code = extra_source_attributes.world_data.country_code,
        source_region = extra_source_attributes.world_data.region,
        source_city = extra_source_attributes.world_data.city
    )

# filter ot rows based on various criteria

noise_category_exclusions <- c(
    'real_estate',
    'job_offers',
    'promotions',
    'diet_pharma',
    'hate_speech',
    'seo_scam'
)

exclusion_keywords <- read_csv('./data/raw/exclusion_keywords.csv') %>%
    pull('keyword') %>%
    paste(collapse = '|')

exclusion_keywords1 <- paste0(
    exclusion_keywords,
    '|',
    paste(list(
        #################################################
        #################################################
        # insert keywords in the orange area below here
        # only edit below this line
        # make sure last item is NOT followed by comma
        # one keyword per line
        # put a comma after each keyword except the last one
        #################################################
        #################################################
        "this is an example exclusion keyword",
        "exclusion keyword are case sensitive and literal",
        "jcpenny",
        "supermarket is testing hands-free",
        "vaccination",
        "The Dropkick Murphys",
        "I’ve been quarantined with my parents (cause the fridge at home is stocked and free)",
        "death camps",
        "thousands are in cages",
        "AmazonBasics 564 L Frost Free Side-by-Side Refrigerator",
        "New Orleans",
        "Jewish Activists",
        "#Neveragain",
        "Free Food Fridge Albany",
        "I run a six figure vegan friendly beauty brand",
        "made chicken soup in our healthy cooking class today",
         "4641 Marconi Ave",
"Apartments in NYC be like",
"Maricopa County",
"Jersey City woman starts a community fridge",
"Just started a @Twitch stream for @WhatevsClevRecs Tonight",
"Community Fridges in Houston: - HTX Community Fridges",
"My full line of stickers.",
"Um Refrigerador Electrolux Multidoor DM84X Frost Free",
"i buy 5 of every brand because i love how many brands of the same cheese, toothpaste and mustard",
"Download Free Manual de refrigeracion domestica",
"#FreezerGoals Sweep",
"Pomona Plants",
"Sheesh. Oakland city officials said Pro Arts",
"#LetMyPeopleGo",
"#AbolishICE",
"cop spies",
"#vaccine",
"Called @HPSupport to refund my care pack",
"A study done on frost-free home fridge found that salad drawers alone",
"Mattress on the floor, empty fridge",
"office equipment will get sad",
"when your job has a cool open concept office",
"Houston’s first community fridge",
"Manual De Refrigeracion Y Aire Acondicionado",
"This kitchen that had 2 side-by-side refrigerators",
"2025 Venable St",
"#JerseyCity",
"Community fridge in New Orleans",
"some people say April Fool’s Day is annoying",
"old white privilege is blinding you",
"Free Giveaway",
"Modern Refrigeration and Air Conditioning",
"Magnets",
"magnets",
"I’m high and remembered I have dairy free ice cream in the fridge",
"my asian trifecta. Vietnamese coffee",
"Baby Safety Locks",
"Thoughts on “clean eating”",
"sangria",
"Microwave and Mini Fridge",
"Houston",
"it’s 90 degrees in LA right now",
"@JerseyCity",
"Massive Memorial Day Wknd Pool Party",
"San Antonio",
"Living with other people is so fun",
"freeze-dried chicken dog treats",
"moth",
"East Hollywood",
"Apartment Size Refrigerator",
"Fridge Detective",
"rent free, 0 utilities",
"Florida",
"eye gel",
"bonus to its hideousness",
"storing bodies",
"futbol",
"Richmond, VA",
"50 Ways to Feel Thin",
"ban a book",
"commit a crime",
"do u lick",
"How does this affect human health and equity",
"@NutritionDiva",
"@GetFitGuy's",
"#PoweredByChainlink",
"The Prince of Wales",
"deserve those eggs",
"Prince Charles",
"House of Raeford Chickens",
"#neveragain",
"replaced a stove and fridge",
"Oakland",
"Modern Refrigeration and Air Conditioning",
"want to buy a house",
"Smeg FAB32RCR5UK",
"toronto",
"Summit Appliance",
"@NeverAgainActn",
"Newark",
"precious refrigerator real estate",
"mostly dairy free lately",
"Apartments in NYC",
"#SleepyJoe",
"#JoesGotNOMojo",
"#NoMojoJoe",
"Welcome Center on Grand Island",
"Urban Garden Apartment",
"#FarmersProtest",
"#FreeJaggiNow",
"Scotland",
"Central California/ Farming communities",
"My People Are Raped",
"Garden City",
"London",
"Urban Garden Apartment",
"Going from rural Maryland to NYC",
"those in rural communities will love having to pay $5.00",
"I’m deeply honored to have @AmyKlobuchar's endorsement",
"Community Garden",
"Dunk",
"The eyes of Florida are on @GovRonDeSantis",
"when i start my farmers market for slackers then you will all see",
"DSS Invites Journalist",
"Florida",
"@NYCParks parks",
"President Nader",
"We wish the President and First Lady",
"Madison Square Garden",
"SPD",
"hunt for Meg’s shooter",
"@KamaiuJohnson",
"@PennLive",
"Rhubarbia",
"THE BEAR'S GARDEN",
"Ecuador",
"@DrBiden",
"Mennonite",
"#Amish",
"masters of the dark arts",
"Community Solar Farms",
"Peacekeepers",
"#Iowa",
"Valnan Communications",
"@JerseyMatters",
"rural town in between Niagara Falls and Buffalo",
"The State of #NYCHA",
"Delaware",
"Halloween",
"small farmers’ reliance",
"Wilber Portillo",
"#NotFreeToPee",
"Bhartiya Kisan",
"@PoliceNG",
"EAST HAMPTON TOWN SHELLFISH HATCHERY",
"Fulani Herdsmen",
"Wisconsin farmers",
"communist problem",
"anti pastoralist",
"North Korea",
"Kulak",
"#FarmersProtests",
"Breonna's Family",
"Incessant Oil Spillage",
"Fulani",
"India",
"S. Africa",
"shipwrecked",
"Africa",
"#grassfed beef",
"Paris",
"England",
"@homedepot",
"show some love for Columbus Park!",
"leader of the Koinonia Farm",
"Indian",
"Hindu",
"Sikh",
"#BritishColumbia",
"I’m running in VA",
"@USAID",
"@bankimooncentre",
"DeFi yield farmers",
"#Malawi",
"AFK Forge farming",
"Edo Community",
"-Give resources to OSHA & farms",
"creamy tomato risotto",
"Punjab",
"gentrifying the shit out of the south bronx",
"PUA",
"Maplewood",
"Wisconsin dairy farm",
"Asian's Largest Urban Rooftop Farm",
"community solar farm",
"Saint Of The Day",
"#Mexican",
"Texas A&M",
"farmer suicide",
"Parakeet",
"South Central LA",
"1933",
"@Nike",
"Flex on your pantry",
"St. Louis",
"Herdsmen",
"Ontario",
"Singapore",
"Palestinians",
"Ethereum",
"White China Silk",
"Monsantoo's dicamba",
"They've been had",
"Oriana Franklin",
"RAVEN HOLLOW",
"Southern New Jersey",
"Adamawa",
"Urban Conversion Farmhouse",
"Community solar",
"CompostNow?",
"Commercial Urban Farming course",
"REGISTER NOW",
"Heart of Glass",
"Arizona",
"Bagco sacks",
"Trump just",
"Oscar Ramos",
"microbiome networks",
"@FortuneMagazine",
"election day votes",
"Devendra Kul",
"Palos Verdes",
"Philly",
"urban zoological garden",
"Reddit's largest subreddits",
"#banfactoryfarming",
"upstate farming community",
"Tim Tebow",
"Ikpide-Irri",
"lakh saplings",
"partnership between farmers, conservationists, and the business community",
"#NH Gardeners",
"Welcome to America",
"#DairyFAN",
"#FarmerProtest",
"downtown L.A.",
"Jersey",
        "Race for Nature",
"Jake Gibbs was a real warrior for 3rd District",
"Middletown, NY",
"Appellate Court",
"Salt Lake City",
"Dicamba",
"3 Hudson communities",
"Native American communities",
"more PPE to constituents",
"I shall sign with great pleasure!",
"RENUBLE15",
"Ohio",
"Colombian army",
"Donziger",
"livestream concert",
"langar food",
"Woodbridge, VA",
"LANGAR",
"Immokalee",
"Harry Potter -The Great Gatsby",
"Bay Area",
"gay married",
"Minnesota",
"Paris's",
"SOIL TESTING FOR URBAN FARMS",
"Alexis Wilson",
"Obedience is the KEY",
"AAG Denver",
"gay commies this farmer",
"BK Botanical Gardens",
"90DayFiance",
"Rice Farmer",
"Brazil",
"immigration policy",
"@AfricanFarming",
"Alberta",
"Not a failure but a different animal",
"Architects & developers",
"Polk County",
"scuplture",
"Ambani and Adani",
"Bitter harvest",
"SWEET BABY JAMES",
"UN fund",
"Australia",
"Biafra",
"public prisons",
"Umbrealla Hotel",
"@BJP4India",
"@IFAD",
"Tuition-Free Farming Program",
"remants of rural America",
"A Betrothal",
"Urban Garden Center LLC",
"Atlanta-based org",
"ranch culture",
"Grits, with butter",
"Capital Hill Autonomous Zone",
"Jazz Coalition",
"Filipino healthcare",
"White Supermacists",
"Urbanization, offices, globalism",
"wheat procurement centres",
"World Relief Seattle",
"Civil War",
"N604PD",
"Save the Horse Farm",
"ASL Interpreters",
"@PoliceNG",
"Boko Haram",
"Million Dollar Idea of the Day",
"Charlotte",
"oppression in my DNA",
"Farmingdale State College",
"Montana",
"Jungkook's"
            ), collapse = '|')
)


Loading required package: pacman

[1mRows: [22m[34m4680[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, post_type, ...
[32mdbl[39m (19): porn_level, fluency_level, sentiment, article_extended_attributes....
[33mlgl[39m  (1): title

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m9199[39m [1mColumns: [22m[34m49[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (29): url, published, content, lang, domain_url, parent_url, 

In [3]:
#######################################
#######################################
#######################################
#######################################
#######################################
# run this block EVERY TIME you add new exclusion keywords
#######################################
#######################################
#######################################
#######################################


exclusion_keywords2 <- paste(list(

"undegrad fantasies",
"Hawaiian farming ",
"Chicago's South Side",
"PWNA ",
"rural communities, schools, hospitals",
"Bro Kimbal's venture",
"Farmiton, Victor or Canandaigua area",
"reactionary prattle",       
"Kai, carving, farming",
"#greencommunications",
"Denver Urban Gardens",
"Seox (dark event SSR)",
"$NXM",
"SUPPORT YOUR NEIGHBORHOOD RESTAURANTS",
"Sims 4 community",
"Ikpide-Irri Community",
"Richie & Hilltop Bagels",
"US Urban Farms Director, Michael Hollis",
"Urban Garden Center",
"Rural America plans",
"Tioga",
"#Tanzania",
"A Brief History of Pesticides",
"USSR",
"Over a million #Farmworkers aren’t hunkered down",
"7 Gift Ideas For Urban Farming Enthusiasts",
"URBAN GUIDE TO FARMING IN NEW YORK",
"Mel Barthalomew's",
"Tourmaline's ‘Pleasure Garden’",
"Seems global. A sadness in Farm communities.",
"took me a while to understand the controllers are never going to stop",
"Wage Board",
"UPSTATE NY!!",
"#JuneDiaryMonth",
"STILL clomping all over their garden",
"Green Gulch Farm",
"#GardenCommVirtual",
"Called a communist",
"Sao Paulo",
"LEIMAY NEWS",
"Getting ready to plant your garden?",
"#Mosul",
"communities of Crown Heights, Prospect Lefferts Gardens, Wingate, and East Flatbush",
"#Cauvery",
"Werewolf games",
"#reforest the Amazon",
"Redding, CT",
"Italy",
"Dearfield",
"Second Gentleman",
"passive house construction",
"Catholic Church",
"Timor Leste",
"Spectrum internet",
"Lupin Publishers",
"grocery market to support in Seattle",
"Bill Carey's",
"Harry Potter Club",
"NEW BOOK",
"Spain",
"giving Chait grief",
"@FAIRTRADE cocoa",
"bunkfed farming",
"@MonicaRamirezOH",
"Ugbene Ajima",
"Illinois",
"Lord of the Flies",
"Biden will tour an urban farm",
"AquAlliance",
"Farming in the 1600s",
"dark money",
"Senegal",
"Pacific Northwest",
"Tom Nook",
"#Bankless",
"Horniman Museum",
"watershed",
"SNEAKERS",
"#spatialanalysis",
"Pat Foran",
"@perlasofia",
"@PinkyTurtles",
"garden-style community",
"#STPINEURON", "feather patterns",
"Tidewater Gardens",
"Gateshead Council",
"Marist College",
"Bike with RIPA",
"NewAir",
"#Rwanda",
"Striated tornadic",
"polluted drinking water",
"Devorah Brous",
"ENT business",
"obamawhitehouse",
"Mexico City's",
"let's buy it today",
"WFF cat sanctuary",
"#africa",
"Kindle Field Guide",
"FUPA",
"vegan girl's insta",
"urban patio gardener",
"DSA members",
"Tottenham",
"Shelf Magnet","Aussie",
"Duplin County",
"Ifarm",
"Hispanic Federation",
"#ClimateChange",
"George Floyd",
"Katie Porter's",
"NAARO",
"Search engine optimization",
"Gary Matson",
"#sadtimes",
"CA Southland",
"climate issue",
"Groton Community",
"It is systematic", 
"parliament without proper debate",
"#PuertoRico",
"sales directly to consumers",
"The Black Feminist Project",
"NRCS",
"Egypt",
"iFarm",
"climate adaptation",
"animal sanctuary eviction",
"NYC Open Restaurants",
"old retired farmers",
"underserved dc communities",
"taro farming",
"I have a sizable urban garden",
"aristocrat",
"Barr",
"Garden Sage",
"Urban Farm Corps",
"trauma therapy yoga",
"survival", 
"boardwalk",
"berlin",
"buy shares",
"farmers",
"Cattle",
"cattle",
"jerk off",
"#museumsfromhome",
"#Farmers",
"Cal Anderson",
"human trafficking", 
"Asia",
"#landscapearchitecture",
"#neveralonesummit",
"#SeaTurtle",
"refrigeration",
"farming",
    "Join me",
"Govinda",
"Hip Hop",
"BBQ",
    "#thecocofund",
"destruction is incredible",
"CONCEPT",
"CFU Guaranteed",
"#bike",
"farmworker",
"Rosedale",
"#biblestudy",
"Emhoff",
"#Sweepstakes",
"POS",
"hedgerow",
"garden-style",
"oysters",
"communal",
"Sworn in",
"Maya Marie",
"Tennessee",
"#housing",
"webinar",
"Texas",
"8:29",
"LUNGS",
"$24,000",
"improv",
"Dorjee",
"MAS",
"Yurok",
"Zerega",
"#innovation",
"Charleston",
"neurodiverse",
"DM me",
"bathing suit",
"turkey",
"Iloilo City",
"rose bush",
"Tampa!",
"#Thatching",
"living room",
"sharing cookies",
"Rhonda Bell",
"#Somalia",
"#DYK",
"rural America",
"urban parks",
"#solar",
"beekeeping",
"Beekeeping",
"Apocalypse",
"Canada",
"HBCU",
"garden at twilight",
"#ConfirmClimate",
"Snow day",
"bike tour",
"social circle",
"toledo",
"#HousingCriticalResponse",
"Communism",
"agrarian community",
"Austin",
"Great leadership",
"intense",
"Dryponics",
"herbal health store",
"3D prints",
"Color from a big",
"film festival",
"interlocking fingers",
"Frontyard Politics",
"Olivia Watkins",
"grubbers",
"Hasidic",
"Buffalo",
"pothos",
"Shop online,",
"#blockchain",
"bar",
"jail",
"musician",
"Refrigeration Equipment",
"Camps",
"5 mile walk",
"Hopin",
"warfare",
"#paintings",
"WORST",
"hummingbirds",
"fucking up racism",
"TFFJ",
"10460",
"free reign",
"Garden Studios",
"Linda Yang",
"Inverter",
"edibles",
"Climate adaptation",
"Coleman's",
"EBOOK",
"Tivoli",
"renewable energy",
"Seoul",
"Farming",
"Cryptocurrency",
"Woodpecker",
"2x2",
"MPH",
"Curtis",
"#TeamHorticulture",
"#BarnCam",
"Green Space program",
"Cape Town",
"Artsakh",
"Bakers",
"bankruptcies",
"Pontefract",
"landscaper",
"jersey",
"board members",
"coke fridge",
"butterfly gardens",
"Nazis",
"book-making",
"garden city",
"online friends",
"running for Congress",
"community lectures",
"#WeFeedYou",
"Las Vegans",
"Nevada",
"per hour",
"#Adirondacks",
"State Farm Arena",
"power plant",
"AgSphere",
"Lowe's",
"urban farm",
"Urban Farm",
"urban farms",
"urban farmers",
"urban farmer",
"urban farming",
"tea garden community",
"#NeverAloneLGBTQIA",
"White House",
"urban gardeners",
"DigitalOcean",
"1BR/1BA",
"Urban Delights Farm",
"transaction",
"community leaders",
"Oswego Speedway",
"South LA",
"Yolanda’s",
"Read for free",
"Europe",
"#Recovery4All"
        #################################################
        #################################################
        # no more edits below this line
        #################################################
        #################################################
    ), collapse = '|')

data.filtered <- data.preprocessed %>%
    mutate(
        content = str_remove_all(content, pattern = '@\\w+') %>% rm_url()
    ) %>%
    filter(
        !is.na(content),
        length(content) > 15,
        porn_level == 0,
        lang == 'en',
        !noise_category %in% noise_category_exclusions,
        !grepl(exclusion_keywords1, content),
        !grepl('\\$SNAP', content)
    )

data.filtered <- data.filtered %>%
    filter(
        !grepl(exclusion_keywords2, content),
    )

total_records_raw <- data %>% nrow()
total_records_filtered <- data.filtered %>% nrow()

cat(paste('total raw records:', total_records_raw, '\n'))
cat(paste('records after preprocessing:', total_records_filtered, '\n'))
cat(paste('percent records remaining:', percent(total_records_filtered / total_records_raw)))

set.seed(
    ############################
    ############################
    # change the following seed number to get a different random sample
    # you can use any number
    # run this until you get an average of 90% precision over 5 runs
    ############################
    ############################
    1220
    ############################
    ############################    
    # no more edits below this line
    ############################
    ############################
)

data.filtered %>% distinct(content) %>% sample_n(50) %>% select(content)
data.filtered %>% write_csv('./data/generated/filtered_comm.csv');
#######################################
#######################################
#######################################
#######################################
#######################################
# read the random sample below
# calculate your own precision accuracy
# if you want to add more exclusion keywords, go back to the top of this cell, edit it, and run it again
#     you will AUTOMATICALLY get a different random sample of 10
# if you want a different sample to check for precision AND do not want to add to the exclusion keywords,
# change the seed number above and run this cell again
#######################################
#######################################
#######################################
#######################################

total raw records: 28193 
records after preprocessing: 7701 
percent records remaining: 27%

content
<chr>
Edgemere Farm my immediate community garden
"After long break with Twitter , I like to share more updates from me. Last week I’m back with my farmer community, I’ll knock many doors to talk about women in decision working . Stay tune #SAPP"
How Do Farm and Rural Communities Vote?
"The flower and vegetable delivery from Homegrown Kitchen Gardens / homegrownnurseries came in today! These will be delivered soon to GreenThumb community gardens. Thank you to homegrownnurseries, run by …"
Awesome to see this Plant-Based Community Fridge filled with humane and healthy #vegan food options for New Yorkers struggling to make ends meet in the pandemic. #foodjustice #healthequity #mutualaid
"RT : Friendly fridge coming soon to 231-34 Merrick Blvd Springfield Gardens ,ny 11413 Free food will be available in the fridge for the community, by the community"
"RT : RiskExec has successfully integrated the 2019 #CRA peer data set into its Peer Analysis module. This dataset includes small business, small farm and community development lending data reported by certain commercial banks and savings associations. Read more"
RT : I don't recommend apples for urban gardens...small fruits such as currants and serviceberries are a better use of space. And there's enough to share if nearby wildlife decides to have a nosh. The million dollar question: How long until the squirrels find these! #GYO
"""A community refrigerator created for needy families was discovered vandalized & left in disrepair outside the district office of #Queens state Sen. Jessica Ramos on #NewYearsDay, coming at a moment when the pandemic has left more residents food insecure:"
One more point that I really see a problem with in this response is the idea that I am a person with my own garden. You are wrong. I am a member of several community gardens where I can both feed myself and give back to my community. The future is COMMUNAL NOT ISOLATED
