In [1]:
# Loading in packages and data
library(tidyverse)
library(stringr)

# Healthcare corpus ----

# Read the healthcare article export into a base data frame named `proquest`.
proquest <- read.csv(file = "intermediate/healthcare.csv")


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [None]:
# Cleaning ----
#
# Builds `healthcare` from `proquest`:
#   1. `year` - numeric year taken from the first four characters of `Date`.
#   2. `Text` - stopwords removed.
#   3. `Text` - selected variants collapsed onto one token per concept
#               ("healthcare", "racialization", "blck", "hisplat").
# Matching is case-insensitive throughout because `Text` has not been
# lowercased at this point.

# Stopword vocabulary, one entry per word.
stopword_terms <- c(
  "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
  "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
  "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
  "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
  "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "having", "do", "does", "did", "doing", "would",
  "should", "could", "ought", "i'm", "you're", "he's", "she's", "it's",
  "we're", "they're", "i've", "you've", "we've", "they've", "i'd", "you'd",
  "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll", "she'll",
  "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't",
  "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't",
  "shan't", "shouldn't", "can't", "cannot", "couldn't", "mustn't", "let's",
  "that's", "who's", "what's", "here's", "there's", "when's", "where's",
  "why's", "how's", "a", "an", "the", "and", "but", "if", "or", "because",
  "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
  "between", "into", "through", "during", "before", "after", "above", "below",
  "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
  "again", "further", "then", "once", "here", "there", "when", "where", "why",
  "how", "all", "any", "both", "each", "few", "more", "most", "other", "some",
  "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
  "very", "will"
)

# Single regex (length-1 character, use with `perl = TRUE`) that matches any
# stopword standing alone between whitespace or at either end of the text.
#   * `(?<!\\S)` checks the left edge without consuming it, so the second of
#     two adjacent stopwords ("of the") is still matched.
#   * The match swallows the spaces that follow the word, leaving exactly the
#     separator that preceded it.
# The earlier space-delimited pattern carried two leading spaces on every
# alternative but the first, so it effectively never matched.
stopwords <- paste0(
  "(?<!\\S)(?:", paste(stopword_terms, collapse = "|"), ")(?: +|(?!\\S))"
)

# Replace every whole-token occurrence of any of `variants` in `text` with
# `stem`, ignoring case. Surrounding whitespace is left untouched.
collapse_variants <- function(text, variants, stem) {
  pattern <- paste0(
    "(?<!\\S)(?:", paste(variants, collapse = "|"), ")(?!\\S)"
  )
  gsub(pattern, stem, text, ignore.case = TRUE, perl = TRUE)
}

# Selective stemming vocabularies.
healthcare_variants <- c("health-care", "health care", "health insurance")
race_variants <- c(
  "racism", "racial", "racist", "racists",
  "racialized", "racializes", "racialize", "racializing"
)
black_variants <- c(
  "african american", "african americans", "african-american", "blacks",
  "black men", "black man", "black woman", "black women", "black people"
)
hispanic_variants <- c(
  "hispanics", "latino", "latina", "latinos", "latinas",
  "latin american", "latin americans", "latin-american", "latin-americans"
)

healthcare <- proquest |>
  # Extracting year from date
  mutate(year = as.numeric(substr(Date, start = 1, stop = 4))) |>
  # Removing stopwords.
  # NOTE(review): case-insensitive matching also drops upper-case tokens that
  # spell a stopword (e.g. "WHO", "IT"); confirm this is acceptable.
  mutate(Text = gsub(stopwords, "", Text, ignore.case = TRUE, perl = TRUE)) |>
  mutate(
    # Selective stemming (healthcare)
    Text = collapse_variants(Text, healthcare_variants, "healthcare"),
    # Selective stemming (race)
    Text = collapse_variants(Text, race_variants, "racialization"),
    # Selective stemming (blacks)
    Text = collapse_variants(Text, black_variants, "blck"),
    # Selective stemming (Hispanics)
    Text = collapse_variants(Text, hispanic_variants, "hisplat")
  )

In [None]:
# Write one cleaned text file per year ----
#
# For every year in `years`, the `Text` of that year's rows of `healthcare` is
# joined into a single string, normalised (lowercase, dashes to spaces,
# punctuation stripped, outer whitespace trimmed) and written to
# intermediate/healthcare/healthcare_<year>.txt.

years <- 2004:2022

# Make sure the target folder exists before the first write.
dir.create("intermediate/healthcare", recursive = TRUE, showWarnings = FALSE)

# The loop variable must not be named `year`: inside filter() the `year`
# column masks it, so `year == year` compared the column with itself and every
# file received the full corpus.
for (target_year in years) {
  # filter() already drops rows where the condition is NA, so rows with a
  # missing year are excluded.
  tempdf <- healthcare |>
    filter(year == target_year)

  # Collapse article text to character
  articles <- paste(tempdf$Text, collapse = "\r\n")

  sent <- tolower(articles) # 1. Lowercase
  # 2a. Turn hyphens and en/em dashes into spaces so the punctuation step
  #     below does not fuse the words on either side of them.
  sent <- str_replace_all(sent, "\\p{Pd}", " ")
  sent <- str_replace_all(sent, "[:punct:]", "") # 2b. Remove punctuation
  sent <- trimws(sent, which = "both") # 2c. Remove leading/trailing whitespace

  # NOTE(review): write.table() defaults to quote = TRUE, so the text in the
  # file is wrapped in double quotes; confirm downstream readers expect that.
  write.table(
    sent,
    paste0("intermediate/healthcare/healthcare_", target_year, ".txt"),
    row.names = FALSE,
    col.names = FALSE
  )
}