# Reading the tweets extracted and filtering

This notebook contains the functions developed to:
- unify the files of the same category that were extracted using different keywords
- remove unnecessary characters and get rid of URLs, hashtags, etc.
- remove tweets where the username contains the keyword
- remove tweets with keywords from multiple categories

When running this notebook, we saved the output at two different steps: first, before removing the tweets that contain keywords from multiple categories (semiclean), and then the final tweets after all the filtering steps (cleanTweets).

In [1]:
#####################
# Load Libraries    #
#####################
#install.packages(c("tidyverse","dplyr"))
library("dplyr")
library("tidyverse")
library("ggplot2")
rm(list=ls())


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ readr   1.3.1
✔ tibble  2.1.1     ✔ purrr   0.3.2
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ ggplot2 3.1.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
#####################
# Unify Files       #
#####################
unifyFiles <- function( folder_path, folder_name ){
  
  # create a unique file with all the tweets present in that folder
  files <- list.files( paste0( folder_path, folder_name ))
  
  for( i in 1:length( files ) ){
    
    print( paste0( "Reading the file ", files[i] ) ) 
    if( i == 1 ){
      tweets <- read.csv(paste0(folder_path, folder_name, files[i]), sep = ",")
      print( paste0( "For the keyword ", files[i], " there are a total of ", nrow( tweets)))
    }else{
      new_tweet <- read.csv( paste0(folder_path, folder_name, files[i]), sep = ",")
      print( paste0( "For the keyword ", files[i], " there are a total of ", nrow( new_tweet)))
      tweets<- rbind( tweets , new_tweet )
    }
    
  }
  return( tweets)
}

In [3]:
##################################################################
# Clean the text in the tweets to prepare for sentiment analysis #
##################################################################
cleanTweets <- function( tweets.df ){
    tweets.df$text <- gsub("&amp;", "and", tweets.df$text) # Replace & symbol with "and" when coded as &amp;
    tweets.df$text <- gsub("&gt;", "", tweets.df$text ) # Get rid of  > symbol when coded as &gt;
    tweets.df$text <- gsub("&lt;", "", tweets.df$text ) # Get rid of  < symbol when coded as &lt;
    tweets.df$text <- gsub("&#13;", " ", tweets.df$text ) # Get rid of  html symbol for Carriage Return &#13;
    tweets.df$text <- str_replace_all(tweets.df$text ," "," ")          #Get rid of unnecessary spaces
    tweets.df$text <- gsub("“", "", tweets.df$text) # Get rid of quotes;
    tweets.df$text <- gsub("”", "", tweets.df$text) # Get rid of quotes;
    tweets.df$text <- gsub("\"", "", tweets.df$text) # Get rid of quotes;
    tweets.df$text <- gsub("—", "", tweets.df$text) # Get rid of quotes;
    tweets.df$text <- str_replace_all(tweets.df$text ,"#[a-z,A-Z]*","") # Get rid of hashtags
    tweets.df$text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "",tweets.df$text )# Take out retweet header / via @
    tweets.df$text <- str_replace_all(tweets.df$text,"@[a-z,A-Z]*","")  # Get rid of references to other screennames
    tweets.df$text <- gsub("http[[:alnum:][:punct:]]*", "", tweets.df$text ) # Get rid of URLs
    tweets.df$text <- gsub("www[[:alnum:][:punct:]]*", "", tweets.df$text ) # Get rid of URLs that do not start with http
    tweets.df$text <- gsub("pic.twitter[[:alnum:][:punct:]]*", "", tweets.df$text ) # Get rid of pic.twitter
    tweets.df$text <- trimws( tweets.df$text ) # Get rid of leading and/or trailing whitespace 
    return( tweets.df)
}

In [4]:
######################################################
# To remove tweets where the username is the keyword #
######################################################
usernameClean <- function( tweets.df, category ){
  
  tweets.df$text <- tolower( tweets.df$text )
  tweets.df$username <- tolower( tweets.df$username )
  
  wclist <- read.delim("/home/ec2-user/SageMaker/CategoryKeyword", sep = "\t", header = TRUE)
  wclist$Category <- tolower( wclist$Category )
  wclist$Keyword  <- tolower( wclist$Keyword )
  
  keywords <- trimws( wclist[ wclist$Category == tolower(category), "Keyword"] )
  
  
  
  if( length(grep(paste(keywords,collapse="|"), 
           tolower(tweets.df$username)) ) != 0){
    tweets.df <- tweets.df[ -c( (grep(paste(keywords,collapse="|"), 
                                      tweets.df$username))), ]
  }

  return( tweets.df)
  
}


In [5]:
############################################################
# To remove tweets with keywords from multiple categories  #
############################################################

tweetsUniqueCategory <- function( tweets.df, category){
    
    wclist <- read.delim("/home/ec2-user/SageMaker/CategoryKeyword", sep = "\t", header = TRUE)
    wclist$Category <- tolower( wclist$Category )
    wclist$Keyword  <- tolower( wclist$Keyword )
    
    if( category == "LNG IUD"| category == "Copper IUD"){
          excludedKeywords <- trimws( wclist[ ( wclist$Category != tolower(category) & wclist$Category != tolower("IUD") ), "Keyword"] )
          includedKeywords <- trimws( wclist[ wclist$Category == tolower(category), "Keyword"] )
    }else{
          excludedKeywords <- trimws( wclist[ wclist$Category != tolower(category), "Keyword"] )
          includedKeywords <- trimws( wclist[ wclist$Category == tolower(category), "Keyword"] )
    }
    
  
  excluded.tweets <- grep(paste(excludedKeywords,collapse="|"), 
                                    tweets.df$text)
  included.tweets <- grep(paste(includedKeywords,collapse="|"), 
                          tweets.df$text)
  
  final.tweets <- included.tweets[ ! included.tweets %in% excluded.tweets ]
  
  if( length( final.tweets) != 0 ){
    tweets.df <- tweets.df[ c( final.tweets ) , ]
  }

  return( tweets.df)
  
}


The previous set of functions is applied to each of the 8 birth control categories. Additionally, we add some steps to:
- remove duplicated tweets.
- remove male contraceptive tweets based on three expressions: "male contraception", "male contraceptive" and "male birth control"
- remove emergency contraceptive tweets based on three expressions: "emergency contraception", "emergency contraceptive" and "emergency birth control"

In [6]:
##############
# Copper IUD #
##############

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = "Copper_IUD/")

tweets <- tweets[ tweets$text != "", ]

folder_name = "Copper_IUD/"
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]

folder_name = "Copper_IUD/"
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

copperNoClean <- tweets

#2nd. remove male BC, then count tweets
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

maleBCrows <- c(maleContraception, maleContraceptive, maleBirthControl)
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

tweets <- tweets[-maleBCrows, ]

folder_name = "Copper_IUD/"
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- c(emergencyContraception, emergencyContraceptive, emergencyBirthControl)
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

tweets <- tweets[-emergencyContraceptionrows, ]

folder_name = "Copper_IUD/"
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))
tweets$text <- as.character( tweets$text )

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Copper IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/copperIUD_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

copperIUDSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Copper IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/copperIUD_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

copperIUD <- ourCleanTweets
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file Copper-T.csv"
[1] "For the keyword Copper-T.csv there are a total of 337"
[1] "Reading the file CopperIntrauterineDevice.csv"
[1] "For the keyword CopperIntrauterineDevice.csv there are a total of 249"
[1] "Reading the file CopperIntrauterineSystem.csv"
[1] "For the keyword CopperIntrauterineSystem.csv there are a total of 0"
[1] "Reading the file CopperIUD.csv"
[1] "For the keyword CopperIUD.csv there are a total of 12083"
[1] "Reading the file CopperIUS.csv"
[1] "For the keyword CopperIUS.csv there are a total of 2"
[1] "Reading the file CopperTBirthControl.csv"
[1] "For the keyword CopperTBirthControl.csv there are a total of 6"
[1] "Reading the file CopperTContraception.csv"
[1] "For the keyword CopperTContraception.csv there are a total of 0"
[1] "Reading the file CopperTContraceptive.csv"
[1] "For the keyword CopperTContraceptive.csv there are a total of 3"
[1] "Reading the file Cu-IUD.csv"
[1] "For the keyword Cu-IUD.csv there are a total of 39"
[1] "Readin

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      8      82     122     129     148     605 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0    68.0   107.0   117.2   139.0   288.0 

[1] "In total we have 24923 tweets that do not contain the keyword as username"
[1] "In total we have 24122 tweets without duplicates"
[1] "In total we have 17577 tweets that refer to a unique category"


In [7]:
###########
# LNG IUD #
###########

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = "LNG-IUD/")

tweets <- tweets[ tweets$text != "", ]

folder_name = "LNG-IUD/"
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]

folder_name = "LNG-IUD/"
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

lngNoClean <- tweets 

#2nd. remove male BC, then count tweets
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

maleBCrows <- c(maleContraception, maleContraceptive, maleBirthControl)
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

tweets <- tweets[-maleBCrows, ]

folder_name = "LNG-IUD/"
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- c(emergencyContraception, emergencyContraceptive, emergencyBirthControl)
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

tweets <- tweets[-emergencyContraceptionrows, ]

folder_name = "LNG-IUD/"
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))
tweets$text <- as.character( tweets$text )

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "LNG IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/LNG-IUD_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

LNG_IUDSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "LNG IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/LNG-IUD_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

LNG_IUD <- ourCleanTweets
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file HormonalIntrauterineDevice.csv"
[1] "For the keyword HormonalIntrauterineDevice.csv there are a total of 49"
[1] "Reading the file HormonalIntrauterineSystem.csv"
[1] "For the keyword HormonalIntrauterineSystem.csv there are a total of 8"
[1] "Reading the file HormonalIUD.csv"
[1] "For the keyword HormonalIUD.csv there are a total of 3298"
[1] "Reading the file HormonalIUS.csv"
[1] "For the keyword HormonalIUS.csv there are a total of 35"
[1] "Reading the file HormoneIntrauterineDevice.csv"
[1] "For the keyword HormoneIntrauterineDevice.csv there are a total of 2"
[1] "Reading the file HormoneIntrauterineSystem.csv"
[1] "For the keyword HormoneIntrauterineSystem.csv there are a total of 0"
[1] "Reading the file HormoneIUD.csv"
[1] "For the keyword HormoneIUD.csv there are a total of 219"
[1] "Reading the file HormoneIUS.csv"
[1] "For the keyword HormoneIUS.csv there are a total of 4"
[1] "Reading the file KyleenaIntrauterineDevice.csv"
[1] "For the keyword Kyleena

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   19.0    93.0   130.0   135.9   150.0   448.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    7.0    71.0   108.0   117.4   135.0   286.0 

[1] "In total we have 16751 tweets that do not contain the keyword as username"
[1] "In total we have 15520 tweets without duplicates"
[1] "In total we have 11500 tweets that refer to a unique category"


In [8]:
#######
# IUD #
#######

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = "IUD/")

tweets <- tweets[ tweets$text != "", ]

folder_name = "IUD/"
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]

folder_name = "IUD/"
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

IUD_NoClean <- tweets

#2nd. remove male BC, then count tweets
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

maleBCrows <- c(maleContraception, maleContraceptive, maleBirthControl)
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

tweets <- tweets[-maleBCrows, ]

folder_name = "IUD/"
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- c(emergencyContraception, emergencyContraceptive, emergencyBirthControl)
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

tweets <- tweets[-emergencyContraceptionrows, ]

folder_name = "IUD/"
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))
tweets$text <- as.character( tweets$text )

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/IUD_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

IUDSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "IUD")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/IUD_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

IUD <- ourCleanTweets
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file IntrauterineDevice2006.csv"
[1] "For the keyword IntrauterineDevice2006.csv there are a total of 0"
[1] "Reading the file IntrauterineDevice2007.csv"
[1] "For the keyword IntrauterineDevice2007.csv there are a total of 0"
[1] "Reading the file IntrauterineDevice2008.csv"
[1] "For the keyword IntrauterineDevice2008.csv there are a total of 2"
[1] "Reading the file IntrauterineDevice2009.csv"
[1] "For the keyword IntrauterineDevice2009.csv there are a total of 39"
[1] "Reading the file IntrauterineDevice2010.csv"
[1] "For the keyword IntrauterineDevice2010.csv there are a total of 296"
[1] "Reading the file IntrauterineDevice2011.csv"
[1] "For the keyword IntrauterineDevice2011.csv there are a total of 438"
[1] "Reading the file IntrauterineDevice2012.csv"
[1] "For the keyword IntrauterineDevice2012.csv there are a total of 758"
[1] "Reading the file IntrauterineDevice2013.csv"
[1] "For the keyword IntrauterineDevice2013.csv there are a total of 488"
[1] "Reading th

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    3.0    80.0   124.0   150.2   183.0  1101.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    65.0    90.0   106.3   126.0   293.0 

[1] "In total we have 427515 tweets that do not contain the keyword as username"
[1] "In total we have 397229 tweets without duplicates"
[1] "In total we have 280037 tweets that refer to a unique category"


In [9]:
###########
# Implant #
###########

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = "Implant/")

tweets <- tweets[ tweets$text != "", ]

folder_name = "Implant/"
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]

folder_name = "Implant/"
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

implantNoClean <- tweets 

#2nd. remove male BC, then count tweets
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

maleBCrows <- c(maleContraception, maleContraceptive, maleBirthControl)
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

tweets <- tweets[-maleBCrows, ]

folder_name = "Implant/"
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- c(emergencyContraception, emergencyContraceptive, emergencyBirthControl)
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

tweets <- tweets[-emergencyContraceptionrows, ]

folder_name = "Implant/"
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))
tweets$text <- as.character( tweets$text )

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Implant")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/Implant_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

ImplantSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Implant")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/Implant_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

Implant <- ourCleanTweets
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file ArmImplant.csv"
[1] "For the keyword ArmImplant.csv there are a total of 8591"
[1] "Reading the file B_C_Implant.csv"
[1] "For the keyword B_C_Implant.csv there are a total of 43"
[1] "Reading the file BCImplant.csv"
[1] "For the keyword BCImplant.csv there are a total of 1952"
[1] "Reading the file BirthControlImplant.csv"
[1] "For the keyword BirthControlImplant.csv there are a total of 13742"
[1] "Reading the file BirthControlRod.csv"
[1] "For the keyword BirthControlRod.csv there are a total of 590"
[1] "Reading the file ContraceptionImplant.csv"
[1] "For the keyword ContraceptionImplant.csv there are a total of 366"
[1] "Reading the file ContraceptionRod.csv"
[1] "For the keyword ContraceptionRod.csv there are a total of 19"
[1] "Reading the file ContraceptiveImplant.csv"
[1] "For the keyword ContraceptiveImplant.csv there are a total of 11731"
[1] "Reading the file ContraceptiveRod.csv"
[1] "For the keyword ContraceptiveRod.csv there are a total of 89"
[1] "

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    8.0    68.0   105.0   112.8   137.0  2955.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    59.0    91.0   101.5   124.0   287.0 

[1] "In total we have 100495 tweets that do not contain the keyword as username"
[1] "In total we have 94115 tweets without duplicates"
[1] "In total we have 76356 tweets that refer to a unique category"


In [10]:
###########
# TheShot #
###########

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = "TheShot/")

tweets <- tweets[ tweets$text != "", ]

folder_name = "TheShot/"
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]

folder_name = "TheShot/"
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

shotNoClean <- tweets

#2nd. remove male BC, then count tweets
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

maleBCrows <- c(maleContraception, maleContraceptive, maleBirthControl)
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

tweets <- tweets[-maleBCrows, ]

folder_name = "TheShot/"
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

tweets$text <- as.character( tweets$text )

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- c(emergencyContraception, emergencyContraceptive, emergencyBirthControl)
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

tweets <- tweets[-emergencyContraceptionrows, ]

folder_name = "TheShot/"
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))
tweets$text <- as.character( tweets$text )

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Shot")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/Shot_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

shotSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Shot")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/Shot_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

shot <- ourCleanTweets
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file BirthControlInjection.csv"
[1] "For the keyword BirthControlInjection.csv there are a total of 2032"
[1] "Reading the file BirthControlShot.csv"
[1] "For the keyword BirthControlShot.csv there are a total of 35010"
[1] "Reading the file ContraceptionInjection.csv"
[1] "For the keyword ContraceptionInjection.csv there are a total of 594"
[1] "Reading the file ContraceptionShot.csv"
[1] "For the keyword ContraceptionShot.csv there are a total of 74"
[1] "Reading the file ContraceptiveInjection.csv"
[1] "For the keyword ContraceptiveInjection.csv there are a total of 7113"
[1] "Reading the file ContraceptiveShot.csv"
[1] "For the keyword ContraceptiveShot.csv there are a total of 800"
[1] "Reading the file Depo-Provera.csv"
[1] "For the keyword Depo-Provera.csv there are a total of 21829"
[1] "Reading the file DepoBC.csv"
[1] "For the keyword DepoBC.csv there are a total of 525"
[1] "Reading the file DepoBirthControl.csv"
[1] "For the keyword DepoBirthControl.csv the

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    4.0    61.0    92.0   100.4   131.0   564.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   54.00   80.00   89.16  114.00  316.00 

[1] "In total we have 139039 tweets that do not contain the keyword as username"
[1] "In total we have 129025 tweets without duplicates"
[1] "In total we have 117907 tweets that refer to a unique category"


In [11]:
############
# The pill #
############
# Pipeline for the "ThePill/" category: merge the per-keyword files, drop
# empty and duplicated tweets, exclude tweets about male or emergency
# contraception, clean the text, filter by username and by unique category,
# and save the semi-clean and clean tables.
# Snapshots kept for later cells: pillNoClean, pillSemiClean, pill.

folder_name <- "ThePill/"

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = folder_name )

tweets <- tweets[ tweets$text != "", ]
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

# Row subsetting below keeps the column type, so one conversion is enough
tweets$text <- as.character( tweets$text )

pillNoClean <- tweets

#2nd. remove male BC, then count tweets
# The leading space in each pattern keeps "female ..." from matching.
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

# unique(): a tweet matching several patterns must be counted only once
maleBCrows <- unique( c(maleContraception, maleContraceptive, maleBirthControl) )
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

# Keep-mask instead of tweets[-maleBCrows, ]: with no matches, a negative
# index of length zero would select zero rows and silently drop every tweet.
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% maleBCrows ), ]
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- unique( c(emergencyContraception, emergencyContraceptive, emergencyBirthControl) )
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

# Same zero-match-safe removal as in the 2nd step
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% emergencyContraceptionrows ), ]
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Pill")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
# Cleaning can make previously different texts identical, so de-duplicate again
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/Pill_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

pillSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Pill")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/Pill_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

pill <- ourCleanTweets

# Drop the working copies so the next category starts from a clean state
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file B_C_Pill.csv"
[1] "For the keyword B_C_Pill.csv there are a total of 675"
[1] "Reading the file BCPill.csv"
[1] "For the keyword BCPill.csv there are a total of 17198"
[1] "Reading the file BirthControlPill.csv"
[1] "For the keyword BirthControlPill.csv there are a total of 32125"
[1] "Reading the file CHCPill.csv"
[1] "For the keyword CHCPill.csv there are a total of 2"
[1] "Reading the file COCPill.csv"
[1] "For the keyword COCPill.csv there are a total of 39"
[1] "Reading the file CombinedB_C_P.csv"
[1] "For the keyword CombinedB_C_P.csv there are a total of 1"
[1] "Reading the file CombinedBCP.csv"
[1] "For the keyword CombinedBCP.csv there are a total of 13"
[1] "Reading the file CombinedHormonalContraceptive.csv"
[1] "For the keyword CombinedHormonalContraceptive.csv there are a total of 98"
[1] "Reading the file CombinedOCP.csv"
[1] "For the keyword CombinedOCP.csv there are a total of 44"
[1] "Reading the file ContraceptivePill.csv"
[1] "For the keyword Co

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    7.0    90.0   124.0   130.5   140.0   697.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      4      68      98     108     124     288 

[1] "In total we have 113887 tweets that do not contain the keyword as username"
[1] "In total we have 97568 tweets without duplicates"
[1] "In total we have 90836 tweets that refer to a unique category"


In [12]:
############
# The Ring #
############
# Pipeline for the "TheRing/" category: merge the per-keyword files, drop
# empty and duplicated tweets, exclude tweets about male or emergency
# contraception, clean the text, filter by username and by unique category,
# and save the semi-clean and clean tables.
# Snapshots kept for later cells: ringNoClean, ringSemiClean, ring.

folder_name <- "TheRing/"

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = folder_name )

tweets <- tweets[ tweets$text != "", ]
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

# Row subsetting below keeps the column type, so one conversion is enough
tweets$text <- as.character( tweets$text )

ringNoClean <- tweets

#2nd. remove male BC, then count tweets
# The leading space in each pattern keeps "female ..." from matching.
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

# unique(): a tweet matching several patterns must be counted only once
maleBCrows <- unique( c(maleContraception, maleContraceptive, maleBirthControl) )
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

# Keep-mask instead of tweets[-maleBCrows, ]: with no matches, a negative
# index of length zero would select zero rows and silently drop every tweet.
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% maleBCrows ), ]
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- unique( c(emergencyContraception, emergencyContraceptive, emergencyBirthControl) )
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

# Same zero-match-safe removal as in the 2nd step
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% emergencyContraceptionrows ), ]
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Ring")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
# Cleaning can make previously different texts identical, so de-duplicate again
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/Ring_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

ringSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Ring")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/Ring_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

ring <- ourCleanTweets

# Drop the working copies so the next category starts from a clean state
rm(ourCleanTweets)
rm(tweets)

[1] "Reading the file Annovera.csv"
[1] "For the keyword Annovera.csv there are a total of 314"
[1] "Reading the file BirthControlRing.csv"
[1] "For the keyword BirthControlRing.csv there are a total of 1313"
[1] "Reading the file ContraceptionRing.csv"
[1] "For the keyword ContraceptionRing.csv there are a total of 194"
[1] "Reading the file ContraceptiveRing.csv"
[1] "For the keyword ContraceptiveRing.csv there are a total of 1078"
[1] "Reading the file Nuva_Ring.csv"
[1] "For the keyword Nuva_Ring.csv there are a total of 37068"
[1] "Reading the file Nuva-ring.csv"
[1] "For the keyword Nuva-ring.csv there are a total of 29400"
[1] "Reading the file Nuvaring.csv"
[1] "For the keyword Nuvaring.csv there are a total of 22319"
[1] "Reading the file VaginalRing.csv"
[1] "For the keyword VaginalRing.csv there are a total of 15909"
[1] "In total we have 107595 initial tweets for the category TheRing/"
[1] "Without duplicates, in total we have 72886 initial tweets for the category TheRing/"

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    9.0    68.0   100.0   105.3   134.0   630.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   4.00   54.00   78.00   86.71  110.00  288.00 

[1] "In total we have 72261 tweets that do not contain the keyword as username"
[1] "In total we have 64647 tweets without duplicates"
[1] "In total we have 56283 tweets that refer to a unique category"


In [13]:
#############
# The Patch #
#############
# Pipeline for the "ThePatch/" category: merge the per-keyword files, drop
# empty and duplicated tweets, exclude tweets about male or emergency
# contraception, clean the text, filter by username and by unique category,
# and save the semi-clean and clean tables.
# Snapshots kept for later cells: patchNoClean, patchSemiClean, patch.

folder_name <- "ThePatch/"

#1st. Unify the tweets
tweets <-  unifyFiles( folder_path = "/home/ec2-user/SageMaker/Tweets/", 
                       folder_name = folder_name )

tweets <- tweets[ tweets$text != "", ]
print( paste0( "In total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

tweets <- tweets[!duplicated(tweets$text),]
print( paste0( "Without duplicates, in total we have ", nrow( tweets ), " initial tweets for the category ", folder_name ))

# Row subsetting below keeps the column type, so one conversion is enough
tweets$text <- as.character( tweets$text )

patchNoClean <- tweets

#2nd. remove male BC, then count tweets
# The leading space in each pattern keeps "female ..." from matching.
maleContraception <- grep(" male contraception", tolower(tweets$text))
maleContraceptive <- grep(" male contraceptive", tolower(tweets$text))
maleBirthControl <- grep(" male birth control", tolower(tweets$text))

# unique(): a tweet matching several patterns must be counted only once
maleBCrows <- unique( c(maleContraception, maleContraceptive, maleBirthControl) )
print(paste0("There were ", length(maleBCrows), " tweets about male birth control"))

# Keep-mask instead of tweets[-maleBCrows, ]: with no matches, a negative
# index of length zero would select zero rows and silently drop every tweet.
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% maleBCrows ), ]
print( paste0( "Without male contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#3rd. remove emergency contraception, then count tweets
emergencyContraception <- grep(" emergency contraception", tolower(tweets$text))
emergencyContraceptive <- grep(" emergency contraceptive", tolower(tweets$text))
emergencyBirthControl <- grep(" emergency birth control", tolower(tweets$text))

emergencyContraceptionrows <- unique( c(emergencyContraception, emergencyContraceptive, emergencyBirthControl) )
print(paste0("There were ", length(emergencyContraceptionrows), " tweets about emergency contraception"))

# Same zero-match-safe removal as in the 2nd step
tweets <- tweets[ !( seq_len( nrow( tweets ) ) %in% emergencyContraceptionrows ), ]
print( paste0( "Without emergency contraception, in total we have ", nrow( tweets ), " tweets for the category ", folder_name ))

#4th. Clean tweet text
print("Length of the tweets before cleaning")
summary( nchar( tweets$text ) )
ourCleanTweets <- cleanTweets( tweets.df = tweets )

print("Length of the tweets after cleaning")
summary( nchar( ourCleanTweets$text ) )

#5th. Clean by username
ourCleanTweets <- usernameClean( tweets.df = ourCleanTweets, category = "Patch")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that do not contain the keyword as username" ))

#6th. Remove duplicates
# Cleaning can make previously different texts identical, so de-duplicate again
ourCleanTweets <- ourCleanTweets[!duplicated(ourCleanTweets$text),]
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets without duplicates" ))

#7th. Save the file to make Figures
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/SemiCleanAndAggregateTweets/Patch_SemiCleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

patchSemiClean <- ourCleanTweets

#8th. Remove tweets mentioning multiple categories
ourCleanTweets <- tweetsUniqueCategory( tweets.df = ourCleanTweets, category = "Patch")
print( paste0( "In total we have ", nrow(ourCleanTweets), " tweets that refer to a unique category" ))

#9th. Save the file to proceed to the AWS comprehend analysis
write.table( ourCleanTweets, 
            file = "/home/ec2-user/SageMaker/CleanAndAggregateTweets/Patch_CleanTweets.txt", 
            col.names = TRUE, 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE )

patch <- ourCleanTweets
rm(ourCleanTweets)

[1] "Reading the file BCPatch.csv"
[1] "For the keyword BCPatch.csv there are a total of 1957"
[1] "Reading the file BirthControlPatch.csv"
[1] "For the keyword BirthControlPatch.csv there are a total of 13362"
[1] "Reading the file ContraceptionPatch.csv"
[1] "For the keyword ContraceptionPatch.csv there are a total of 240"
[1] "Reading the file ContraceptivePatch.csv"
[1] "For the keyword ContraceptivePatch.csv there are a total of 3175"
[1] "Reading the file OrthoEvra.csv"
[1] "For the keyword OrthoEvra.csv there are a total of 3191"
[1] "Reading the file Xulane.csv"
[1] "For the keyword Xulane.csv there are a total of 610"
[1] "In total we have 22535 initial tweets for the category ThePatch/"
[1] "Without duplicates, in total we have 19656 initial tweets for the category ThePatch/"
[1] "There were 73 tweets about male birth control"
[1] "Without male contraception, in total we have 19583 tweets for the category ThePatch/"
[1] "There were 4 tweets about emergency contraception"
[1] 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    8.0    70.0    98.0   102.2   132.0   481.0 

[1] "Length of the tweets after cleaning"


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   52.00   78.00   83.31  110.00  282.00 

[1] "In total we have 19569 tweets that do not contain the keyword as username"
[1] "In total we have 16513 tweets without duplicates"
[1] "In total we have 14568 tweets that refer to a unique category"


In [14]:
#####################################
## Summary of the tweets extracted ##
#####################################

# Collect the per-category *NoClean tables, in the same order as before,
# and stack them row-wise into a single table of all extracted tweets.
noCleanTables <- list( copperNoClean, lngNoClean, IUD_NoClean,
                       implantNoClean, shotNoClean, pillNoClean,
                       ringNoClean, patchNoClean )
totalTweets <- do.call( rbind, noCleanTables )

# Rows and columns of the combined table
dim( totalTweets )

In [15]:
# Record the R version, platform, locale and package versions used for this run
sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Amazon Linux AMI 2018.03

Matrix products: default
BLAS/LAPACK: /home/ec2-user/anaconda3/envs/R/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] forcats_0.4.0   stringr_1.4.0   purrr_0.3.2     readr_1.3.1    
[5] tidyr_0.8.3     tibble_2.1.1    ggplot2_3.1.1   tidyverse_1.2.1
[9] dplyr_0.8.3    

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       cellranger_1.1.0 pillar_1.3.1     compiler_3.6.1  
 [5] plyr_1.8.4       base64en