# Analysis of the polarity results: numerical


This notebook contains the functions developed to analyze the polarity (sentiment) results:
- formatting the output of Amazon Comprehend
- creating a summary table for each category or group of categories (LARC, non-LARC). For each category or group of categories we obtain: 
    - total number of tweets analyzed
    - total number of confident tweets (Amazon Comprehend confidence score >= 0.95)
    - % of positive, negative, neutral and mixed confident tweets

When this notebook is run, each summary table is also saved to a file. 

In [17]:
#####################
# Load Libraries    #
#####################
#install.packages(c("ggplot2","plotly","dplyr", "reshape2"))
# dplyr provides inner_join(), which polarityAnalysis() relies on.
# ggplot2 and reshape2 are not used by the cells visible in this notebook section.
library("ggplot2")
library("dplyr")
library("reshape2")
# NOTE(review): removes every object from the current workspace (loaded packages stay attached).
# Anything defined before this cell is lost when it is re-run -- confirm this is intended.
rm(list=ls())

In [43]:
######################
# Polarity Analysis  #
######################
# Load the cleaned tweets and the Amazon Comprehend output for one or more
# categories and return them merged into a single data frame.
#
# categories: character vector of category names; each name is used to build
#             the two input file names ("<name>_CleanTweets.txt" and
#             "<name>_AWScomprehend_complete.csv").
#
# Returns the row-bound result of, per category, an inner join of the
# Comprehend output with the tweets (joined on the tweet text, column "tweets"),
# carrying the extra columns "date", "year" and "yearMonth".
# Stops with an error when `categories` is empty.
polarityAnalysis <- function( categories ){

    if( length(categories) == 0 ){
        stop( "'categories' must contain at least one category name", call. = FALSE )
    }

    # Read both files of a single category and join them on the tweet text.
    loadCategory <- function( category ){

        polarityInput <- read.delim( paste0( '/home/ec2-user/SageMaker/CleanAndAggregateTweets/', category, "_CleanTweets.txt"), header = TRUE)
        polarityInput <- polarityInput[, c("text", "date")]

        # Split the date once; element 1 is the year, element 2 the month.
        dateParts <- strsplit( as.character(polarityInput$date), "-")
        polarityInput$year      <- vapply( dateParts, '[', character(1), 1 )
        polarityInput$yearMonth <- paste0( polarityInput$year, "-", vapply( dateParts, '[', character(1), 2 ) )

        # Rename "text" so that it matches the key column of the Comprehend output.
        colnames(polarityInput)[1] <- "tweets"

        polarityOutput <- read.csv( paste0( '/home/ec2-user/SageMaker/AWScomprenhend_output/', category, "_AWScomprehend_complete.csv"), header = TRUE)

        inner_join( polarityOutput, polarityInput, by="tweets")
    }

    # Bind all categories in one step instead of growing the result inside a loop.
    do.call( rbind, lapply( categories, loadCategory ) )
}

In [44]:
###################
# Summary by year #
###################
# Default confidence threshold passed to polarityByYear() by the cells below.
confidenceScore <- 0.95

# Summarise sentiment results per year, save the counts and return a display table.
#
# polarityResults: data frame with the columns year, positive, negative, neutral,
#                  mixed (scores) and sentiments (label: "POSITIVE", "NEGATIVE",
#                  "NEUTRAL" or "MIXED").
# confidence:      a tweet is "confident" when at least one of its four scores
#                  is >= this value.
# outputPath:      prefix of the output file; it is concatenated as-is, so it
#                  must end with the path separator.
# outputName:      file name (without ".txt"); also written as the "category" column.
#
# Side effect: writes the per-year counts plus a "Total" row to
# paste0(outputPath, outputName, ".txt") as a tab-separated table.
#
# Returns a data frame of character columns (year, totalTweets, confidentTweets,
# positiveConfident, negativeConfident, neutralConfident, mixedConfident), one row
# per year plus a final "Total" row. The confident columns are formatted as
# "count (percent%)". A percentage whose denominator is 0 is reported as 0%.
polarityByYear <- function( polarityResults, confidence, outputPath, outputName ){
    
    print(paste0("There are a total of ", nrow(polarityResults), " tweets in this category") )
    years <- sort(unique( polarityResults$year))
    
    # which() drops comparisons that evaluate to NA, so tweets with missing
    # scores are left out instead of turning into all-NA rows.
    isConfident <- polarityResults$positive >= confidence | polarityResults$negative >= confidence | 
                   polarityResults$neutral >= confidence | polarityResults$mixed >= confidence
    selection <- polarityResults[ which( isConfident ), ]
    
    print(paste0("There are a total of ", nrow(selection), " tweets in this category with ", confidence, "confidence") )

    # Number of values falling in each entry of `years`; NA values are ignored.
    countByYear <- function( yearValues ){
        tabulate( match( yearValues, years ), nbins = length( years ) )
    }
    # Years of the confident tweets carrying the given sentiment label.
    yearsWithLabel <- function( label ){
        selection$year[ which( selection$sentiments == label ) ]
    }

    results <- data.frame( year              = as.character( years ),
                           totalTweets       = countByYear( polarityResults$year ),
                           confidentTweets   = countByYear( selection$year ),
                           positiveConfident = countByYear( yearsWithLabel( "POSITIVE" ) ),
                           negativeConfident = countByYear( yearsWithLabel( "NEGATIVE" ) ),
                           neutralConfident  = countByYear( yearsWithLabel( "NEUTRAL" ) ),
                           mixedConfident    = countByYear( yearsWithLabel( "MIXED" ) ),
                           stringsAsFactors  = FALSE )

    # Append the totals as a typed row so the count columns stay integer.
    countColumns <- setdiff( colnames( results ), "year" )
    totalValues  <- data.frame( year = "Total", lapply( results[ countColumns ], sum ), 
                                stringsAsFactors = FALSE )
    results <- rbind( results, totalValues )
    
    resultsToSave <- results
    resultsToSave$category <- outputName
    write.table( resultsToSave, file = paste0(outputPath, outputName, ".txt"), col.names = TRUE, 
                 row.names = FALSE, quote = FALSE,sep = "\t" )

    # Percentage rounded to 2 decimals; 0 when the denominator is 0 (avoids "NaN%").
    percentOf <- function( part, whole ){
        paste0( ifelse( whole > 0, round( ( part / whole ) * 100, 2 ), 0 ), "%" )
    }

    confidentCounts <- results$confidentTweets
    for( sentimentColumn in c( "positiveConfident", "negativeConfident", "neutralConfident", "mixedConfident" ) ){
        results[[ sentimentColumn ]] <- paste0( results[[ sentimentColumn ]], " (", 
                                               percentOf( results[[ sentimentColumn ]], confidentCounts ), ")" )
    }
    results$confidentTweets <- paste0( confidentCounts, " (", percentOf( confidentCounts, results$totalTweets ), ")" )

    # Returned as text, like the other columns of the display table.
    results$totalTweets <- as.character( results$totalTweets )

    results
}

In [45]:
##############
# Copper IUD #
##############
# Merge the copperIUD tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "copperIUD")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as copperIUD.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "copperIUD"
)
summaryResults

[1] "There are a total of 17577 tweets in this category"
[1] "There are a total of 4580 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,6,1 (16.67%),0 (0%),0 (0%),0 (0%),1 (100%)
2009,97,18 (18.56%),7 (38.89%),2 (11.11%),8 (44.44%),1 (5.56%)
2010,384,95 (24.74%),17 (17.89%),16 (16.84%),58 (61.05%),4 (4.21%)
2011,582,121 (20.79%),30 (24.79%),18 (14.88%),67 (55.37%),6 (4.96%)
2012,1473,381 (25.87%),76 (19.95%),140 (36.75%),137 (35.96%),28 (7.35%)
2013,1160,289 (24.91%),63 (21.8%),75 (25.95%),138 (47.75%),13 (4.5%)
2014,933,250 (26.8%),50 (20%),44 (17.6%),146 (58.4%),10 (4%)
2015,1023,260 (25.42%),64 (24.62%),62 (23.85%),114 (43.85%),20 (7.69%)
2016,1589,384 (24.17%),110 (28.65%),90 (23.44%),154 (40.1%),30 (7.81%)
2017,2316,645 (27.85%),163 (25.27%),179 (27.75%),258 (40%),45 (6.98%)


In [34]:
####################################
# Copper IUD all confidence scores #
####################################
# Re-load the merged copperIUD tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "copperIUD" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite copperIUD.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0,
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "copperIUD_allConfidence" )
summaryResults

[1] "There are a total of 17577 tweets in this category"
[1] "There are a total of 17577 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,6,6 (100%),2 (33.33%),2 (33.33%),1 (16.67%),1 (16.67%)
2009,97,97 (100%),20 (20.62%),37 (38.14%),34 (35.05%),6 (6.19%)
2010,384,384 (100%),57 (14.84%),91 (23.7%),223 (58.07%),13 (3.39%)
2011,582,582 (100%),117 (20.1%),140 (24.05%),292 (50.17%),33 (5.67%)
2012,1473,1473 (100%),248 (16.84%),537 (36.46%),603 (40.94%),85 (5.77%)
2013,1160,1160 (100%),225 (19.4%),338 (29.14%),549 (47.33%),48 (4.14%)
2014,933,933 (100%),164 (17.58%),267 (28.62%),467 (50.05%),35 (3.75%)
2015,1023,1023 (100%),220 (21.51%),309 (30.21%),444 (43.4%),50 (4.89%)
2016,1589,1589 (100%),360 (22.66%),539 (33.92%),608 (38.26%),82 (5.16%)
2017,2316,2316 (100%),506 (21.85%),780 (33.68%),904 (39.03%),126 (5.44%)


In [46]:
###########
# LNG IUD #
###########
# Merge the LNG-IUD tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "LNG-IUD")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as LNG-IUD.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "LNG-IUD"
)
summaryResults

[1] "There are a total of 11500 tweets in this category"
[1] "There are a total of 3729 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,6,1 (16.67%),0 (0%),0 (0%),0 (0%),1 (100%)
2009,174,53 (30.46%),5 (9.43%),8 (15.09%),37 (69.81%),3 (5.66%)
2010,381,106 (27.82%),18 (16.98%),15 (14.15%),69 (65.09%),4 (3.77%)
2011,481,123 (25.57%),10 (8.13%),17 (13.82%),92 (74.8%),4 (3.25%)
2012,890,267 (30%),15 (5.62%),62 (23.22%),183 (68.54%),7 (2.62%)
2013,1658,684 (41.25%),41 (5.99%),71 (10.38%),569 (83.19%),3 (0.44%)
2014,839,351 (41.84%),27 (7.69%),34 (9.69%),287 (81.77%),3 (0.85%)
2015,731,253 (34.61%),41 (16.21%),31 (12.25%),178 (70.36%),3 (1.19%)
2016,945,280 (29.63%),62 (22.14%),43 (15.36%),167 (59.64%),8 (2.86%)
2017,1235,355 (28.74%),98 (27.61%),51 (14.37%),183 (51.55%),23 (6.48%)


In [35]:
#################################
# LNG IUD all confidence scores #
#################################
# Re-load the merged LNG-IUD tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "LNG-IUD" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite LNG-IUD.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "LNG-IUD_allConfidence")
summaryResults

[1] "There are a total of 11500 tweets in this category"
[1] "There are a total of 11500 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,6,6 (100%),0 (0%),3 (50%),2 (33.33%),1 (16.67%)
2009,174,174 (100%),26 (14.94%),32 (18.39%),110 (63.22%),6 (3.45%)
2010,381,381 (100%),51 (13.39%),83 (21.78%),233 (61.15%),14 (3.67%)
2011,481,481 (100%),57 (11.85%),123 (25.57%),289 (60.08%),12 (2.49%)
2012,890,890 (100%),82 (9.21%),258 (28.99%),528 (59.33%),22 (2.47%)
2013,1658,1658 (100%),120 (7.24%),346 (20.87%),1172 (70.69%),20 (1.21%)
2014,839,839 (100%),80 (9.54%),158 (18.83%),584 (69.61%),17 (2.03%)
2015,731,731 (100%),126 (17.24%),132 (18.06%),454 (62.11%),19 (2.6%)
2016,945,945 (100%),167 (17.67%),215 (22.75%),532 (56.3%),31 (3.28%)
2017,1235,1235 (100%),271 (21.94%),307 (24.86%),594 (48.1%),63 (5.1%)


In [47]:
#############
# The Patch #
############
# Merge the Patch tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "Patch")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as Patch.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "Patch"
)
summaryResults

[1] "There are a total of 14568 tweets in this category"
[1] "There are a total of 4586 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,2,0 (0%),0 (NaN%),0 (NaN%),0 (NaN%),0 (NaN%)
2008,43,16 (37.21%),0 (0%),1 (6.25%),15 (93.75%),0 (0%)
2009,418,134 (32.06%),7 (5.22%),38 (28.36%),84 (62.69%),5 (3.73%)
2010,1749,562 (32.13%),19 (3.38%),102 (18.15%),427 (75.98%),14 (2.49%)
2011,3447,1099 (31.88%),21 (1.91%),247 (22.47%),812 (73.89%),19 (1.73%)
2012,2036,630 (30.94%),28 (4.44%),262 (41.59%),322 (51.11%),18 (2.86%)
2013,1434,419 (29.22%),24 (5.73%),185 (44.15%),196 (46.78%),14 (3.34%)
2014,1228,472 (38.44%),31 (6.57%),135 (28.6%),297 (62.92%),9 (1.91%)
2015,692,222 (32.08%),16 (7.21%),84 (37.84%),116 (52.25%),6 (2.7%)
2016,548,170 (31.02%),18 (10.59%),83 (48.82%),63 (37.06%),6 (3.53%)


In [36]:
###################################
# The Patch all confidence scores #
###################################
# Re-load the merged Patch tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "Patch" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite Patch.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "Patch_allConfidence")
summaryResults

[1] "There are a total of 14568 tweets in this category"
[1] "There are a total of 14568 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,2,2 (100%),1 (50%),0 (0%),1 (50%),0 (0%)
2008,43,43 (100%),2 (4.65%),6 (13.95%),35 (81.4%),0 (0%)
2009,418,418 (100%),37 (8.85%),151 (36.12%),213 (50.96%),17 (4.07%)
2010,1749,1749 (100%),124 (7.09%),509 (29.1%),1072 (61.29%),44 (2.52%)
2011,3447,3447 (100%),214 (6.21%),1069 (31.01%),2104 (61.04%),60 (1.74%)
2012,2036,2036 (100%),170 (8.35%),830 (40.77%),983 (48.28%),53 (2.6%)
2013,1434,1434 (100%),123 (8.58%),595 (41.49%),673 (46.93%),43 (3%)
2014,1228,1228 (100%),121 (9.85%),387 (31.51%),696 (56.68%),24 (1.95%)
2015,692,692 (100%),72 (10.4%),254 (36.71%),353 (51.01%),13 (1.88%)
2016,548,548 (100%),67 (12.23%),244 (44.53%),218 (39.78%),19 (3.47%)


In [48]:
############
# The Ring #
############
# Merge the Ring tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "Ring")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as Ring.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "Ring"
)
summaryResults

[1] "There are a total of 56283 tweets in this category"
[1] "There are a total of 13389 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,49,17 (34.69%),4 (23.53%),9 (52.94%),2 (11.76%),2 (11.76%)
2009,1451,359 (24.74%),49 (13.65%),127 (35.38%),143 (39.83%),40 (11.14%)
2010,3829,768 (20.06%),85 (11.07%),221 (28.78%),344 (44.79%),118 (15.36%)
2011,5813,1175 (20.21%),121 (10.3%),396 (33.7%),473 (40.26%),185 (15.74%)
2012,9879,2501 (25.32%),148 (5.92%),732 (29.27%),1340 (53.58%),281 (11.24%)
2013,5147,1112 (21.6%),106 (9.53%),300 (26.98%),552 (49.64%),154 (13.85%)
2014,6873,1579 (22.97%),146 (9.25%),453 (28.69%),856 (54.21%),124 (7.85%)
2015,4356,935 (21.46%),105 (11.23%),306 (32.73%),437 (46.74%),87 (9.3%)
2016,6267,1594 (25.43%),146 (9.16%),252 (15.81%),1105 (69.32%),91 (5.71%)
2017,3646,977 (26.8%),179 (18.32%),260 (26.61%),444 (45.45%),94 (9.62%)


In [37]:
##################################
# The Ring all confidence scores #
##################################
# Re-load the merged Ring tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "Ring" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite Ring.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "Ring_allConfidence")
summaryResults

[1] "There are a total of 56283 tweets in this category"
[1] "There are a total of 56283 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,49,49 (100%),8 (16.33%),20 (40.82%),16 (32.65%),5 (10.2%)
2009,1451,1451 (100%),243 (16.75%),464 (31.98%),627 (43.21%),117 (8.06%)
2010,3829,3829 (100%),595 (15.54%),1136 (29.67%),1750 (45.7%),348 (9.09%)
2011,5813,5813 (100%),775 (13.33%),1902 (32.72%),2594 (44.62%),542 (9.32%)
2012,9879,9879 (100%),1033 (10.46%),3294 (33.34%),4797 (48.56%),755 (7.64%)
2013,5147,5147 (100%),670 (13.02%),1554 (30.19%),2535 (49.25%),388 (7.54%)
2014,6873,6873 (100%),736 (10.71%),2354 (34.25%),3330 (48.45%),453 (6.59%)
2015,4356,4356 (100%),449 (10.31%),1622 (37.24%),2029 (46.58%),256 (5.88%)
2016,6267,6267 (100%),684 (10.91%),1324 (21.13%),4002 (63.86%),257 (4.1%)
2017,3646,3646 (100%),551 (15.11%),1127 (30.91%),1706 (46.79%),262 (7.19%)


In [49]:
###########
# Implant #
###########
# Merge the Implant tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "Implant")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as Implant.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "Implant"
)
summaryResults

[1] "There are a total of 76356 tweets in this category"
[1] "There are a total of 22724 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,19,2 (10.53%),0 (0%),0 (0%),2 (100%),0 (0%)
2009,392,123 (31.38%),11 (8.94%),22 (17.89%),88 (71.54%),2 (1.63%)
2010,1274,349 (27.39%),45 (12.89%),60 (17.19%),237 (67.91%),7 (2.01%)
2011,4409,1225 (27.78%),146 (11.92%),261 (21.31%),788 (64.33%),30 (2.45%)
2012,5918,1653 (27.93%),277 (16.76%),588 (35.57%),741 (44.83%),47 (2.84%)
2013,5754,1726 (30%),266 (15.41%),618 (35.81%),786 (45.54%),56 (3.24%)
2014,5881,1927 (32.77%),250 (12.97%),583 (30.25%),1039 (53.92%),55 (2.85%)
2015,6337,2065 (32.59%),292 (14.14%),608 (29.44%),1107 (53.61%),58 (2.81%)
2016,7294,2240 (30.71%),446 (19.91%),926 (41.34%),784 (35%),84 (3.75%)
2017,9075,2675 (29.48%),706 (26.39%),1118 (41.79%),713 (26.65%),138 (5.16%)


In [38]:
#################################
# Implant all confidence scores #
#################################
# Re-load the merged Implant tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "Implant" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite Implant.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "Implant_allConfidence")
summaryResults

[1] "There are a total of 76356 tweets in this category"
[1] "There are a total of 76356 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2008,19,19 (100%),3 (15.79%),4 (21.05%),12 (63.16%),0 (0%)
2009,392,392 (100%),55 (14.03%),92 (23.47%),234 (59.69%),11 (2.81%)
2010,1274,1274 (100%),185 (14.52%),297 (23.31%),762 (59.81%),30 (2.35%)
2011,4409,4409 (100%),594 (13.47%),1172 (26.58%),2534 (57.47%),109 (2.47%)
2012,5918,5918 (100%),1038 (17.54%),1966 (33.22%),2752 (46.5%),162 (2.74%)
2013,5754,5754 (100%),987 (17.15%),1927 (33.49%),2669 (46.39%),171 (2.97%)
2014,5881,5881 (100%),966 (16.43%),1929 (32.8%),2825 (48.04%),161 (2.74%)
2015,6337,6337 (100%),933 (14.72%),2044 (32.26%),3183 (50.23%),177 (2.79%)
2016,7294,7294 (100%),1336 (18.32%),2764 (37.89%),2937 (40.27%),257 (3.52%)
2017,9075,9075 (100%),1861 (20.51%),3707 (40.85%),3129 (34.48%),378 (4.17%)


In [50]:
############
# The pill #
############
# Merge the Pill tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "Pill")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as Pill.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "Pill"
)
summaryResults

[1] "There are a total of 90836 tweets in this category"
[1] "There are a total of 20848 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,14,2 (14.29%),0 (0%),0 (0%),2 (100%),0 (0%)
2008,114,21 (18.42%),0 (0%),6 (28.57%),15 (71.43%),0 (0%)
2009,1423,354 (24.88%),17 (4.8%),49 (13.84%),280 (79.1%),8 (2.26%)
2010,5092,1354 (26.59%),65 (4.8%),129 (9.53%),1129 (83.38%),31 (2.29%)
2011,7686,1906 (24.8%),103 (5.4%),337 (17.68%),1392 (73.03%),74 (3.88%)
2012,9770,2228 (22.8%),124 (5.57%),545 (24.46%),1468 (65.89%),91 (4.08%)
2013,7941,1920 (24.18%),103 (5.36%),406 (21.15%),1353 (70.47%),58 (3.02%)
2014,5697,1381 (24.24%),89 (6.44%),276 (19.99%),972 (70.38%),44 (3.19%)
2015,5591,1444 (25.83%),74 (5.12%),268 (18.56%),1068 (73.96%),34 (2.35%)
2016,6404,1525 (23.81%),99 (6.49%),319 (20.92%),1066 (69.9%),41 (2.69%)


In [39]:
##################################
# The pill all confidence scores #
##################################
# Re-load the merged Pill tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "Pill" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite Pill.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "Pill_allConfidence")
summaryResults

[1] "There are a total of 90836 tweets in this category"
[1] "There are a total of 90836 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,14,14 (100%),0 (0%),2 (14.29%),12 (85.71%),0 (0%)
2008,114,114 (100%),6 (5.26%),40 (35.09%),67 (58.77%),1 (0.88%)
2009,1423,1423 (100%),149 (10.47%),353 (24.81%),886 (62.26%),35 (2.46%)
2010,5092,5092 (100%),564 (11.08%),983 (19.3%),3435 (67.46%),110 (2.16%)
2011,7686,7686 (100%),892 (11.61%),2020 (26.28%),4525 (58.87%),249 (3.24%)
2012,9770,9770 (100%),944 (9.66%),3029 (31%),5482 (56.11%),315 (3.22%)
2013,7941,7941 (100%),740 (9.32%),2442 (30.75%),4537 (57.13%),222 (2.8%)
2014,5697,5697 (100%),552 (9.69%),1697 (29.79%),3296 (57.86%),152 (2.67%)
2015,5591,5591 (100%),474 (8.48%),1616 (28.9%),3394 (60.7%),107 (1.91%)
2016,6404,6404 (100%),525 (8.2%),2167 (33.84%),3572 (55.78%),140 (2.19%)


In [51]:
###########
# TheShot #
###########
# Merge the Shot tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "Shot")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as Shot.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "Shot"
)
summaryResults

[1] "There are a total of 117907 tweets in this category"
[1] "There are a total of 30580 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,2,1 (50%),0 (0%),0 (0%),1 (100%),0 (0%)
2008,50,10 (20%),0 (0%),4 (40%),6 (60%),0 (0%)
2009,1426,382 (26.79%),33 (8.64%),102 (26.7%),241 (63.09%),6 (1.57%)
2010,4369,1080 (24.72%),97 (8.98%),401 (37.13%),551 (51.02%),31 (2.87%)
2011,11755,2775 (23.61%),277 (9.98%),1510 (54.41%),907 (32.68%),81 (2.92%)
2012,19490,4802 (24.64%),422 (8.79%),3059 (63.7%),1186 (24.7%),135 (2.81%)
2013,17055,4322 (25.34%),361 (8.35%),2849 (65.92%),991 (22.93%),121 (2.8%)
2014,13074,3614 (27.64%),318 (8.8%),2359 (65.27%),859 (23.77%),78 (2.16%)
2015,9967,2776 (27.85%),205 (7.38%),1599 (57.6%),925 (33.32%),47 (1.69%)
2016,9476,2529 (26.69%),259 (10.24%),1572 (62.16%),572 (22.62%),126 (4.98%)


In [40]:
##################################
# The Shot all confidence scores #
##################################
# Re-load the merged Shot tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "Shot" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite Shot.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "Shot_allConfidence")
summaryResults

[1] "There are a total of 117907 tweets in this category"
[1] "There are a total of 117907 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,2,2 (100%),0 (0%),1 (50%),1 (50%),0 (0%)
2008,50,50 (100%),6 (12%),19 (38%),23 (46%),2 (4%)
2009,1426,1426 (100%),158 (11.08%),472 (33.1%),763 (53.51%),33 (2.31%)
2010,4369,4369 (100%),516 (11.81%),1599 (36.6%),2122 (48.57%),132 (3.02%)
2011,11755,11755 (100%),1534 (13.05%),5166 (43.95%),4750 (40.41%),305 (2.59%)
2012,19490,19490 (100%),2607 (13.38%),9706 (49.8%),6636 (34.05%),541 (2.78%)
2013,17055,17055 (100%),2068 (12.13%),8915 (52.27%),5665 (33.22%),407 (2.39%)
2014,13074,13074 (100%),1604 (12.27%),6867 (52.52%),4304 (32.92%),299 (2.29%)
2015,9967,9967 (100%),1084 (10.88%),4972 (49.88%),3702 (37.14%),209 (2.1%)
2016,9476,9476 (100%),1258 (13.28%),4871 (51.4%),3039 (32.07%),308 (3.25%)


In [52]:
#######
# IUD #
#######
# Merge the IUD tweets with their sentiment scores and show the table size.
polarityResultsByYear <- polarityAnalysis(categories = "IUD")
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as IUD.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "IUD"
)
summaryResults

[1] "There are a total of 280037 tweets in this category"
[1] "There are a total of 60277 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,13,2 (15.38%),1 (50%),0 (0%),1 (50%),0 (0%)
2008,265,52 (19.62%),7 (13.46%),19 (36.54%),23 (44.23%),3 (5.77%)
2009,509,131 (25.74%),16 (12.21%),29 (22.14%),81 (61.83%),5 (3.82%)
2010,7021,1692 (24.1%),300 (17.73%),466 (27.54%),841 (49.7%),85 (5.02%)
2011,14268,3417 (23.95%),542 (15.86%),900 (26.34%),1820 (53.26%),155 (4.54%)
2012,20051,4441 (22.15%),762 (17.16%),1364 (30.71%),2082 (46.88%),233 (5.25%)
2013,19026,4548 (23.9%),762 (16.75%),1461 (32.12%),2128 (46.79%),197 (4.33%)
2014,24814,5439 (21.92%),710 (13.05%),1482 (27.25%),2998 (55.12%),249 (4.58%)
2015,23265,5170 (22.22%),729 (14.1%),1435 (27.76%),2775 (53.68%),231 (4.47%)
2016,29167,5611 (19.24%),997 (17.77%),2035 (36.27%),2201 (39.23%),378 (6.74%)


In [41]:
#############################
# IUD all confidence scores #
#############################
# Re-load the merged IUD tweets and show the table size.
polarityResultsByYear <- polarityAnalysis( categories = "IUD" )
dim(polarityResultsByYear)

# confidence = 0 means no tweet is filtered out by score. The summary is saved under
# its own name so it does not overwrite IUD.txt, the file written for the
# confidenceScore threshold with the same outputPath.
summaryResults <- polarityByYear( polarityResults = polarityResultsByYear, confidence = 0, 
                                  outputPath = "/home/ec2-user/SageMaker/polarityPerYear/", outputName = "IUD_allConfidence")
summaryResults

[1] "There are a total of 280037 tweets in this category"
[1] "There are a total of 280037 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,13,13 (100%),4 (30.77%),4 (30.77%),4 (30.77%),1 (7.69%)
2008,265,265 (100%),40 (15.09%),101 (38.11%),115 (43.4%),9 (3.4%)
2009,509,509 (100%),87 (17.09%),153 (30.06%),250 (49.12%),19 (3.73%)
2010,7021,7021 (100%),1102 (15.7%),2245 (31.98%),3377 (48.1%),297 (4.23%)
2011,14268,14268 (100%),2212 (15.5%),4374 (30.66%),7141 (50.05%),541 (3.79%)
2012,20051,20051 (100%),3238 (16.15%),6887 (34.35%),9093 (45.35%),833 (4.15%)
2013,19026,19026 (100%),2880 (15.14%),6495 (34.14%),8919 (46.88%),732 (3.85%)
2014,24814,24814 (100%),3023 (12.18%),8235 (33.19%),12711 (51.23%),845 (3.41%)
2015,23265,23265 (100%),3116 (13.39%),7496 (32.22%),11841 (50.9%),812 (3.49%)
2016,29167,29167 (100%),4127 (14.15%),11055 (37.9%),12764 (43.76%),1221 (4.19%)


## Now we also create the same table for LARC and non-LARC

In [13]:
########
# LARC #
#######
# Merge the tweets of the four LARC categories and show the table size.
larcCategories <- c("IUD", "copperIUD", "LNG-IUD", "Implant")
polarityResultsByYear <- polarityAnalysis(categories = larcCategories)
rm(larcCategories)
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as LARC.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "LARC"
)
summaryResults

[1] "There are a total of 385470 tweets in this category"
[1] "There are a total of 91310 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,13,2 (15.38%),1 (50%),0 (0%),1 (50%),0 (0%)
2008,296,56 (18.92%),7 (12.5%),19 (33.93%),25 (44.64%),5 (8.93%)
2009,1172,325 (27.73%),39 (12%),61 (18.77%),214 (65.85%),11 (3.38%)
2010,9060,2242 (24.75%),380 (16.95%),557 (24.84%),1205 (53.75%),100 (4.46%)
2011,19740,4886 (24.75%),728 (14.9%),1196 (24.48%),2767 (56.63%),195 (3.99%)
2012,28332,6742 (23.8%),1130 (16.76%),2154 (31.95%),3143 (46.62%),315 (4.67%)
2013,27598,7247 (26.26%),1132 (15.62%),2225 (30.7%),3621 (49.97%),269 (3.71%)
2014,32467,7967 (24.54%),1037 (13.02%),2143 (26.9%),4470 (56.11%),317 (3.98%)
2015,31356,7748 (24.71%),1126 (14.53%),2136 (27.57%),4174 (53.87%),312 (4.03%)
2016,38995,8515 (21.84%),1615 (18.97%),3094 (36.34%),3306 (38.83%),500 (5.87%)


In [14]:
############
# Non-LARC #
############
# Merge the tweets of the four non-LARC categories and show the table size.
nonLarcCategories <- c("Shot", "Pill", "Ring", "Patch")
polarityResultsByYear <- polarityAnalysis(categories = nonLarcCategories)
rm(nonLarcCategories)
dim(polarityResultsByYear)

# Per-year summary at the confidenceScore threshold; also saved as NonLARC.txt.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "NonLARC"
)
summaryResults

[1] "There are a total of 279594 tweets in this category"
[1] "There are a total of 69403 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,18,3 (16.67%),0 (0%),0 (0%),3 (100%),0 (0%)
2008,256,64 (25%),4 (6.25%),20 (31.25%),38 (59.38%),2 (3.12%)
2009,4718,1229 (26.05%),106 (8.62%),316 (25.71%),748 (60.86%),59 (4.8%)
2010,15039,3764 (25.03%),266 (7.07%),853 (22.66%),2451 (65.12%),194 (5.15%)
2011,28701,6955 (24.23%),522 (7.51%),2490 (35.8%),3584 (51.53%),359 (5.16%)
2012,41175,10161 (24.68%),722 (7.11%),4598 (45.25%),4316 (42.48%),525 (5.17%)
2013,31577,7773 (24.62%),594 (7.64%),3740 (48.12%),3092 (39.78%),347 (4.46%)
2014,26872,7046 (26.22%),584 (8.29%),3223 (45.74%),2984 (42.35%),255 (3.62%)
2015,20606,5377 (26.09%),400 (7.44%),2257 (41.98%),2546 (47.35%),174 (3.24%)
2016,22695,5818 (25.64%),522 (8.97%),2226 (38.26%),2806 (48.23%),264 (4.54%)


In [33]:
#########################################################################
# Chi square test of proportions for sentiments about LARC vs. non-LARC #
#########################################################################
# Hard-coded tweet counts per sentiment; each vector holds two groups
# (first element LARC, second element non-LARC, per the banner above).
positiveTweets <- c(17939, 7086)
negativeTweets <- c(35749, 29590)
neutralTweets <- c(31771, 29506)

# 2 x 3 table: all three sentiments.
df1 <- data.frame(positiveTweets = positiveTweets,
                  negativeTweets = negativeTweets,
                  neutralTweets = neutralTweets)
df1

chisq.test(df1)

# 2 x 2 table: negative vs. neutral.
df2 <- df1[, c("negativeTweets", "neutralTweets")]
df2

chisq.test(df2)

# 2 x 2 table: positive vs. negative.
df3 <- df1[, c("positiveTweets", "negativeTweets")]
df3

chisq.test(df3)

# 2 x 2 table: positive vs. neutral.
df4 <- df1[, c("positiveTweets", "neutralTweets")]
df4

chisq.test(df4)

positiveTweets,negativeTweets,neutralTweets
17939,35749,31771
7086,29590,29506



	Pearson's Chi-squared test

data:  df1
X-squared = 2968.5, df = 2, p-value < 2.2e-16


negativeTweets,neutralTweets
35749,31771
29590,29506



	Pearson's Chi-squared test with Yates' continuity correction

data:  df2
X-squared = 104.16, df = 1, p-value < 2.2e-16


positiveTweets,negativeTweets
17939,35749
7086,29590



	Pearson's Chi-squared test with Yates' continuity correction

data:  df3
X-squared = 2160.6, df = 1, p-value < 2.2e-16


positiveTweets,neutralTweets
17939,31771
7086,29506



	Pearson's Chi-squared test with Yates' continuity correction

data:  df4
X-squared = 2861.9, df = 1, p-value < 2.2e-16


In [None]:
# IUD chi-square tests on hard-coded tweet counts.
# Each call passes a single two-count vector, so chisq.test() runs a goodness-of-fit
# test against equal proportions (its default when `p` is not supplied).
# positive vs. negative
chisq.test(c(10602, 25039))
# positive vs. neutral
chisq.test(c(10602, 20314))
# negative vs. neutral
chisq.test(c(25039,20314))

In [None]:
# LNG-IUD: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(920, 590))
# positive vs. neutral
chisq.test(x = c(920, 2071))
# negative vs. neutral
chisq.test(x = c(590, 2071))

In [None]:
# Copper IUD: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(1391, 1492))
# positive vs. neutral
chisq.test(x = c(1391, 1371))
# negative vs. neutral
chisq.test(x = c(1492, 1371))

In [None]:
# Implant: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(5026, 8628))
# positive vs. neutral
chisq.test(x = c(5026, 8015))
# negative vs. neutral
chisq.test(x = c(8628, 8015))

In [None]:
# LARC (IUD + LNG-IUD + copper IUD + implant counts from the cells above):
# pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(17939, 35749))
# positive vs. neutral
chisq.test(x = c(17939, 31771))
# negative vs. neutral
chisq.test(x = c(35749, 31771))

In [None]:
# Pill: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(1679, 5670))
# positive vs. neutral
chisq.test(x = c(1679, 12792))
# negative vs. neutral
chisq.test(x = c(5670, 12792))

In [None]:
# Patch: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(335, 1455))
# positive vs. neutral
chisq.test(x = c(335, 2663))
# negative vs. neutral
chisq.test(x = c(1455, 2663))

In [None]:
# Ring: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(1928, 3660))
# positive vs. neutral
chisq.test(x = c(1928, 6353))
# negative vs. neutral
chisq.test(x = c(3660, 6353))

In [None]:
# Shot: pairwise chi-square tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(3144, 18805))
# positive vs. neutral
chisq.test(x = c(3144, 7698))
# negative vs. neutral
chisq.test(x = c(18805, 7698))

In [None]:
#SARC Chi Square#
# SARC totals are the sums of the Pill, Patch, Ring and Shot counts used in
# the cells above:
#   positive: 1679 + 335 + 1928 + 3144   = 7086
#   negative: 5670 + 1455 + 3660 + 18805 = 29590
#   neutral : 12792 + 2663 + 6353 + 7698 = 29506
# The positive count was previously 3144 (the Shot-only value); 7086 is the
# SARC total and matches positiveTweets[2] in the LARC vs. non-LARC cell.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
#positive-negative
chisq.test(c(7086, 29590))
#positive-neutral
chisq.test(c(7086, 29506))
#negative-neutral
chisq.test(c(29590, 29506))

In [None]:
# All methods (LARC + SARC totals from the cells above): pairwise chi-square
# tests on sentiment tweet counts.
# Given a single count vector, chisq.test() performs a goodness-of-fit test
# against equal proportions (its default `p`).
# positive vs. negative
chisq.test(x = c(25025, 65339))
# positive vs. neutral
chisq.test(x = c(25025, 61277))
# negative vs. neutral
chisq.test(x = c(65339, 61277))

In [16]:
#################################################
# All Methods: per-year polarity (thresholded)  #
#################################################
# Run polarityAnalysis() over every contraceptive category at once.
polarityResultsByYear <- polarityAnalysis(
    categories = c( "IUD", "copperIUD", "LNG-IUD", "Implant", "Shot", "Pill", "Ring", "Patch")
)
dim(polarityResultsByYear)

# Build the per-year summary using the `confidenceScore` threshold, which is
# defined elsewhere in the notebook (the recorded run printed 0.95).
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = confidenceScore,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "AllTweets"
)
summaryResults

[1] "There are a total of 665064 tweets in this category"
[1] "There are a total of 160713 tweets in this category with 0.95confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,31,5 (16.13%),1 (20%),0 (0%),4 (80%),0 (0%)
2008,552,120 (21.74%),11 (9.17%),39 (32.5%),63 (52.5%),7 (5.83%)
2009,5890,1554 (26.38%),145 (9.33%),377 (24.26%),962 (61.9%),70 (4.5%)
2010,24099,6006 (24.92%),646 (10.76%),1410 (23.48%),3656 (60.87%),294 (4.9%)
2011,48441,11841 (24.44%),1250 (10.56%),3686 (31.13%),6351 (53.64%),554 (4.68%)
2012,69507,16903 (24.32%),1852 (10.96%),6752 (39.95%),7459 (44.13%),840 (4.97%)
2013,59175,15020 (25.38%),1726 (11.49%),5965 (39.71%),6713 (44.69%),616 (4.1%)
2014,59339,15013 (25.3%),1621 (10.8%),5366 (35.74%),7454 (49.65%),572 (3.81%)
2015,51962,13125 (25.26%),1526 (11.63%),4393 (33.47%),6720 (51.2%),486 (3.7%)
2016,61690,14333 (23.23%),2137 (14.91%),5320 (37.12%),6112 (42.64%),764 (5.33%)


In [42]:
#####################################################
# All Methods: per-year polarity (confidence = 0)   #
#####################################################
# Run polarityAnalysis() over every contraceptive category at once.
polarityResultsByYear <- polarityAnalysis(
    categories = c( "IUD", "copperIUD", "LNG-IUD", "Implant", "Shot", "Pill", "Ring", "Patch")
)
dim(polarityResultsByYear)

# Same summary as the thresholded run, but with confidence = 0; in the
# recorded output every tweet counts as "confident" (100% per year).
# NOTE(review): this reuses outputName = "AllTweets" from the thresholded
# run. If polarityByYear() builds its file name only from outputPath and
# outputName, this call overwrites that file -- confirm against the function.
summaryResults <- polarityByYear(
    polarityResults = polarityResultsByYear,
    confidence      = 0,
    outputPath      = "/home/ec2-user/SageMaker/polarityPerYear/",
    outputName      = "AllTweets"
)
summaryResults

[1] "There are a total of 665064 tweets in this category"
[1] "There are a total of 665064 tweets in this category with 0confidence"


year,totalTweets,confidentTweets,positiveConfident,negativeConfident,neutralConfident,mixedConfident
2007,31,31 (100%),5 (16.13%),7 (22.58%),18 (58.06%),1 (3.23%)
2008,552,552 (100%),67 (12.14%),195 (35.33%),271 (49.09%),19 (3.44%)
2009,5890,5890 (100%),775 (13.16%),1754 (29.78%),3117 (52.92%),244 (4.14%)
2010,24099,24099 (100%),3194 (13.25%),6943 (28.81%),12974 (53.84%),988 (4.1%)
2011,48441,48441 (100%),6395 (13.2%),15966 (32.96%),24229 (50.02%),1851 (3.82%)
2012,69507,69507 (100%),9360 (13.47%),26507 (38.14%),30874 (44.42%),2766 (3.98%)
2013,59175,59175 (100%),7813 (13.2%),22612 (38.21%),26719 (45.15%),2031 (3.43%)
2014,59339,59339 (100%),7246 (12.21%),21894 (36.9%),28213 (47.55%),1986 (3.35%)
2015,51962,51962 (100%),6474 (12.46%),18445 (35.5%),25400 (48.88%),1643 (3.16%)
2016,61690,61690 (100%),8524 (13.82%),23179 (37.57%),27672 (44.86%),2315 (3.75%)


In [15]:
# Record the R version, platform and loaded packages for reproducibility.
utils::sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Amazon Linux AMI 2018.03

Matrix products: default
BLAS/LAPACK: /home/ec2-user/anaconda3/envs/R/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_0.8.3   ggplot2_3.1.1

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       magrittr_1.5     tidyselect_0.2.5 munsell_0.5.0   
 [5] uuid_0.1-2       colorspace_1.4-1 R6_2.4.0         rlang_0.4.2     
 [9] plyr_1.8.4       tools_3.6.1      grid_3.6.1       gtable_0.3.0    
[13] wit