# Build Structural Topic Model on NEH Grant Products

<b>Author:</b> Jaren Haber, PhD </br>
<b>Date</b>: September 16, 2023 </br>
<b>Description</b>: Text <br/>
<b>Sources</b>: Code borrowed from [my team's work here](https://github.com/comp-strat/text_analysis) and [this vignette for STM in R](https://cran.r-project.org/web/packages/stm/vignettes/stmVignette.pdf)

## Initialize

In [6]:
# Load packages
library('dplyr')
library('stm')
library('tibble')
library('tidyverse')
library('rhdf5')
library('data.table')


Attaching package: ‘data.table’


The following object is masked from ‘package:purrr’:

    transpose


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [2]:
# Set working directory to data folder
setwd('../data/')
getwd()

## Import data

In [14]:
data = fread('NEH_GrantProducts_Combined.tsv', sep='\t')
str(data)

Classes ‘data.table’ and 'data.frame':	10514 obs. of  4 variables:
 $ ID         : int  72 73 74 75 91 92 93 94 95 96 ...
 $ Abstract   : chr  "This collection consists of recordings made by Dra. Lastra from 1959 to the present day of Mesoamerican languag"| __truncated__ "This collection consists of recordings made by Dra. Cervantes from 1986 to 2006 of the Bribri language of Costa"| __truncated__ "This collection consists of audio and video recordings made by Dr. Hopkins from 1964 to 2002 of the Mexican May"| __truncated__ "This collection consists of audio recordings, transcriptions, translations and other field notes and articles c"| __truncated__ ...
 $ Year       : int  2010 2010 2010 2010 2009 2009 2009 2009 2008 2008 ...
 $ ProductType: chr  "Collections" "Collections" "Collections" "Collections" ...
 - attr(*, ".internal.selfref")=<externalptr> 


In [15]:
# check whether rows contain any NAs
row.has.na <- apply(data, 1, function(x){any(is.na(x))})
sum(row.has.na)

## Prepare text data

In [16]:
processed <- textProcessor(data$Abstract, metadata = data)

Building corpus... 
Converting to Lower Case... 
Removing punctuation... 
Removing stopwords... 
Removing numbers... 
Stemming... 


ERROR: Error in `[.data.table`(metadata, , i): j (the 2nd argument inside [...]) is a single symbol but column name 'i' is not found. Perhaps you intended DT[, ..i]. This difference to data.frame is deliberate and explained in FAQ 1.1.


In [17]:
processed

ERROR: Error in eval(expr, envir, enclos): object 'processed' not found


In [None]:
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

In [None]:
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

In [None]:
plotRemoved(processed$documents, lower.thresh = seq(10, 210, by = 50))

In [None]:
out <- prepDocuments(processed$documents, processed$vocab,processed$meta, lower.thresh = 20)

## Estimate Structural Topic Model

In [None]:
model <- stm(documents = out$documents, 
             vocab = out$vocab, 
             K = 20, 
             prevalence =~ Year + Gender, 
             max.em.its = 25, 
             data = out$meta, 
             init.type = "Spectral")

## Examine words and example documents

In [None]:
# Explore the words associated with each topic
labelTopics(model, c(1:20))

In [None]:
# examine documents that are highly associated with topics
thoughts <- findThoughts(model, texts = shortdoc, n = 2, topics = 3)$docs[[1]]

In [None]:
thoughts20 <- findThoughts(charterPrevFit, texts = shortdoc, n = 2, topics = 20)$docs[[1]]

In [None]:
# Estimate metadata/topic relationships
par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5))

In [None]:
plotQuote(thoughts3, width = 30, main = "Topic 3")

In [None]:
plotQuote(thoughts20, width = 30, main = "Topic 20")

In [None]:
out$meta$rating <- as.factor(out$meta$rating)

In [None]:
prep <- estimateEffect(1:20 ~ rating + s(day), charterPrevFit, meta = out$meta, uncertainty = "Global")

In [None]:
summary(prep, topics=1)

## Visualize STM

In [None]:
# Summary visualization
plot(model, type = "summary", xlim = c(0, 1))
# Graphical display of estimated topic proportions.

In [None]:
# Topical content
plot(prep, covariate = "rating", topics = c(3, 7, 20),model = model, 
        method = "difference", cov.value1 = "Liberal", cov.value2 = "Conservative",
        xlab = "More Conservative ... More Liberal", main = "Effect of Liberal vs. Conservative",
        xlim = c(-.1, .1), labeltype = "custom", custom.labels = c('Obama', 'Sarah Palin','Bush Presidency'))

In [None]:
plot(prep, "day", method = "continuous", topics = 7, model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")

In [None]:
yearseq <- seq(from = as.Date("1970-01-01"),to = as.Date("2023-12-01"), by = "year")

In [None]:
years <- years(yearseq)
axis(1,at = as.numeric(yearseq) - min(as.numeric(yearseq)),labels = years)

# Graphical display of topic prevalence.

In [None]:
plot(model, type = "perspectives", topics = 11)

# Graphical display of topical perspectives

In [None]:
plot(model, type = "perspectives", topics = c(12, 20))

# Graphical display of topical contrast between topics 12 and 20.