# PC Session 4

**Author:**
[Helge Liebert](https://hliebert.github.io/)

# **Text analysis**: Brexit debate

### Libraries

In [None]:
library("twfy")
library("jsonlite")
library("topicmodels")
library("textclean")
library("wordcloud")
library("slam")
library("tm")
library("data.table")
library("tidytext")
library("stringr")
library("dplyr")
library("ggplot2")
library("ggrepel")
library("uwot")
library("udpipe")
library("lsa")
library("factoextra")
library("word2vec")
library("plotly")
library("fpc")
library("text2vec")
library("doc2vec")

### Get data from TheyWorkForYou API

In [None]:
## NOTE(review): an API key is committed in this notebook. Prefer supplying it
## through the TWFY_API_KEY environment variable (e.g. in ~/.Renviron) and
## rotate the embedded key; the literal is kept only as a fallback so that
## existing setups keep working.
apikey <- Sys.getenv("TWFY_API_KEY", unset = "G3WVqtBtKAbdGVqrd8BKajm8")
set_api_key(apikey)

## first request: used to learn how many result pages exist
call <- getDebates(type = "commons", search = "Brexit", num = 1000, page = 1)
info <- call$info
pages <- ceiling(info$total_results / info$results_per_page)

## Form of data.frame, header only
brexit.debates <- flatten(as.data.frame(call$rows))[0, ]

## read all pages of results
## Pages are collected in a preallocated list and bound once at the end instead
## of growing the data frame with rbind() on every iteration. seq_len() also
## skips the loop cleanly when there are zero pages (seq(1, 0) would iterate
## over 1 and 0).
page.results <- vector("list", pages)
for (p in seq_len(pages)) {
  call <- getDebates(type = "commons", search = "Brexit", num = 1000, page = p)
  ## drop the nested speaker$office entry before flattening
  call$rows$speaker$office <- NULL
  page.results[[p]] <- flatten(as.data.frame(call$rows))
}
## rbind() on data frames ignores the zero-row header frame, so the result
## carries the columns of the downloaded pages
brexit.debates <- do.call(rbind, c(list(brexit.debates), page.results))

## save to file
fwrite(brexit.debates, "Data/brexit-debates.csv")
saveRDS(brexit.debates, "Data/brexit-debates.rds")

## read from file
## brexit.debates <- readRDS("Data/brexit-debates.rds")

## str(brexit.debates)
## length(unique(brexit.debates$gid))
## length(unique(brexit.debates$hdate))
## length(unique(brexit.debates$person_id))
names(brexit.debates)

## copy before transformations
brexit.debates$body.orig <- brexit.debates$body

In [None]:
## display the assembled debates table
brexit.debates

### Cleaning the text

Some cleaning and harmonizing. Pre-written functions (e.g. from `library(textclean)`) are more convenient than writing all the regular expressions yourself.

In [None]:
## brief check
## check_text(brexit.debates$body[1:100])
## Encoding(brexit.debates$body) <- "UTF-8"
## raw text before any cleaning
head(brexit.debates$body)

In [None]:
## textclean: replace HTML markup in the speech text
brexit.debates$body <- replace_html(brexit.debates$body)
head(brexit.debates$body)

In [None]:
## textclean: replace non-ASCII characters
brexit.debates$body <- replace_non_ascii(brexit.debates$body)
head(brexit.debates$body)

In [None]:
## the numeric entity &#8212; (em dash) becomes a spaced hyphen
brexit.debates$body <- gsub("&#8212;", " - ", brexit.debates$body)
head(brexit.debates$body)

In [None]:
## any remaining numeric entity with 3 or 4 digits is replaced by a space
brexit.debates$body <- gsub("&#[0-9]{3,4};", " ", brexit.debates$body)
head(brexit.debates$body)

In [None]:
## textclean: replace person names (default replacement)
brexit.debates$body <- replace_names(brexit.debates$body)
head(brexit.debates$body)

In [None]:
## The steps below run in this exact order, each on the output of the previous:
## money and dates are replaced by placeholder tokens before ordinals and
## numbers are handled, then punctuation spacing, contractions and whitespace
## are harmonized.
brexit.debates$body <- replace_money(brexit.debates$body, replacement = "MONEYHERE")
brexit.debates$body <- replace_date(brexit.debates$body, replacement = "DATEHERE")
brexit.debates$body <- replace_ordinal(brexit.debates$body, num.paste = TRUE)
brexit.debates$body <- replace_number(brexit.debates$body, remove = TRUE) ## makes a difference for topics!
brexit.debates$body <- add_comma_space(brexit.debates$body)
brexit.debates$body <- replace_contraction(brexit.debates$body)
brexit.debates$body <- replace_white(brexit.debates$body)
head(brexit.debates$body)

In [None]:
## remove blanks
## brexit.debates <- brexit.debates[brexit.debates$body != "", ]

### Topics, both parties

In [None]:
## Corpus: one document per debate contribution, identified by its gid
## corp <- brexit.debates
corp <- brexit.debates[, c("gid", "body")]
## rename both columns in one call to the doc_id/text pair used by DataframeSource
setnames(corp, old = c("gid", "body"), new = c("doc_id", "text"))

## initialize dtm (raw term frequencies; lower-cased, punctuation, numbers and
## English stopwords removed, terms of at least 3 characters, no stemming)
dtm <- DocumentTermMatrix(
  x = Corpus(DataframeSource(corp)),
  control = list(
    language = "english",
    ## weighting = weightTfIdf,
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)

## checks
inspect(dtm)
## findFreqTerms(dtm, lowfreq = 10)
findFreqTerms(dtm, lowfreq = 1000)
## dtm <- removeSparseTerms(dtm, sparse=0.90) ## filter some
## keep only documents with at least one remaining term (documents can't be empty)
dtm <- dtm[row_sums(dtm) > 0, ]

In [None]:
## Simple visualization
## wordcloud(brexit.debates$body, max.words = 100, random.order = FALSE,
##          colors = brewer.pal(8, "Dark2"))

## same plot, works with both tf and tf-idf weighting
## col_sums() (slam) sums the sparse document-term matrix directly; this avoids
## materializing the full dense matrix with as.matrix() just to take column
## sums, and matches the row_sums() call used when filtering empty documents
counts <- sort(col_sums(dtm), decreasing = TRUE)
counts <- data.frame(word = names(counts), freq = counts)
wordcloud(words = counts$word, freq = counts$freq,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

In [None]:
## Unsupervised clustering of documents: Topic model
## fit an LDA model with k = 3 topics; the seed is fixed through the control list
tpm <- LDA(dtm, k = 3, control = list(seed = 100))
## topics(tpm, 1): presumably the single most likely topic per document -- confirm against the topicmodels docs
topic <- topics(tpm, 1)
## terms(tpm, 50): the 50 highest-ranked terms of each topic, shown below
freqterms <- terms(tpm, 50)
freqterms



In [None]:
## look at unique terms only per topic
## every term that occurs more than once across the topic columns
duplicates <- as.vector(freqterms)
duplicates <- duplicates[duplicated(duplicates)]
## per topic column, keep the terms that are not among the repeated ones
distinctterms <- lapply(
  as.data.frame(freqterms),
  function(topicterms) topicterms[is.na(match(topicterms, duplicates))]
)
## distinctterms <- as.data.frame(distinctterms)
distinctterms

In [None]:
## Plot most frequent terms and associated probabilities by topic
## per-topic term probabilities (beta) in tidy form
tpmat <- tidy(tpm, matrix = "beta")
## the 20 highest-beta terms of every topic
topterms <-
    tpmat %>%
    group_by(topic) %>%
    top_n(20, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)
## reorder_within() orders the terms separately inside each topic facet; a
## plain reorder(term, beta) ranks terms globally, which mis-sorts the bars
## whenever a term appears in more than one topic. scale_x_reordered() strips
## the suffix that reorder_within() appends to the labels.
topterms %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    scale_x_reordered()


### Conservative Party Topics

In [None]:
## differences by party?
## number of contributions per speaker party
table(brexit.debates$speaker.party)

In [None]:
## Document-term matrix for Conservative speakers only.
## corp has one row per row of brexit.debates, so the party column can index it.
## %in% never returns NA: with ==, contributions whose party is missing would
## produce NA row indices and add all-NA rows to the corpus.
dtm.con <- DocumentTermMatrix(
  Corpus(DataframeSource(
    corp[brexit.debates$speaker.party %in% "Conservative", ]
  )),
  control = list(
    language = "english",
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)
dtm.con <- dtm.con[row_sums(dtm.con) > 0, ] ## documents can't be empty

In [None]:
## Estimate lda topic model
## same settings as the all-party model: k = 3 topics, seed fixed at 100
tpm.con <- LDA(dtm.con, k = 3, control = list(seed = 100))
topic.con <- topics(tpm.con, 1)
## the 50 highest-ranked terms of each topic, shown below
freqterms.con <- terms(tpm.con, 50)
freqterms.con

In [None]:
## Plot most frequent terms and associated probabilities by topic
## per-topic term probabilities (beta) in tidy form
tpmat.con <- tidy(tpm.con, matrix = "beta")
## the 20 highest-beta terms of every topic
topterms.con <-
    tpmat.con %>%
    group_by(topic) %>%
    top_n(20, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)
## reorder_within() orders the terms separately inside each topic facet; a
## plain reorder(term, beta) ranks terms globally, which mis-sorts the bars
## whenever a term appears in more than one topic. scale_x_reordered() strips
## the suffix that reorder_within() appends to the labels.
topterms.con %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    scale_x_reordered()

In [None]:
## look at unique terms only per topic
## every term that occurs more than once across the topic columns
duplicates.con <- as.vector(freqterms.con)
duplicates.con <- duplicates.con[duplicated(duplicates.con)]
## per topic column, keep the terms that are not among the repeated ones
distinctterms.con <- lapply(
  as.data.frame(freqterms.con),
  function(topicterms) topicterms[is.na(match(topicterms, duplicates.con))]
)
distinctterms.con

### Labour Party Topics

In [None]:
## Document-term matrix for Labour speakers only.
## corp has one row per row of brexit.debates, so the party column can index it.
## %in% never returns NA: with ==, contributions whose party is missing would
## produce NA row indices and add all-NA rows to the corpus.
dtm.lab <- DocumentTermMatrix(
  Corpus(DataframeSource(
    corp[brexit.debates$speaker.party %in% "Labour", ]
  )),
  control = list(
    language = "english",
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)
dtm.lab <- dtm.lab[row_sums(dtm.lab) > 0, ] ## documents can't be empty

In [None]:
## Estimate LDA topic model
## same settings as the all-party model: k = 3 topics, seed fixed at 100
tpm.lab <- LDA(dtm.lab, k = 3, control = list(seed = 100))
topic.lab <- topics(tpm.lab, 1)
## the 50 highest-ranked terms of each topic, shown below
freqterms.lab <- terms(tpm.lab, 50)
freqterms.lab

In [None]:
## Plot most frequent terms and associated probabilities by topic
## per-topic term probabilities (beta) in tidy form
tpmat.lab <- tidy(tpm.lab, matrix = "beta")
## the 20 highest-beta terms of every topic
topterms.lab <-
    tpmat.lab %>%
    group_by(topic) %>%
    top_n(20, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)
## reorder_within() orders the terms separately inside each topic facet; a
## plain reorder(term, beta) ranks terms globally, which mis-sorts the bars
## whenever a term appears in more than one topic. scale_x_reordered() strips
## the suffix that reorder_within() appends to the labels.
topterms.lab %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(term, beta, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    scale_x_reordered()

In [None]:
## look at unique terms only per topic
## every term that occurs more than once across the topic columns
duplicates.lab <- as.vector(freqterms.lab)
duplicates.lab <- duplicates.lab[duplicated(duplicates.lab)]
## per topic column, keep the terms that are not among the repeated ones
distinctterms.lab <- lapply(
  as.data.frame(freqterms.lab),
  function(topicterms) topicterms[is.na(match(topicterms, duplicates.lab))]
)
distinctterms.lab

### PCA/LSA

Since PCA works less well than other embeddings for word-similarity tasks and interpretation (when applied to the term-term matrix), let's apply it to the document-term matrix instead.

We aggregate the document-term matrix by speaker, then use PCA/LSA to obtain a lower-dimensional representation that helps assess document similarity. We then apply k-means clustering to this smaller representation to group politicians.

In [None]:
## check speakers and aggregate
## names(brexit.debates)
## length(unique((brexit.debates$speaker.name)))
## length(unique((brexit.debates$person_id)))
## nrow(unique((brexit.debates[, c("person_id", "speaker.party")])))

## aggregate, in base (more convenient with data.table; or dplyr if you must)
## brexit.speakers <- aggregate(body ~ speaker.name + person_id + speaker.party, data = brexit.debates, paste)
## collapse = " " is forwarded to paste() so that every speaker ends up with a
## single text string. Without it paste() returns the speeches unchanged as a
## vector, body becomes a list column, and later as.character() calls see the
## deparsed form c("...", "...") instead of the text itself.
brexit.speakers <- aggregate(body ~ speaker.name + person_id, data = brexit.debates,
                             FUN = paste, collapse = " ")

## dismiss those who have empty text
brexit.speakers <- brexit.speakers[brexit.speakers$body != "", ]
brexit.speakers <- brexit.speakers[!is.na(brexit.speakers$body), ]

In [None]:
## checks
## size and column names of the per-speaker table, then the table itself
dim(brexit.speakers)
names(brexit.speakers)
## str(brexit.speakers)
brexit.speakers

In [None]:
## corpus base: one document per speaker, with the speaker name as doc_id
corp <- brexit.speakers[, c("speaker.name", "body")]
## rename both columns in one call to the doc_id/text pair used by DataframeSource
setnames(corp, old = c("speaker.name", "body"), new = c("doc_id", "text"))

## initialize dtm (raw term frequencies; lower-cased, punctuation, numbers and
## English stopwords removed, terms of at least 3 characters, no stemming)
dtm <- DocumentTermMatrix(
  x = Corpus(DataframeSource(corp)),
  control = list(
    language = "english",
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)

inspect(dtm)

In [None]:
## how many words?
## str_count() with its default pattern ("") counts characters; word
## boundaries have to be requested explicitly to obtain a word count
brexit.speakers$wordcount <- str_count(as.character(brexit.speakers$body), boundary("word"))
summary(brexit.speakers$wordcount)
qplot(brexit.speakers$wordcount, bins = 100)
## zoom in on the bulk of the distribution: drop the top 5% of speakers
## (a relative cutoff, so it does not depend on the unit of the count)
qplot(brexit.speakers$wordcount[brexit.speakers$wordcount <
                                quantile(brexit.speakers$wordcount, probs = 0.95)],
      bins = 100)

In [None]:
## LSA on the document term matrix
## ls <- lsa(dtm)
## keep 2 dimensions
ls <- lsa(dtm, 2)
str(ls)
## NOTE(review): a document-term matrix (documents in rows) is passed to lsa(),
## so the tk component presumably holds one row per document -- confirm
pcs <- as.data.frame(ls$tk)

## if you want to recoup a matrix of the original dimensions
M <- as.textmatrix(ls)

In [None]:
## coordinates in the two retained LSA dimensions
pcs

In [None]:
## the same coordinates, with each dimension multiplied by its entry of ls$sk
as.matrix(pcs) %*% diag(ls$sk)

In [None]:
## kmeans clustering. Try three clusters (three main parties)
## kmeans() starts from randomly chosen centers; fix the seed so that the
## cluster assignment is reproducible across runs (same seed value as the LDA fits)
set.seed(100)
km <- kmeans(pcs, centers = 3)
## km
## str(km)
fviz_cluster(km, data = pcs)

In [None]:
## checks
## overall distribution, then the counts of two individual speakers
summary(brexit.speakers$wordcount)
brexit.speakers[brexit.speakers$speaker.name == "Jeremy Corbyn", "wordcount"]
brexit.speakers[brexit.speakers$speaker.name == "Valerie Vaz", "wordcount"]

In [None]:
## filter: rebuild the document-term matrix without the two outlier speakers
outliers <- c("Jeremy Corbyn", "Valerie Vaz")
dtm <- DocumentTermMatrix(
  Corpus(DataframeSource(
    corp[!(corp$doc_id %in% outliers), ]
  )),
  control = list(
    language = "english",
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)

## repeat lsa/km
ls <- lsa(dtm, 2)
pcs <- as.data.frame(ls$tk)
M <- as.textmatrix(ls)
## kmeans() uses random starting centers; seed it so the clusters are reproducible
set.seed(100)
km <- kmeans(pcs, centers = 3)
fviz_cluster(km, data = pcs)

In [None]:
## filter even more: keep only speakers below the 95th percentile of wordcount
## (corp has one row per row of brexit.speakers, so the mask lines up)
## quantile()'s argument is spelled out as probs; p = only worked through
## partial argument matching
dtm <- DocumentTermMatrix(
  Corpus(DataframeSource(
    corp[brexit.speakers$wordcount < quantile(brexit.speakers$wordcount, probs = 0.95), ]
  )),
  control = list(
    language = "english",
    weighting = weightTf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)

## repeat lsa/km
ls <- lsa(dtm, 2)
pcs <- as.data.frame(ls$tk)
M <- as.textmatrix(ls)
## kmeans() uses random starting centers; seed it so the clusters are reproducible
set.seed(100)
km <- kmeans(pcs, centers = 3)
fviz_cluster(km, data = pcs)

In [None]:
## same 95th-percentile filter on wordcount, but with tf-idf instead of raw
## term-frequency weighting
## quantile()'s argument is spelled out as probs; p = only worked through
## partial argument matching
dtm <- DocumentTermMatrix(
  Corpus(DataframeSource(
    corp[brexit.speakers$wordcount < quantile(brexit.speakers$wordcount, probs = 0.95), ]
  )),
  control = list(
    language = "english",
    weighting = weightTfIdf,
    tolower = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = FALSE,
    wordLengths = c(3, Inf)
  )
)

## repeat lsa/km
ls <- lsa(dtm, 2)
pcs <- as.data.frame(ls$tk)
M <- as.textmatrix(ls)
## kmeans() uses random starting centers; seed it so the clusters are reproducible
set.seed(100)
km <- kmeans(pcs, centers = 3)
fviz_cluster(km, data = pcs)

In [None]:
## sampling, adding document length as a column, ...


In [None]:
## do the clusters identify party membership?
## ...

## Word embeddings: Word2Vec

### Preparation and estimation

In [None]:
## this data is not ideal to train embeddings, it is too small.
## but it is fast and sufficient for illustration.

## use untransformed or only minimally transformed text as input:
## start from the untouched copy of the speeches, replace HTML markup and
## non-ASCII characters, blank out leftover 3-4 digit numeric entities and
## lower-case everything
text <- brexit.debates$body.orig %>%
  replace_html() %>%
  replace_non_ascii() %>%
  gsub("&#[0-9]{3,4};", " ", x = .) %>%
  tolower()
## further cleaning steps, deliberately not applied here:
## text <- replace_ordinal(text, num.paste = TRUE)
## text <- replace_number(text, remove = TRUE)
## text <- replace_contraction(text)
## text <- add_comma_space(text)
## text <- replace_white(text)

In [None]:
## train word2vec to learn embeddings
## vsmodel <- word2vec(x = text, type = "skip-gram", dim = 150, iter = 20)

## save model to file
## write.word2vec(vsmodel, "Data/w2v-brexit.bin")

In [None]:
## read again (save time)
## loads the model file that the commented-out training cell would write
vsmodel <- read.word2vec("Data/w2v-brexit.bin")

In [None]:
## all terms
## the model vocabulary (note: the variable is named like the terms() function used for the topic models)
terms <- summary(vsmodel, "vocabulary")
terms

In [None]:
## extract embeddings
## matrix of word vectors; later cells index its rows by word
embeddings <- as.matrix(vsmodel)
dim(embeddings)
head(embeddings)


### Semantics and similarity

In [None]:
## some word associations
## the 5 nearest vocabulary terms for each politician's name
predict(vsmodel, c("johnson", "corbyn", "bercow", "may", "starmer", "cummings"),
        type = "nearest", top_n = 5)

In [None]:
## the 5 nearest vocabulary terms for some policy-related words
predict(vsmodel, c("negotiations", "deadline", "vote", "fisheries"),
        type = "nearest", top_n = 5)

In [None]:
## analogy tasks (better with pre-trained embeddings in a different context)
## vector arithmetic: uk - eu + continent, then look up the nearest terms
wv <- predict(vsmodel, newdata = c("uk", "continent", "eu"), type = "embedding")
wv <- wv["uk", ] - wv["eu", ] + wv["continent", ]
predict(vsmodel, newdata = wv, type = "nearest", top_n = 5)

In [None]:
## associations: uk without europe
## here the rows are taken from the extracted embeddings matrix directly
wv <- embeddings["uk", ] - embeddings["europe", ]
predict(vsmodel, newdata = wv, type = "nearest", top_n = 5)

In [None]:
##  associations: brexit with agreement
wv <- embeddings["brexit", ] + embeddings["agreement", ]
predict(vsmodel, newdata = wv, type = "nearest", top_n = 10)

In [None]:
##  associations: brexit without agreement
wv <- embeddings["brexit", ] - embeddings["agreement", ]
predict(vsmodel, newdata = wv, type = "nearest", top_n = 10)

In [None]:
#==================== Project all adjectives in 2 dimensions ===================

## pos-tag the text to identify adjectives, takes quite a while, read already annotated file instead
## corp <- brexit.debates[, c("gid", "body.orig")]
## setnames(corp, "gid", "doc_id")
## setnames(corp, "body.orig", "text")
## corp$text <- text
## corp.pos <- udpipe(corp, "english")
## saveRDS(corp.pos, "Data/brexit-annotated.rds")
## load the stored annotation; later cells use its token and upos columns
corp.pos <- readRDS("Data/brexit-annotated.rds")
head(corp.pos)

In [None]:
## get all adjectives in the corpus
## prints the number of distinct tokens, then the number of distinct adjectives
length(unique(corp.pos[, "token"]))
adjectives <- unique(corp.pos[corp.pos$upos == "ADJ", "token"])
length(adjectives)

In [None]:
## get all nouns in the corpus
## prints the number of distinct tokens, then the number of distinct nouns
length(unique(corp.pos[, "token"]))
nouns <- unique(corp.pos[corp.pos$upos == "NOUN", "token"])
length(nouns)

In [None]:
# visualize 2-dimensional projection of all adjectives in the brexit debate data
# UMAP maps every word vector to two coordinates
viz <- umap(embeddings, n_neighbors = 15, n_threads = 2)
# one row per vocabulary word with its two coordinates
df <- data.frame(
  word = rownames(embeddings),
  xpos = rownames(embeddings),
  x = viz[, 1],
  y = viz[, 2],
  stringsAsFactors = FALSE
)
# keep only the words that were tagged as adjectives
df <- df[df$xpos %in% adjectives, ]
head(df)

In [None]:
## Plot, restrict to first 300 for speed
## head() stops at the available rows; df[1:300, ] would pad the data with
## all-NA rows whenever fewer than 300 adjectives are left
ggplot(head(df, 300), aes(x = x, y = y, label = word)) +
  geom_text_repel() + theme_void() +
  labs(title = "word2vec - adjectives in 2D using UMAP")

In [None]:
## Interactive plot - unfortunately this does not  work in the notebooks
## plot_ly(df[1:300, ], x = ~x, y = ~y, type = "scatter", mode = 'text', text = ~word)

In [None]:
## Similar 2d projection of all nouns
embeddings.nouns <- predict(vsmodel, nouns, type = "embedding")
## drop nouns without a complete embedding row; drop = FALSE keeps the result
## a matrix even if only a single row remains
embeddings.nouns <- embeddings.nouns[complete.cases(embeddings.nouns), , drop = FALSE]
viz <- umap(embeddings.nouns, n_neighbors = 15, n_threads = 2)
df  <- data.frame(word = rownames(embeddings.nouns),
                  xpos = rownames(embeddings.nouns),
                  x = viz[, 1], y = viz[, 2],
                  stringsAsFactors = FALSE)
## head() stops at the available rows; df[1:500, ] would pad the data with
## all-NA rows whenever fewer than 500 nouns are left
plot_ly(head(df, 500), x = ~x, y = ~y, type = "scatter", mode = 'text', text = ~word)

### Pre-trained embeddings

In [None]:
## Download word2vec, glove or fasttext embeddings
## https://github.com/maxoodf/word2vec
## https://fasttext.cc/docs/en/crawl-vectors.html
## https://nlp.stanford.edu/projects/glove/

In [None]:
## word2vec on English texts corpus, Skip-Gram, Negative Sampling, vector size 500, window 10
## the pre-trained file has to be downloaded to Data/ first
model <- read.word2vec(file = "Data/sg_ns_500_10.w2v", normalize = TRUE)
## NOTE(review): presumably the vocabulary size -- confirm what summary() returns by default
length(summary(model))

In [None]:
## Examples for word similarities, classical analogies and embedding similarities
predict(model, newdata = c("loan", "money"), type = "nearest", top_n = 5)

In [None]:
## king - man + woman
wv <- predict(model, newdata = c("king", "man", "woman"), type = "embedding")
wv <- wv["king", ] - wv["man", ] + wv["woman", ]
predict(model, newdata = wv, type = "nearest", top_n = 5)

In [None]:
## france - paris + london
wv <- predict(model, newdata = c("france", "paris", "london"), type = "embedding")
wv <- wv["france", ] - wv["paris", ] + wv["london", ]
predict(model, newdata = wv, type = "nearest", top_n = 5)

In [None]:
## physician - man + woman
wv <- predict(model, newdata = c("physician", "man", "woman"), type = "embedding")
wv <- wv["physician", ] - wv["man", ] + wv["woman", ]
predict(model, newdata = wv, type = "nearest", top_n = 20)

In [None]:
## ideology - person + racist
## NOTE(review): the "xenophobia" embedding is fetched but not used in the arithmetic below
wv <- predict(model, newdata = c("ideology", "person", "racist", "xenophobia"), type = "embedding")
wv <- wv["ideology", ] - wv["person", ] + wv["racist", ]
predict(model, newdata = wv, type = "nearest", top_n = 10)

## Word embeddings: GloVe

In [None]:
## Create iterator over tokens
## split the lower-cased text on spaces
tokens <- space_tokenizer(text)
str(tokens)

In [None]:
## Create vocabulary. Terms will be unigrams (simple words).
it <- itoken(tokens)
vocab <- create_vocabulary(it)
vocab

In [None]:
## remove infrequent tokens
## keep terms that occur at least 5 times
vocab <- prune_vocabulary(vocab, term_count_min = 5L)

In [None]:
## Use our filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)
## use window of 5 for context words to construct term-co-occurence matrix
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
str(tcm)

In [None]:
## print the term-co-occurrence matrix
tcm

In [None]:
## inspect: standard is decay weighting with offset position
## (weight = 1 / distance_from_current_word)
tcm

In [None]:
## fit glove
## 50-dimensional vectors; fitting runs for at most 10 iterations on 8 threads
glove <- GlobalVectors$new(rank = 50, x_max = 10)
wvmain <- glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01, n_threads = 8)
dim(wvmain)
wvmain

In [None]:
## can also retrieve context vectors
wvcontext <- glove$components
tail(wvcontext)
dim(wvcontext)

In [None]:
## could use either of these (typically main),
## or aggregate them by averaging or summing them (suggested in glove paper)
## summing:
## wvcontext is transposed so that its shape matches wvmain before adding
wordvectors <- wvmain + t(wvcontext)

In [None]:
## analogy tasks work the same
## (although not well here as the corpus is too small and specific, requires more data)
## NOTE(review): the query is paris - france + uk; the variable name `berlin` does not match it
## drop = FALSE keeps each row a 1-row matrix, which is what is passed to sim2() as y
berlin <- wordvectors["paris", , drop = FALSE] - wordvectors["france", , drop = FALSE] + wordvectors["uk", , drop = FALSE]
cosinesim <- sim2(x = wordvectors, y = berlin, method = "cosine", norm = "l2")
## the 5 words with the highest cosine similarity to the query vector
head(sort(cosinesim[,1], decreasing = TRUE), 5)

### Simple document-level representation

A simple way to get a document representation is to average the word vectors within a document.

In [None]:
## isolating common terms, assuming dtm is a document-term-matrix (using the one from above)
## NOTE(review): dtm is whatever was assigned last; the most recent assignment
## in this notebook is the per-speaker matrix with tf-idf weighting
commonterms <- intersect(colnames(dtm), rownames(wordvectors))
commonterms

In [None]:
## filtering dtm (and normalizing)
## could also re-weight dtm with tf-idf instead of l1 norm
## dtmaveraged <-  as.matrix(dtm)[, commonterms]
dtmaveraged <-  normalize(as.matrix(dtm)[, commonterms], "l1")
dtmaveraged

In [None]:
## get averaged document vectors ('sentence' vectors)
## (documents x common terms) %*% (common terms x embedding dimensions)
docvectors <- dtmaveraged %*% wordvectors[commonterms, ]
docvectors

In [None]:
## check that the dimensions of the two factors and the product line up
dim(dtmaveraged)
dim(wordvectors[commonterms, ])
dim(docvectors)

In [None]:
## analogy tasks work just as before, could use this to find e.g. speakers similar to a person
## check which is most similar to first document
cosinesim <- sim2(x = docvectors, y = docvectors[1, , drop = FALSE], method = "cosine", norm = "l2")
head(sort(cosinesim[,1], decreasing = TRUE), 5)

### Learning document/paragraph vectors

This does not return great results because the corpus is too small.

In [None]:
## input
## one row per debate contribution: gid as document id, the lightly cleaned
## lower-cased text as content (this overwrites the earlier per-speaker corp)
corp <- data.frame(
    doc_id = brexit.debates$gid,
    text = text,
    stringsAsFactors = FALSE
)

In [None]:
## low dimension, just for illustrations
## 5-dimensional PV-DM model, 3 iterations, single-threaded
pv.model <- paragraph2vec(
  x = corp,
  type = "PV-DM",
  dim = 5,
  iter = 3,
  min_count = 5,
  lr = 0.05,
  threads = 1
)

In [None]:
## More realistic settings, careful, this will run for a bit. Not worth running with the limited data we use here.
## pv.model <- paragraph2vec(
##   x = corp,
##   type = "PV-DBOW",
##   dim = 100,
##   iter = 20,
##   min_count = 5,
##   lr = 0.05,
##   threads = 4
## )
## saveRDS(pv.model, "Data/pv-model.rds")
## pv.model <- readRDS("Data/pv-model.rds")

In [None]:
## Extract the word embeddings
word.embeddings <- as.matrix(pv.model, which = "words")
head(word.embeddings)

In [None]:
## Extract the document embeddings
doc.embeddings <- as.matrix(pv.model, which = "docs")
tail(doc.embeddings)

In [None]:
## Extract the vocabulary
## NOTE(review): despite its name, doc.vocab is requested with which = "words";
## the commented-out word.vocab below uses which = "docs" -- the two names look swapped
doc.vocab <- summary(pv.model, which = "words")
doc.vocab

## word.vocab <- summary(pv.model, which = "docs")
## head(word.vocab)

In [None]:
# retrieve word embeddings (as previously)
predict(pv.model, "brexit", type = "embedding")

In [None]:
# retrieve the nearest neighbours of a word; which = "word2doc" presumably
# returns documents rather than words ("word2word" for words) -- confirm in the doc2vec docs
predict(pv.model,
  newdata = "brexit",
  type = "nearest",
  which = "word2doc"
)

In [None]:
# retrieve document embeddings
# the documents are addressed by their doc_id (the gid of the contribution)
predict(pv.model,
  newdata = c("2021-02-11b.563.0", "2021-02-11b.504.0", "2021-02-11b.468.2"),
  type = "embedding",
  which = "docs"
)

In [None]:
# retrieve most similar documents to a document
predict(pv.model,
  newdata = "2021-02-11b.563.0",
  type = "nearest",
  which = "doc2doc"
)

In [None]:
## find document closest to a sentence
## the sentence is passed as a named list entry holding its tokens
predict(pv.model,
  newdata = list(prophecy = c("brexit", "will", "not", "disrupt", "trade")),
  type = "nearest",
  which = "sent2doc"
)

In [None]:
## Get embeddings of sentences.
## each list entry is one tokenized sentence
sentences <- list(
  sent1 = c("germany", "and", "france", "dominate", "the", "eu"),
  sent2 = c("brexit", "was", "planned", "meticulously")
)
predict(pv.model, newdata = sentences, type = "embedding")