## General
Install packages

In [None]:
# Install the stm package from CRAN. The mirror is addressed over HTTPS so the
# download is not fetched over an unencrypted connection.
install.packages("stm", repos = "https://cloud.r-project.org")

preferred packages

In [None]:
library(dplyr)
library(sqldf)
library(tm)
library(h2o)

read and write base csv 

In [None]:
# Load the base CSV into a data frame, then write that data frame back to the
# same path.
# NOTE(review): write.csv() emits row names by default, so the rewritten file
# gains an extra leading column -- confirm that this is intended.
csv_path <- "train.csv"
train <- read.csv(csv_path)
write.csv(train, csv_path)

basic exploration of data

split data into training and testing set

In [None]:
# Reproducible random split of `train_use`: each row gets one uniform draw on
# [0, 1]; rows whose draw is <= 0.2 go to `validate`, the rest to `trn`.
set.seed(10)
assign <- runif(nrow(train_use))
in_validation <- assign <= 0.2
validate <- train_use[in_validation, ]
trn <- train_use[!in_validation, ]

z-score normalize both training and testing sets with same mu and sigma

In [None]:
# Z-score normalise `train` and `test` with the SAME per-column mean and
# standard deviation, both estimated from `trn` only.
# Column 1 is skipped; columns 2..ncol(trn) are treated as the features.
# NOTE(review): assumes `train` and `test` share the column layout of `trn`.

# seq_len(...)[-1] is empty when there is only one column, whereas 2:ncol()
# would count down (2, 1).
feature_cols <- seq_len(ncol(trn))[-1]

# drop = FALSE keeps a data frame even for a single feature column, so lapply()
# always iterates over columns. `mu` and `sigma` stay lists, one entry per
# feature column (entry x - 1 belongs to column x).
mu <- lapply(trn[, feature_cols, drop = FALSE], mean)
sigma <- lapply(trn[, feature_cols, drop = FALSE], sd)

for (x in feature_cols) {
  train[, x] <- (train[, x] - mu[[x - 1]]) / sigma[[x - 1]]
}

# The same column set is used for `test`; it is no longer bounded by
# ncol(train).
for (x in feature_cols) {
  test[, x] <- (test[, x] - mu[[x - 1]]) / sigma[[x - 1]]
}

## Data Exploration

basic exploration

In [None]:
# Quick look at `train`: its dimensions, column names, per-column summary,
# and the first five rows.
dim(train)
colnames(train)
summary(train)
head(train, 5)

get mean, medians, etc. while removing NA values

In [None]:
# Mean of every column from position 65 onward in `test`, ignoring NA values.
# List-style indexing (test[cols]) always yields a data frame, so lapply()
# iterates over columns even when only one column is selected.
means <- lapply(test[65:ncol(test)], function(x) mean(x, na.rm = TRUE))

t-test to test for statistical significance between population means

In [None]:
# Two-sample t-test on column `a`: rows whose author is "MWS" versus all rows
# whose author is not "MWS".
t.test(train[train$author == "MWS", ]$a, train[train$author != "MWS", ]$a)

convert multiple logical features to binary

In [None]:
# Recode the columns selected by `reformat` from logical to 1/0.
# TRUE is spelled out (T is an ordinary variable that can be reassigned), and
# list-style indexing keeps a data frame even when `reformat` selects a single
# column, so lapply() always works column by column.
train[reformat] <- lapply(train[reformat], function(x) ifelse(x == TRUE, 1, 0))

## Data Shaping and Manipulation
column removal

In [None]:
# Two alternative ways to drop the `X` and `id` columns -- run only ONE of them.
# With dplyr, every column to drop needs its own minus sign: `select(-X, id)`
# would remove X but keep id.
train <- train %>% select(-X, -id) # with dplyr
# Positional version: keeps columns 3..ncol, i.e. drops the first two columns.
# NOTE(review): assumes X and id are the first two columns -- confirm.
train <- train[ , 3:ncol(train)] # without dplyr

select only a subset of columns

In [None]:
# Keep only the id, author and text columns; both statements build the same
# selection, first with base R indexing and then with dplyr.
wanted_cols <- c("id", "author", "text")
train_stm <- train[, wanted_cols] # without dplyr
train_stm <- select(train, id, author, text) # with dplyr

convert factor level to a single binary column

In [None]:
# 1 where the author is "EAP", 0 otherwise (NA authors stay NA).
train$is_eap <- as.numeric(train$author == "EAP")

create a series of column/row labels varying only by number

In [None]:
# Labels "topic1" ... "topic61", prefixed with "id", become the column names
# of `train_processed`.
lbl <- paste0("topic", seq_len(61))
cnames <- c("id", lbl)
colnames(train_processed) <- cnames

## Feature Selection

keep only the features whose correlation with any of several y (target) features meets a threshold

In [None]:
# Correlation matrix over columns 22:24 (which become rows/columns 1-3 of the
# matrix) together with the candidate columns 398:647.
# Some correlations may be NA if the sd == 0; those are set to 0.
correlations <- cor(train[, c(22:24, 398:647)])
correlations[is.na(correlations)] <- 0

# For each of the first three rows, drop its first three entries so only the
# correlations against the candidate columns remain.
eap_corr <- correlations[1, -(1:3)]
hpl_corr <- correlations[2, -(1:3)]
mws_corr <- correlations[3, -(1:3)]

# A candidate passes when its absolute correlation is at least 0.02.
use1 <- names(eap_corr)[abs(eap_corr) >= 0.02]
use2 <- names(hpl_corr)[abs(hpl_corr) >= 0.02]
use3 <- names(mws_corr)[abs(mws_corr) >= 0.02]

# Feature names to keep: every candidate that passed for at least one row.
use <- unique(c(use1, use2, use3))
keep <- colnames(train)[1:397]
# The final data frame keeps columns 1:397 plus the candidates that passed.
train <- train[, c(keep, use)]

## Visualizations
Correlograms

In [None]:
# Correlogram of the listed columns, with the coefficients printed in black
# at 0.7 of the default text size.
corr_features <- c("is_eap", "is_hpl", "is_mws", "word_count", "n_comma",
                   "n_commaSemiColon", "n_colons")
corr_matrix <- cor(train[, corr_features])
corrplot(corr_matrix, addCoef.col = "black", number.cex = 0.7)

## Text with Base
base text and regular expression processing of single column

In [None]:
# Character-level features built from the `text` column.
# The whole cell now works on `train`: the original created `lower` and
# `all_char` on `test` but then read them from `train`.
train$lower <- tolower(train$text)                       # lower-cased text
train$all_char <- nchar(train$lower)                     # total character count
# Count characters by deleting everything else and measuring what is left.
train$AN_char <- nchar(gsub("[^a-zA-Z0-9]", "", train$lower))  # alphanumerics
train$percent_ANchar <- train$AN_char / train$all_char
train$a <- nchar(gsub("[^a]", "", train$lower))          # occurrences of "a"
train$vowels <- nchar(gsub("[^aeiou]", "", train$lower)) # occurrences of a/e/i/o/u
train$percent_vowels <- train$vowels / train$AN_char
# Text reduced to lower-case letters and spaces only.
train$bare <- gsub("[^a-z ]", "", train$lower)

base and regex get word count of single column

In [None]:
# Word count: collapse every run of lower-case letters to a single "a", strip
# the spaces, and count the characters that remain.
temp <- gsub(" ", "", gsub("\\b[a-z]+\\b", "a", train$bare))
train$word_count <- nchar(temp)

base, regex to create single binary column from multiple words

In [None]:
# 3rd person male (he, him, his)
temp <- gsub("\\bhe\\b", "A", train$bare)
temp <- gsub("[^A]", "", temp)
temp_sum <- nchar(temp)

temp <- gsub("\\bhim\\b", "A", train$bare)
temp <- gsub("[^A]", "", temp)
temp <- nchar(temp)
temp_sum <- temp_sum + temp

temp <- gsub("\\bhis\\b", "A", train$bare)
temp <- gsub("[^A]", "", temp)
temp <- nchar(temp)
temp_sum <- temp_sum + temp

train$has_he <- ifelse(temp_sum > 0, 1, 0)

base, regex create single binary column or numeric column from list of words

In [None]:
prepositions <- c("on", "at", "to", "by")
# Fold over the word list: each step replaces the whole-word occurrences of one
# preposition with the marker "A" in the text produced by the previous step.
temp <- Reduce(
  function(text, word) gsub(paste0("\\b", word, "\\b"), "A", text),
  prepositions,
  test$bare
)
# Only the markers survive, so the string length is the number of matches.
temp <- gsub("[^A]", "", temp)
test$number_prepositions <- nchar(temp)
test$has_preposition <- as.numeric(test$number_prepositions > 0)

base, regex create multiple binary columns from list of words

In [None]:
# One logical column per prefix: first_<prefix> is TRUE when `bare` contains
# the prefix at a word boundary followed by at least one more word character.
first <- c("pa", "ju", "af", "da", "on", "gu")
for (x in first) {
  # A dedicated name is used for the column label: the original reused `lbl`,
  # which overwrote the topic-label vector that other cells read later.
  first_col <- paste0("first_", x)
  train[, first_col] <- grepl(paste0("\\b", x, "\\w"), train$bare)
}

## Text with TM

In [None]:
library(tm)
# Package names are case-sensitive: the stemming package is "SnowballC".
library(SnowballC)

create corpus from multiple text files (or a text file)

<ul>
<li>https://stackoverflow.com/questions/7927367/r-text-file-and-text-mining-how-to-load-data</li>
</ul>

In [None]:
# Build a corpus from the files under more_text/, reading each file with
# readPlain. The directory source is held in `text_source` so that it does not
# mask base::source().
text_source <- DirSource("more_text/") # input path for documents
AuthorsCorpus <- Corpus(text_source, readerControl = list(reader = readPlain))

create corpus of bi-grams from a document

In [None]:
library(RWeka)

options(mc.cores = 4)

# Tokenizer that emits n-grams with min = max = 2, i.e. bi-grams only.
twogramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Document-term matrix of `poe_corpus` built with the bi-gram tokenizer; it is
# displayed before and after removeSparseTerms() is applied with 0.999.
bigram_control <- list(tokenize = twogramTokenizer)
poe_2dtm <- DocumentTermMatrix(poe_corpus, control = bigram_control)
poe_2dtm
poe_2dtm <- removeSparseTerms(poe_2dtm, sparse = 0.999)
poe_2dtm

create and process corpus from data frame column

In [None]:
# Corpus made from the `bare` text of the rows whose author is "EAP", with
# whitespace stripped, then turned into a document-term matrix.
eap_text <- train$bare[train$author == "EAP"]
corp_eap <- tm_map(Corpus(VectorSource(eap_text)), stripWhitespace)

dtm_eap <- DocumentTermMatrix(corp_eap)
dtm_eap
dim(dtm_eap)
# The column names of the matrix are kept as `eap_words`.
eap_words <- colnames(dtm_eap)

stem words of corpus

In [None]:
# Stem the EAP corpus (English stemmer), build its document-term matrix, and
# display it before and after removeSparseTerms() is applied with 0.99.
corp_eap_stem <- tm_map(corp_eap, stemDocument, language = "english")
dtm_eap_stem <- DocumentTermMatrix(corp_eap_stem)
dtm_eap_stem
dtm_eap_stem <- removeSparseTerms(dtm_eap_stem, sparse = 0.99)
dtm_eap_stem
# The column names of the reduced matrix are kept as `eap_stems`.
eap_stems <- colnames(dtm_eap_stem)

create list of words that are unique to one of multiple corpora

In [None]:
# For each author, pool the stems of the other two authors ...
not_eap_stems <- union(mws_stems, hpl_stems)
not_mws_stems <- union(eap_stems, hpl_stems)
not_hpl_stems <- union(eap_stems, mws_stems)

# ... and keep only the author's stems that have no match in that pool.
eap_only <- eap_stems[is.na(match(eap_stems, not_eap_stems))]
mws_only <- mws_stems[is.na(match(mws_stems, not_mws_stems))]
hpl_only <- hpl_stems[is.na(match(hpl_stems, not_hpl_stems))]

find words of interest in a general corpus, convert the features to binary from logical

In [None]:
# One 1/0 column per word of interest: has_<word> is 1 when the word (used as a
# regular-expression pattern) matches anywhere in `bare`, otherwise 0.
words_of_interest <- c(eap_only, mws_only, hpl_only)
for (word in words_of_interest) {
  # A dedicated name is used for the column label: the original reused `lbl`,
  # which overwrote the topic-label vector that other cells read later.
  has_col <- paste0("has_", word)
  # grepl() never returns NA, so the 1/0 value can be stored directly; no
  # second pass over hard-coded column positions (18:245) is needed.
  train2[, has_col] <- as.numeric(grepl(word, train2$bare))
}

## Topic Modeling with stm

### structural topic models with stm

<ul>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0</li>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0/topics/fitNewDocuments</li>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0</li>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0/topics/selectModel</li>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0/topics/searchK</li>
<li>https://www.rdocumentation.org/packages/stm/versions/1.3.0/topics/toLDAvis</li>
</ul>

In [None]:
library(stm)

build corpus using stm

In [None]:
# Pre-process the training text with stm::textProcessor(): lower-casing,
# stop-word / number / punctuation removal and stemming, with word lengths
# limited to c(3, Inf).
# The metadata is taken from `train_stm`, the same frame that supplies the
# text, so documents and metadata cannot come from differently shaped frames
# (the original read the metadata from `train`).
train_processed <- textProcessor(
  train_stm$text,
  metadata = train_stm[, c("id", "author")],
  lowercase = TRUE,
  removestopwords = TRUE,
  removenumbers = TRUE,
  removepunctuation = TRUE,
  stem = TRUE,
  wordLengths = c(3, Inf),
  sparselevel = 1,
  language = "en",
  verbose = TRUE,
  onlycharacter = FALSE
)
train_processed

Process the data for analysis.

In [None]:
# Pull the three components out of the textProcessor() result and hand them to
# prepDocuments().
docs <- train_processed$documents
vocab <- train_processed$vocab
meta <- train_processed$meta
train_out <- prepDocuments(documents = docs, vocab = vocab, meta = meta)

train the topic models

In [None]:
# Fit the structural topic model on the prepared documents with a fixed seed,
# K = 0 and spectral initialisation; the author column is passed as `data`.
# The result is eventually used to create features to predict "author".
set.seed(1002)
train_model <- stm(
  documents = train_out$documents,
  vocab = train_out$vocab,
  K = 0,
  init.type = "Spectral",
  data = train_out$meta$author
) # prevalence=~treatment + s(pid_rep)

get model summary

In [None]:
# Print the summary of the fitted topic model.
summary(train_model)

plot comparisons for 2 of the topics at a time

In [None]:
# "perspectives" plot for topics 2 and 3, dispatched through the plot() generic.
plot(train_model, type = "perspectives", topics = c(2, 3))

plot distribution of MAP estimates of document-topic proportions

In [None]:
# "hist" plot of the fitted model, dispatched through the plot() generic.
plot(train_model, type = "hist")

plot topic correlation network

In [None]:
# Compute the topic correlations of the fitted model, then plot them.
topic_network <- topicCorr(train_model)
plot(topic_network)

apply the model from the training set to the test set of data

In [None]:
# Process the test text, align it with the vocabulary of the trained model,
# and fit the trained model to the aligned documents.
temp <- textProcessor(documents = test_stm$text, metadata = test_stm)
newdocs <- alignCorpus(new = temp, old.vocab = train_model$vocab)
newdocs_fit <- fitNewDocuments(
  model = train_model,
  documents = newdocs$documents,
  newData = newdocs$meta, # didn't have 'author' in meta here, but accepted anyway
  origData = train_out$meta
)
# , prevalence=~treatment + s(pid_rep),prevalencePrior="Covariate"))

# thetas are measurements for each of the topics
dim(newdocs_fit$theta)
# will need to impute column means (or other values) for any docs with no topic info

figure out which documents in test set didn't have any topic info

In [None]:
# ids present in the test frame but absent from the aligned documents' metadata.
all_docs <- test_stm$id
docs_modeled <- newdocs$meta$id
setdiff(x = all_docs, y = docs_modeled)

add topic model theta values

In [None]:
# Attach the topic-model theta values to `test` as one column per topic.
id_vect <- as.character(newdocs$meta$id)

# Topic labels are derived from the width of theta, so this cell does not rely
# on a global `lbl` that other cells may have overwritten.
topic_lbl <- paste0("topic", seq_len(ncol(newdocs_fit$theta)))
cnames <- c("id", topic_lbl)

# Building a data frame directly keeps theta numeric. The original cbind() of
# a character id with the numeric matrix turned every value into text, which
# then had to be converted back to numbers through hard-coded column positions.
test_processed <- data.frame(id_vect, newdocs_fit$theta, stringsAsFactors = FALSE)
colnames(test_processed) <- cnames

# Add the topic model theta values as features to the test frame; rows of
# `test` with no match in `test_processed` get NA in the topic columns.
test <- left_join(test, test_processed, by = "id")

impute column means for documents that didn't have topic info

In [None]:
# For every column from position 65 onward, replace the NA entries with the
# mean of that column's non-NA values.
topic_idx <- 65:ncol(test)
test[topic_idx] <- lapply(test[topic_idx], function(column) {
  column[is.na(column)] <- mean(column, na.rm = TRUE)
  column
})

## Topic Modeling with topicmodels

create a multi-document corpus from multiple text files in a folder

<ul>
<li>https://rstudio-pubs-static.s3.amazonaws.com/163802_0f005a14bcfb4c4b8ee17ac8a8e6c3e9.html</li>
<li>https://stackoverflow.com/questions/7927367/r-text-file-and-text-mining-how-to-load-data</li>
<li>http://www.mjdenny.com/Text_Processing_In_R.html</li>
</ul>

In [None]:
library(tm)
library(topicmodels)

# Build a corpus from the files under more_text/, reading each file with
# readPlain, and print its summary. The directory source is held in
# `text_source` so that it does not mask base::source().
text_source <- DirSource("more_text/") # input path for documents
AuthorsCorpus <- Corpus(text_source, readerControl = list(reader = readPlain))
summary(AuthorsCorpus)

process the corpus

In [None]:
# Lower-case the corpus first.
AuthorsCorpus <- tm_map(AuthorsCorpus, content_transformer(tolower))

# Word removals, applied in this order: English stop words, the single letters
# a-z, and the three author names (just in case).
words_to_drop <- list(
  stopwords(kind = "en"),
  letters,
  c("lovecraft", "poe", "shelley")
)
for (word_set in words_to_drop) {
  AuthorsCorpus <- tm_map(AuthorsCorpus, removeWords, word_set)
}

# Remaining clean-up, applied in this order: punctuation, numbers, whitespace.
for (cleanup in list(removePunctuation, removeNumbers, stripWhitespace)) {
  AuthorsCorpus <- tm_map(AuthorsCorpus, cleanup)
}

inspect(AuthorsCorpus)

examine the metadata of one of the documents in a corpus

In [None]:
# Show the metadata attached to the third document of the corpus.
meta(AuthorsCorpus[[3]])

add and populate a metadata field for the documents in a corpus

In [None]:
# Set the "category" metadata field of documents 1, 2 and 3 to "HPL", "EAP"
# and "MWS" respectively.
doc_categories <- c("HPL", "EAP", "MWS")
for (doc_idx in seq_along(doc_categories)) {
  meta(AuthorsCorpus[[doc_idx]], "category") <- doc_categories[doc_idx]
}

stem the documents, create document-term matrix using term frequency

create a dictionary corpus and use it to map the original words to the stemmed words  (doesn't work--- solve?)

In [None]:
# Keep an unstemmed copy of the corpus (intended as the dictionary for stem
# completion), stem the corpus, then build a term-frequency document-term matrix.
dictCorpus <- AuthorsCorpus
AuthorsCorpus <- tm_map(AuthorsCorpus, stemDocument)
#AuthorsCorpus <- tm_map(AuthorsCorpus, stemCompletion, dictionary=dictCorpus)
# The minimum word length is given through `wordLengths`, the control option
# documented by tm, instead of `minWordLength`.
dtm <- DocumentTermMatrix(AuthorsCorpus, control = list(wordLengths = c(3, Inf)))

view terms from the dtm

In [None]:
# First ten terms of the matrix; head() returns fewer entries instead of
# padding with NA when the matrix has fewer than ten terms.
head(Terms(dtm), 10)

create dtm using tf-idf

In [None]:
# Document-term matrix weighted by tf-idf. The minimum word length is given
# through `wordLengths`, the control option documented by tm, instead of
# `minWordLength`.
dtm2 <- DocumentTermMatrix(
  AuthorsCorpus,
  control = list(weighting = weightTfIdf, wordLengths = c(3, Inf))
)

list the frequent terms in the corpus
<ul>
<li>https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf</li>
</ul>

In [None]:
# List the terms of `dtm` using a lower frequency bound of 100.
findFreqTerms(dtm, lowfreq=100)

remove sparse words from dtm

In [None]:
# Apply removeSparseTerms() with the same threshold to both matrices.
max_sparsity <- 0.999
dtm <- removeSparseTerms(dtm, sparse = max_sparsity)
dtm2 <- removeSparseTerms(dtm2, sparse = max_sparsity)

<b>train models using LDA</b>

In [None]:
# Fit four 30-topic models on `dtm` with a shared seed: default LDA, LDA with
# estimate.alpha = FALSE, LDA with method = "Gibbs", and a CTM.
k <- 30
SEED <- 1234

gibbs_control <- list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)
ctm_control <- list(
  seed = SEED,
  var = list(tol = 10^-4),
  em = list(tol = 10^-3)
)

my_TM <- list(
  VEM = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(dtm, k = k, method = "Gibbs", control = gibbs_control),
  CTM = CTM(dtm, k = k, control = ctm_control)
)

get the topics for one of the methods used for modeling, and the top terms for it 

In [None]:
# Work with the model stored under "VEM".
vem_model <- my_TM[["VEM"]]

# topics() called with 1: one topic per document.
Topic <- topics(vem_model, 1)
Topic

# Top 5 terms for each topic in LDA.
Terms <- terms(vem_model, 5)
Terms

print the sets of terms for each topic in a given model

In [None]:
# Print one comma-separated line per column of `Terms`, using rows 1 to 5.
# seq_len() makes the loop a no-op when `Terms` has no columns, whereas
# 1:ncol(Terms) would iterate over c(1, 0).
for (x in seq_len(ncol(Terms))) {
    print(paste(Terms[1:5, x], collapse = ", "))
}

get the most frequent terms used in the topics of a model

In [None]:
# Find the topic assigned most often by the model stored under "Gibbs", then
# show the top 10 terms of that topic.
gibbs_model <- my_TM[["Gibbs"]]
my_topics <- topics(gibbs_model)
most_frequent <- which.max(tabulate(my_topics))
terms(gibbs_model, 10)[, most_frequent]

## Machine Learning - h2o

initialize h2o

In [None]:
# Start (or connect to) h2o with 5 threads and a 4G memory limit.
h2o.init(nthreads=5, max_mem_size = "4G")    
## specify the memory size for the H2O cloud; default nthreads (-1) is maximum number of CPUS

# Remove everything held by the cluster before any data is loaded.
h2o.removeAll() # Clean slate - just in case the cluster was already running

import data file

In [None]:
# Import the CSV through h2o; `train` is rebound to the object that
# h2o.importFile() returns.
train <- h2o.importFile("train_to_use_121417.csv")

### create random forest model

In [None]:
# Random forest trained on `train`, with column 1 as the response.
rf1 <- h2o.randomForest(
  y = 1,
  training_frame = train,
  ntrees = 20,         # use a maximum of 20 trees to start (default 50)
  max_depth = 20,
  # stop fitting new trees when 2-tree avg w/in 0.001 (default) of prior two
  # 2-tree avgs
  stopping_rounds = 2,
  seed = 10
)

get results from categorical prediction model

In [None]:
# Score `test` with the random forest, then report overall accuracy plus
# precision, recall and one-vs-rest accuracy for each author class.
# NOTE(review): assumes `test` is an h2o frame with an `author` column -- confirm.
predictions <- h2o.predict(object = rf1, newdata = test)

# Number of rows of `frame` selected by the row mask `mask`.
count_rows <- function(frame, mask) {
  nrow(frame[mask, ])
}

actual <- test$author
predicted <- predictions$predict
n_total <- nrow(test)

cat("Overall Accuracy: ", count_rows(test, actual == predicted) / n_total)

# The same three metrics are computed for every class, so they are produced in
# one loop with uniform labels instead of twelve hand-copied expressions.
for (cls in c("EAP", "HPL", "MWS")) {
  # Rows predicted as `cls` that really are `cls`.
  true_pos <- count_rows(test, actual == cls & predicted == cls)
  # Rows that are not `cls` and were not predicted as `cls`.
  true_neg <- count_rows(test, actual != cls & predicted != cls)

  cat("\n", cls, " prec: ", true_pos / count_rows(predictions, predicted == cls),
      ", ", cls, " recall: ", true_pos / count_rows(test, actual == cls),
      ", ", cls, " acc: ", (true_pos + true_neg) / n_total,
      sep = "")
}
cat("\n")

get model description and variable importance info

In [None]:
# Display the fitted model, then its variable importances.
rf1
# h2o.varimp() takes the fitted model; the original passed `h2o`, which is not
# the model object.
h2o.varimp(rf1)

export h2o data frame to a file

In [None]:
# Write `submission` to submission_121417.csv through h2o.
# NOTE(review): force = TRUE presumably allows overwriting an existing file -- confirm.
h2o.exportFile(submission, "submission_121417.csv", force = TRUE)