### install and load packages (RBERT and tensorflow)
* this takes a while the first time you run it

In [None]:
# devtools::install_github(
#   "jonathanbratt/RBERT", 
#   build_vignettes = TRUE
# )

# tensorflow::install_tensorflow(version = "1.13.1")

In [3]:
library(RBERT)
library(dplyr)

In [None]:
# might need to re-install pillar if there's an error message 
# install.packages('pillar')

In [5]:
# download the pre-trained "bert_base_uncased" checkpoint; the returned value
# is kept so it can be passed as `ckpt_dir` when extracting features
BERT_PRETRAINED_DIR <-
  RBERT::download_BERT_checkpoint(model = "bert_base_uncased")

### load data from okcupid 

In [None]:
# read the OkCupid essays from CSV into a data frame
df <- read.csv(file = "/srv/jeremy.yang.tmp/text/okcupid_text.csv")

In [75]:
# show the number of rows and columns of the loaded data frame
dim(df)

In [76]:
# list the column names of the loaded data frame
names(df)

### pre-process the essay text
* using essay 0 as an example 

In [77]:
# Pull the essay0 column out of the data frame loaded by read.csv() (`df`) as
# a plain character vector. as.character() also covers the case where
# read.csv() stored the column as a factor.
essay0 <- as.character(df$essay0)

In [82]:
# number of essays in the vector
length(essay0)

In [78]:
# show the first essay
essay0[1]

In [79]:
# Trial run on a single essay: replace every HTML line break ("<br />") and
# newline character with one space, so the words on either side of a line
# break are not fused into a single token.
# Punctuation is kept because it might provide some contextual information.
test <- gsub("<br />|\\n", " ", essay0[1])

# alternatively: remove all punctuation and only keep numbers and letters
# test = gsub("[^[:alnum:]]",'',essay0[1])

In [98]:
# print the value of `test` to inspect it
test

In [81]:
# Clean every essay: replace each HTML line break ("<br />") and newline
# character with one space, so that words separated only by a line break
# stay separate tokens. gsub() is vectorized over the whole character vector.
essay0 <- gsub("<br />|\\n", " ", essay0)

### extract embeddings from the final layer of hidden states
* for more details on BERT's architecture, see [this tutorial](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/) 

In [88]:
# timing trial: run the extraction on the first 100 essays only and report
# how long that took
start_time <- Sys.time()
BERT_feats <- RBERT::extract_features(
  layer_indexes = 12,
  ckpt_dir = BERT_PRETRAINED_DIR,
  examples = essay0[seq_len(100)]
)
end_time <- Sys.time()
difftime(end_time, start_time)

Time difference of 1.12877 mins

In [91]:
# rough estimate for the full run: elapsed time of the 100-essay trial,
# divided by 100 and scaled by the total number of essays
difftime(end_time, start_time) / 100 * length(essay0)

Time difference of 676.6525 mins

In [None]:
# full run over every essay
# (max token length is currently set at 128, can change it if necessary)
BERT_feats <- RBERT::extract_features(
  layer_indexes = 12,
  ckpt_dir = BERT_PRETRAINED_DIR,
  examples = essay0
)

In [92]:
# keep the extraction output (tokens and their corresponding features) as a
# data frame, without the segment_index and layer_index columns
features <- dplyr::select(BERT_feats$output, -segment_index, -layer_index)

In [None]:
# preview the first rows of the features data frame
head(features)

In [None]:
# Write the features into a csv. write.csv() is called only for its side
# effect: it returns NULL, so its result must not be assigned back to
# `features` (doing so would replace the data frame with NULL).
write.csv(features, "bert_feats_essay0.csv")

### some utility functions that we are not using now but might be useful in the future

In [None]:
# extract the final layer output vector for the "[CLS]" token of the first sentence. 
# output_vector1 <- BERT_feats$output %>%
#   dplyr::filter(
#     sequence_index == 1, 
#     token == "[CLS]", 
#     layer_index == 12
#   ) %>% 
#   dplyr::select(dplyr::starts_with("V")) %>% 
#   unlist()
# output_vector1

In [74]:
# Extract output vectors for all sentences...
# These vectors can be used as input features for downstream models.
# output_vectors <- BERT_feats$output %>% 
#   dplyr::filter(token_index == 1, layer_index == 12)
# output_vectors

In [56]:
# tokens <- tokenize_text(text = test,
#                         ckpt_dir = BERT_PRETRAINED_DIR)