### install and load packages (RBERT and tensorflow)
* installation takes a while the first time you run it

In [None]:
# devtools::install_github(
#   "jonathanbratt/RBERT", 
#   build_vignettes = TRUE
# )

# tensorflow::install_tensorflow(version = "1.13.1")

In [3]:
library(RBERT)
library(dplyr)

In [None]:
# might need to re-install pillar if there's an error message 
# install.packages('pillar')

In [5]:
# Download the pre-trained "bert_base_uncased" checkpoint; the value returned
# here is what later cells pass as `ckpt_dir`.
BERT_PRETRAINED_DIR <-
  RBERT::download_BERT_checkpoint(model = "bert_base_uncased")

### load data from okcupid 

In [None]:
# Read the OkCupid essay CSV into a data frame.
df <- read.csv("/srv/jeremy.yang.tmp/text/okcupid_text.csv")

In [75]:
# Show the data frame's dimensions (rows, columns).
dim(df)

# List the column names.
names(df)

### pre-process the essay text
* using essay 0 as an example 

In [77]:
# Pull the essay0 column out of the data frame as plain character strings.
essay0 <- as.character(df[["essay0"]])

In [82]:
# Number of entries in essay0.
length(essay0)

In [78]:
# Print the first essay as stored in essay0 at this point.
essay0[1]

In [79]:
# Try the clean-up on the first essay only.
# Each run of "<br />" tags and newline characters is replaced by ONE space:
# an empty replacement glues the neighbouring words together
# (e.g. "dogs<br />\ni" would become "dogsi") and corrupts the tokens.
# All other punctuation is kept, as it might provide contextual information.
test <- gsub("(<br />|\\n)+", " ", essay0[1])

# alternatively: drop all punctuation, keeping only letters, numbers and spaces
# test <- gsub("[^[:alnum:] ]", "", essay0[1])

In [98]:
# Print the cleaned version of the first essay.
test

In [81]:
# Clean every essay: each run of "<br />" tags and newline characters becomes
# a single space, so the words on either side of a line break stay separate
# (an empty replacement would merge them, e.g. "dogs<br />\ni" -> "dogsi").
essay0 <- gsub("(<br />|\\n)+", " ", essay0)

### extract embeddings from the final layer of hidden states
* for more details of BERT's architecture see [this](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/) 

In [88]:
# Time feature extraction on the first 100 essays to gauge the cost of a
# full run. Only layer 12 is requested.
start_time <- Sys.time()

BERT_feats <- RBERT::extract_features(
  examples = essay0[seq_len(100)],
  ckpt_dir = BERT_PRETRAINED_DIR,
  layer_indexes = 12
)

end_time <- Sys.time()

# Elapsed wall-clock time for the 100-essay sample.
end_time - start_time

Time difference of 1.12877 mins

In [91]:
# Extrapolate the 100-essay timing to all essays:
# (time per essay) x (number of essays).
per_essay_time <- (end_time - start_time) / 100
per_essay_time * length(essay0)

Time difference of 676.6525 mins

In [None]:
# Full run over every essay, again requesting layer 12 only.
# (max token length is currently set at 128; change it if necessary)
BERT_feats <- RBERT::extract_features(
  examples = essay0, ckpt_dir = BERT_PRETRAINED_DIR, layer_indexes = 12
)

In [92]:
# Keep the extraction output table (tokens plus their feature columns),
# dropping the segment_index and layer_index columns.
features <- dplyr::select(
  BERT_feats$output,
  -segment_index,
  -layer_index
)

In [102]:
# Preview the first rows of the feature table.
head(features)

sequence_index,token_index,token,V1,V2,V3,V4,V5,V6,V7,⋯,V759,V760,V761,V762,V763,V764,V765,V766,V767,V768
<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,[CLS],0.02769635,0.206880182,0.1352212,-0.23510653,-0.14495373,-0.26982272,0.74935043,⋯,-0.08145078,-0.3356011,0.2532236,-0.03561111,0.493324,-0.17433338,-0.1894813478,-0.06771588,0.7238847,0.52680546
1,2,about,0.95925152,0.575973034,-0.2187169,0.07080545,0.13084382,0.32556975,-0.1375636,⋯,0.31716979,-0.4298106,0.466109,0.24759261,-0.0514899,-0.44786376,0.004567666,-0.0296302,0.846746,-0.1567664
1,3,me,-0.26578623,-0.263226688,-0.4654093,-0.21929964,-0.50533813,0.05498304,0.23048586,⋯,-0.42206591,0.228994,-0.1931382,-0.08697254,0.017177,-0.22041345,-0.0006214902,-0.20233682,0.9016798,0.851988733
1,4,:,-0.23830779,0.001658076,-0.1490056,-0.27663341,-0.05879457,0.57688606,0.66798979,⋯,-0.24448639,0.1912105,-0.6891336,0.31816801,0.375408,0.08612782,-0.1983464509,-0.28207332,0.2686216,0.378298134
1,5,i,0.16260415,-0.161255732,0.7713182,-0.1334399,0.59710145,0.53311062,0.3725341,⋯,0.03365694,0.3225724,0.409159,-0.49834368,-0.3436869,0.03299091,0.1313328892,0.15666685,0.72386,0.412904024
1,6,would,-0.06572168,-0.651838303,-0.7283822,-0.60188961,0.86171061,-0.04663589,-0.07681288,⋯,-0.40667245,0.8721913,0.6491868,-0.64515674,-0.5734687,-0.81917334,0.1835416108,-0.45278186,0.6536664,0.009809494


In [None]:
# Write the features to a CSV file.
# write.csv() is called only for its side effect and returns NULL, so its
# result must not be assigned back to `features` -- doing so would replace
# the data frame with NULL and break any later use of it.
write.csv(features, "bert_feats_essay0.csv")

### some utility functions that we are not using now but might be useful in the future

In [None]:
# extract the final layer output vector for the "[CLS]" token of the first sentence. 
# output_vector1 <- BERT_feats$output %>%
#   dplyr::filter(
#     sequence_index == 1, 
#     token == "[CLS]", 
#     layer_index == 12
#   ) %>% 
#   dplyr::select(dplyr::starts_with("V")) %>% 
#   unlist()
# output_vector1

In [74]:
# Extract output vectors for all sentences...
# These vectors can be used as input features for downstream models.
# output_vectors <- BERT_feats$output %>% 
#   dplyr::filter(token_index == 1, layer_index == 12)
# output_vectors

In [56]:
# tokens <- tokenize_text(text = test,
#                         ckpt_dir = BERT_PRETRAINED_DIR)