### Google Colab Setup ###

In [2]:
# upload /bert which is in my Desktop nlp folder
# upload the bert base model /uncased_L-12_H-768_A-12, which is also in my Desktop nlp folder
# upload data

In [None]:
### Google Colab Mount Drive ###

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
# Select Tensorflow version
# Google default is 2.x, but it does not work with Bert pretraining
# gives flag error
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)

In [None]:
# Check for GPU, please give me Tesla P100 PCI-E 16 GB
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
# cd to where bert code is
# Absolute path under the mount point used by drive.mount('/content/drive'),
# so the cell can be re-run without depending on the current directory.
%cd /content/drive/My\ Drive/bert

In [None]:
# download transformer repo
#!pip install transformers

In [None]:
# download pytorch-pretrained-bert repo
!pip install pytorch-pretrained-bert

In [None]:

from transformers import BertConfig, BertModel, BertForPreTraining, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_json_file('../uncased_L-12_H-768_A-12/bert_config.json')

### Avg Word2Vec ###

In [1]:
import numpy as np
import pandas as pd
import re
import gensim
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# download GoogleNews-vectors-negative300.bin
# should be in my Desktop nlp folder

# load word2vec matrix
model = gensim.models.KeyedVectors.load_word2vec_format('../../../nlp/GoogleNews-vectors-negative300.bin', binary = True) 

In [None]:
# test
model.word_vec('social')

In [None]:
# Represent each sentence as the average of the word2vec vector of
# each individual word (with names removed)

# preprocess a string, remove punctuation, convert to lowercase
# text: string
# return: preprocessed string
def preprocess(text):
    """Replace every character that is neither a word character nor whitespace
    with a single space, then lowercase the result.

    text: string
    return: preprocessed string
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    lowered = without_punctuation.lower()
    return lowered

# get the mean vector for a string
# words: string
def get_mean_vector(model, words):
    """Average the word vectors of the usable words in a string.

    A word is usable when it is in the model vocabulary and is not one of
    the character names listed below.

    model: word2vec KeyedVectors-like object exposing word_vec(word) and
           either key_to_index (gensim >= 4) or vocab (gensim < 4)
    words: whitespace-separated string (already preprocessed)
    return: element-wise mean of the word vectors, or an empty list when no
            usable word remains
    """
    names = {'amy', 'jenny', 'mitch', 'john', 'alice', 'sam', 'jeff', 'mark', 'kate', 'jane',
             'naomi', 'noah', 'matthew', 'emma', 'neil', 'james', 'susan', 'olivia', 'jacob', 'tony'}
    # gensim 4 removed KeyedVectors.vocab in favour of key_to_index; support
    # both so the notebook runs on either version.
    vocabulary = getattr(model, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.vocab
    # remove out-of-vocabulary words and names
    kept = [word for word in words.split() if word in vocabulary and word not in names]
    if not kept:
        return []
    return np.mean([model.word_vec(word) for word in kept], axis=0)

# get the cosine similarity of two mean vectors
# context: avg word2vec vector for context sentence(s)
# choice: avg word2vec vector for choice sentence(s)
# return: float
def get_similarity(context, choice):
    """Cosine similarity between two averaged sentence vectors.

    context: avg word2vec vector for context sentence(s)
    choice: avg word2vec vector for choice sentence(s)
    return: 1x1 array holding the similarity (callers read [0][0])

    get_mean_vector returns an empty list when a sentence has no usable
    word; such a pair is scored 0.0 instead of raising on `.reshape`.
    """
    context = np.asarray(context)
    choice = np.asarray(choice)
    if context.size == 0 or choice.size == 0:
        return np.zeros((1, 1))
    # -1 lets the vector dimension follow the embedding model instead of
    # being fixed to 300.
    return cosine_similarity(context.reshape(1, -1), choice.reshape(1, -1))

# solve social mcq
# model: word2vec model
# test_folder: directory containing test file
# return: list of pairwise similarities between each context and choice
# ie. every question takes 5 rows if there are 5 choices
def word2vec_solver(model, test_folder):
    """Score every (context, choice) row of test.tsv with averaged word2vec.

    model: word2vec model
    test_folder: directory containing test.tsv (tab-separated, with the
                 columns '#1 String', '#2 String' and 'Quality')
    return: list of [similarity, label] pairs, one per row of the file

    Side effect: the same pairs are written to similarity.csv in test_folder.
    """
    import os

    # os.path.join works whether or not test_folder ends with a separator.
    df = pd.read_csv(os.path.join(test_folder, 'test.tsv'), sep='\t')
    similarity = []
    for _, row in df.iterrows():
        context_vec = get_mean_vector(model, preprocess(row['#1 String']))
        choice_vec = get_mean_vector(model, preprocess(row['#2 String']))
        sim = get_similarity(context_vec, choice_vec)
        similarity.append([sim[0][0], row['Quality']])
    pd.DataFrame(similarity, columns=['similarity', 'label']).to_csv(
        os.path.join(test_folder, 'similarity.csv'), header=True, index=False)
    return similarity

# get accuracy
def mcq_accuracy(similarity, num_choices):
    """Fraction of questions whose first choice has the highest score.

    similarity: sequence of rows whose first element is the score; the rows
                of one question are consecutive and index 0 of each group is
                counted as the correct choice
    num_choices: number of rows per question
    return: accuracy over the questions present in `similarity`
    """
    scores = np.array(similarity)[:, 0].reshape((-1, num_choices))
    predicted_labels = np.argmax(scores, axis=1)
    # Normalise by the actual number of questions rather than a fixed 125.
    return np.sum(predicted_labels == 0) / len(predicted_labels)

# get recall@k
# correct answer is predicted in top k
from collections import Counter
def top_k_correct(similarity, num_choices, topk):
    """Recall@k: fraction of questions whose correct choice ranks in the top k.

    similarity: sequence of rows whose first element is the score; the rows
                of one question are consecutive and index 0 of each group is
                the correct choice
    num_choices: number of rows per question
    topk: the correct choice counts as a hit when fewer than `topk` choices
          score strictly higher than it
    return: hit rate over the questions present in `similarity`
    """
    scores = np.array(similarity)[:, 0].reshape((-1, num_choices))
    # Rank of the correct choice = number of choices scoring strictly higher,
    # i.e. its first position in the descending sort of the row.
    ranks = (scores > scores[:, :1]).sum(axis=1)
    # Normalise by the actual number of questions rather than a fixed 125.
    return np.sum(ranks < topk) / len(ranks)


In [None]:
# Example to use Word2VecSolver
similarity = word2vec_solver(model, "./goal_mcq_full/five_choices/fold10/")
# Call mcq_accuracy (five choices per question) instead of printing the function object.
print(mcq_accuracy(similarity, 5))
print(top_k_correct(similarity, 5, 2))



### Pretrained Bert Next Sent ###

In [None]:
# Run below on google colab

In [None]:
### Google Colab Mount Drive ###

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
# Absolute path under the mount point used by drive.mount('/content/drive'),
# so the cell can be re-run without depending on the current directory.
%cd /content/drive/My\ Drive/bert

In [None]:
!pip install transformers

In [None]:
from torch.nn.functional import softmax
from transformers import BertConfig, BertModel, BertForNextSentencePrediction, BertForPreTraining, BertTokenizer

In [None]:
# Test
seq_A = 'I like cookies !'
seq_B = 'Mitochondia are the powerhouse of the cell .'

In [None]:
# load pretrained model and pretrained tokenizer
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# logits always changing
# and wrong predictions
# Building the model from a config alone loads no checkpoint weights,
# so the parameters are freshly initialised on every run.
# BertConfig is imported directly from transformers; `modeling_bert` was never imported.
config = BertConfig.from_json_file("../social/uncased_L-12_H-768_A-12/bert_config.json")
model = BertForNextSentencePrediction(config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# correct and stable
config = BertConfig.from_json_file('../social/uncased_L-12_H-768_A-12/bert_config.json')
model = BertForPreTraining.from_pretrained('../social/uncased_L-12_H-768_A-12/bert_model.ckpt.index', from_tf=True, config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# combined as one input to the model
encoded = tokenizer.encode_plus(seq_A, text_pair=seq_B, return_tensors='pt')
print(encoded)

In [None]:
# Run the encoded sentence pair through whichever `model` was defined last.
# NOTE(review): index 0 is the next-sentence logits for
# BertForNextSentencePrediction; for the BertForPreTraining variant loaded
# above, index 0 is presumably the masked-LM prediction scores and the
# next-sentence logits sit at index 1 -- confirm against the transformers docs.
seq_relationship_logits = model(**encoded)[0]
print(seq_relationship_logits)

In [None]:
# convert logits to probabilities 
# index 0: sequence B is a continuation of sequence A
# index 1: sequence B is a random sequence
probs = softmax(seq_relationship_logits, dim = 1)
print(probs[0][0])

In [None]:
import numpy as np
label = np.argmax(probs.detach().numpy(), axis = 1)
print(label)

In [None]:
# solve from files 
# ultimate bigthree: config, model, tokenizer

In [None]:
import pandas as pd

In [None]:
model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# reverse #1 String and #2 String for resolution prediction
def bert_nextsent_solver(test_folder):
    """Score every sentence pair of test.tsv with the next-sentence head.

    Uses the notebook-level `tokenizer` and `model`.

    test_folder: directory containing test.tsv (tab-separated, with the
                 columns '#1 String', '#2 String' and 'Quality')
    return: list of [probability at index 0, predicted label, 1 - Quality],
            one entry per row of the file

    Side effect: the same rows are written to bert_pretrained.csv in
    test_folder.
    """
    import os

    # os.path.join works whether or not test_folder ends with a separator.
    df = pd.read_csv(os.path.join(test_folder, "test.tsv"), sep='\t')
    predictions = []
    for _, row in df.iterrows():
        encoded = tokenizer.encode_plus(row['#1 String'], text_pair=row['#2 String'], return_tensors='pt')
        seq_relationship_logits = model(**encoded)[0]
        probs = softmax(seq_relationship_logits, dim=1).detach().numpy()
        predicted_label = np.argmax(probs, axis=1)
        # Stored label is 1 - Quality, so a Quality of 1 is compared against predicted index 0.
        predictions.append([probs[0][0], predicted_label[0], 1 - row['Quality']])
    # Write the results once, after the loop, instead of rewriting the whole
    # file on every row.
    pd.DataFrame(predictions, columns=['similarity', 'predicted_label', 'label']).to_csv(
        os.path.join(test_folder, 'bert_pretrained.csv'), header=True, index=False)
    return predictions

In [None]:
# accuracy of binary classification
def task_accuracy(predictions):
    """Share of rows whose predicted label (column 1) equals the gold label (column 2).

    predictions: sequence of [score, predicted_label, gold_label] rows
    return: fraction of matching rows
    """
    table = np.array(predictions)
    predicted, gold = table[:, 1], table[:, 2]
    matches = (predicted == gold).sum()
    return matches / len(predicted)

In [None]:
# accuracy for mcq 
def mcq_accuracy(predictions, num_choices):
    """Fraction of questions whose first choice has the highest score.

    predictions: sequence of rows whose first element is the score; the rows
                 of one question are consecutive and index 0 of each group is
                 counted as the correct choice
    num_choices: number of rows per question
    return: accuracy over the questions present in `predictions`
    """
    scores = np.array(predictions)[:, 0].reshape((-1, num_choices))
    predicted_labels = np.argmax(scores, axis=1)
    # Normalise by the actual number of questions rather than a fixed 125.
    return np.sum(predicted_labels == 0) / len(predicted_labels)

In [None]:
# Example
predictions = bert_nextsent_solver("./fold1/")
print(task_accuracy(predictions))
print(mcq_accuracy(predictions, 2))

### Bert Next Sent Trained Attention ###

In [None]:
# pretrained the attention layer/encoder
# code file: create_pretraining_data.py
#            run_pretraining.py
# data: two sets of text files, resolution + outlook/full story


In [None]:
!python create_pretraining_data.py \
  --input_file=../attention_cv/fold1.txt \
  --output_file=../fold1/tf_examples.tfrecord \
  --vocab_file=../uncased_L-12_H-768_A-12/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --dupe_factor=5

In [None]:
!python run_pretraining.py \
  --input_file=../fold1/tf_examples.tfrecord \
  --output_dir=../fold1/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=../uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=../uncased_L-12_H-768_A-12/bert_model.ckpt \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=100\
  --num_warmup_steps=10 \
  --learning_rate=2e-5

In [None]:
# same set up as bert for next sent prediction no pretrain attention
# except for model
# initialize model using config and tf checkpoint(created in run_pretraining.py)
# The checkpoint is named model.ckpt-<step>, as in the model.ckpt-10.index load
# further down; the run above uses --num_train_steps=100, hence model.ckpt-100.
config = BertConfig.from_json_file('../uncased_L-12_H-768_A-12/bert_config.json')
model = BertForPreTraining.from_pretrained('../fold1/pretraining_output/model.ckpt-100.index', from_tf=True, config=config)

### BertForMultipleChoice no Pretrained Attention ###

In [None]:
# code: run_swag.py (from pytorch-pretrained-bert)
#    or run_multiple_choice.py (in the attention repo, not the Bert repo)
# edit preprocessing, add eval on training and testing data

In [None]:
# run_social_output.py is the edited version of run_swag.py.
# The note sits on its own line: text after the backslash would end the
# line continuation and drop every flag below from the command.
!python run_social_output.py \
--do_train \
--do_eval \
--do_lower_case \
--data_dir ../outlook_partial/fold1/ \
--bert_model bert-base-uncased \
--max_seq_length 128 \
--train_batch_size 10 \
--learning_rate 2e-6 \
--num_train_epochs 2.0 \
--output_dir ../fold1_lr/

In [None]:
!python run_social_output.py \
--do_train \
--do_eval \
--do_lower_case \
--data_dir ../resolution_partial/fold1/ \
--bert_model bert-base-uncased \
--max_seq_length 128 \
--train_batch_size 5 \
--learning_rate 2e-6 \
--num_train_epochs 2.0 \
--seed 44 \
--output_dir ../fold1/

In [None]:
# truncate from front of context
!python run_social_output_outlook_full.py \
--do_train \
--do_eval \
--do_lower_case \
--data_dir ../outlook_full/fold1/ \
--bert_model bert-base-uncased \
--max_seq_length 256 \
--train_batch_size 5 \
--learning_rate 2e-6 \
--num_train_epochs 2.0 \
--output_dir ../fold1/

In [None]:
# diff file preprocessing
!python run_social_output_res_full.py \
--do_train \
--do_eval \
--do_lower_case \
--data_dir ../resolution_full/fold1/ \
--bert_model bert-base-uncased \
--max_seq_length 128 \
--train_batch_size 5 \
--learning_rate 2e-6 \
--num_train_epochs 2.0 \
--output_dir ../fold1/
# the data file for this has two columns for context
# context1: seed+buildup+climax
# context2: outlook
# in preprocessing, the three sequences are truncated together
# ie truncate the front of the longest sequence
# which cannot be done in encode plus

### BertForMultipleChoice Pretrained Attention ###

In [None]:
# pretrain the attention layer/encoder, then train BertForMultipleChoice on top
# of the custom attention layer
# directly loading the custom attention weights into BertForMultipleChoice
# results in the error "has no attribute bias"
# the workaround is to dump the pretrained attention weights as a pytorch model for BertForPreTraining, ie the general bert model

In [None]:
# code: create_pretraining_data.py
#       run_pretraining.py
#       run_multiple_choice.py

In [None]:
!python create_pretraining_data.py \
  --input_file=../attention_cv_full/fold1.txt \
  --output_file=../fold1/tf_examples.tfrecord \
  --vocab_file=../uncased_L-12_H-768_A-12/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --dupe_factor=5

In [None]:
!python run_pretraining.py \
  --input_file=../fold1/tf_examples.tfrecord \
  --output_dir=../fold1/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=../uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=../uncased_L-12_H-768_A-12/bert_model.ckpt \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=10 \
  --num_warmup_steps=10 \
  --learning_rate=1e-7

In [None]:
# save pretrained attention weights
# Bert config only loads configuration, not the weights
config = BertConfig.from_json_file('../uncased_L-12_H-768_A-12/bert_config.json')
model = BertForPreTraining.from_pretrained('../fold1/pretraining_output/model.ckpt-10.index', from_tf=True, config=config)
model.save_pretrained("../fold1/pretraining_output/")

In [None]:
!python ./run_multiple_choice.py \
--model_type bert \
--task_name swag \
--model_name_or_path ../fold1/pretraining_output/ \
--config_name ../uncased_L-12_H-768_A-12/bert_config.json \
--tokenizer_name bert-base-uncased \
--do_train \
--do_eval \
--do_test \
--do_lower_case \
--data_dir ../outlook_full/fold1/ \
--learning_rate 5e-5 \
--num_train_epochs 2 \
--max_seq_length 128 \
--output_dir ../fold1/outlook_output/ \
--per_gpu_eval_batch_size=4 \
--per_gpu_train_batch_size=4 \
--gradient_accumulation_steps 4 \
--overwrite_output