In [1]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import torch
from bpemb import BPEmb
from tqdm import tqdm
import nltk


In [2]:
# Training splits: one DataFrame per language file (eng, jap, fin).
dft_eng, dft_jap, dft_fin = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/dft_eng.csv',
        '../../data/dft_jap.csv',
        '../../data/dft_fin.csv',
    )
)

# Validation splits, in the same language order.
dfv_eng, dfv_jap, dfv_fin = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/dfv_eng.csv',
        '../../data/dfv_jap.csv',
        '../../data/dfv_fin.csv',
    )
)

# Question word counts.
word_count = pd.read_csv('../../data/question_word_count.csv')

# Preview the English training frame as the cell output.
dft_eng.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,answer_start,answer_text,question_text_tokenized,document_plaintext_tokenized,answer_text_tokenized,labels
0,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': array([159]), 'answer_text': ...",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,[159],['1920s'],"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",['1920s'],1
1,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': array([610]), 'answer_text': ...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,[610],['Sully Prudhomme'],"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...","['sully', 'prudhomme']",1
2,When is the dialectical method used?,Dialectic,english,"{'answer_start': array([129]), 'answer_text': ...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,[129],['discourse between two or more people holding...,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...","['discourse', 'between', 'two', 'or', 'more', ...",1
3,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': array([88]), 'answer_text': a...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,[88],['Sejong the Great'],"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...","['sejong', 'the', 'great']",1
4,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': array([0]), 'answer_text': ar...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,[0],"['Grasshoppers are plant-eaters, with a few sp...","['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...","['grasshoppers', 'are', 'plant-eaters', ',', '...",1


In [3]:
# English BPEmb model: dim=100 embeddings, 25k word-piece vocabulary (vs=25000).
bpemb_en = BPEmb(
    lang='en',
    dim=100,
    vs=25000,
)

In [4]:
def get_bpemb_features(dataset, bpemb):
    """Turn each document into one fixed-size vector by averaging its embeddings.

    Args:
        dataset: DataFrame-like object exposing ``document_plaintext`` (an
            iterable of documents) and ``labels``.
        bpemb: Object whose ``embed(text)`` tokenizes and embeds a document,
            returning a 2-D array with one row per token (e.g. a ``BPEmb``
            instance).

    Returns:
        Tuple ``(X, y)``: ``X`` is a list holding one mean-pooled vector per
        document, ``y`` is ``dataset.labels`` unchanged.
    """
    X = []
    for doc in dataset.document_plaintext:
        # Missing cells are read by pandas as NaN (a float) rather than a
        # string; treat them like an empty document.
        text = doc if isinstance(doc, str) else ''
        token_vectors = bpemb.embed(text)
        if token_vectors.shape[0] == 0:
            # Averaging zero rows would produce NaN features, so fall back to
            # an all-zero vector with the same embedding width.
            X.append(np.zeros(token_vectors.shape[1:], dtype=token_vectors.dtype))
        else:
            X.append(token_vectors.mean(0))
    y = dataset.labels

    return X, y

In [5]:
# Mean-pooled BPEmb features for the English training and validation frames.
X_train, y_train = get_bpemb_features(dft_eng, bpemb_en)
X_test, y_test = get_bpemb_features(dfv_eng, bpemb_en)

# L2-regularised logistic regression; fit() returns the estimator itself, so
# construction and fitting can be chained.
lr_bpemb = LogisticRegression(
    penalty='l2',
    max_iter=1000,
    multi_class='multinomial',
).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
lr_bpemb

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [6]:
# Both names hold predictions for the same X_test, so run predict() only once.
preds_bpemb = lr_bpemb.predict(X_test)
# Independent copy so the two names never alias the same array.
preds_valid_bpemb = preds_bpemb.copy()

In [7]:
# Classification metrics for the BPEmb + logistic-regression predictions,
# displayed with one row per class / average.
report = classification_report(y_test, preds_bpemb, output_dict=True)
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
0,0.708061,0.656566,0.681342,495.0
1,0.679849,0.729293,0.703704,495.0
accuracy,0.692929,0.692929,0.692929,0.692929
macro avg,0.693955,0.692929,0.692523,990.0
weighted avg,0.693955,0.692929,0.692523,990.0


Text generation

In [8]:
# Checkpoint name shared by the tokenizer and the causal language model.
model_checkpoint = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

# output_hidden_states=True so that forward passes expose ``hidden_states``,
# which get_hidden_states() reads from the model output.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    output_hidden_states=True,
)

In [28]:
def generate_text(text, model, tokenizer, max_length=100):
    """Continue ``text`` with ``model`` and return the decoded result.

    The prompt is encoded with ``return_tensors='pt'``, handed to
    ``model.generate`` together with ``max_length``, and the first returned
    sequence is decoded with special tokens skipped.
    """
    prompt_ids = tokenizer.encode(text, return_tensors='pt')
    generated = model.generate(prompt_ids, max_length=max_length)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_hidden_states(text, model, tokenizer):
    """Run one forward pass over ``text`` and return the model's hidden states.

    Args:
        text: Prompt string to encode.
        model: Callable model; invoked as ``model(input_ids, output_hidden_states=True)``.
        tokenizer: Tokenizer providing ``encode(text, return_tensors='pt')``.

    Returns:
        The ``hidden_states`` attribute of the model output.
    """
    input_ids = tokenizer.encode(text, return_tensors='pt')
    # Ask for hidden states explicitly so the result does not depend on how the
    # model happened to be configured when it was loaded.
    output = model(input_ids, output_hidden_states=True)
    return output.hidden_states

def generate_all(text, model, tokenizer, max_length, num_beams, no_reapeat_ngrams, temperature, top_k, top_p):
    """Generate one continuation of ``text`` per decoding strategy.

    Args:
        text: Prompt string.
        model: Model exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_length: Passed as ``max_length`` to every ``generate`` call.
        num_beams: Beam count for the two beam-search runs.
        no_reapeat_ngrams: Passed as ``no_repeat_ngram_size`` to the penalised
            beam-search run (parameter name kept as-is for existing callers).
        temperature: Temperature for the plain sampling run.
        top_k: ``top_k`` for the top-k and the top-k + top-p sampling runs.
        top_p: ``top_p`` for the top-k + top-p sampling run.

    Returns:
        List of six decoded strings, in this order: greedy, beam search,
        beam search with n-gram penalty, temperature sampling, top-k sampling,
        top-k + top-p sampling.
    """
    input_ids = tokenizer.encode(text, return_tensors='pt')

    # One kwargs dict per strategy, in the order of the returned list.
    strategy_kwargs = [
        # Greedy decoding.
        dict(do_sample=False),
        # Beam search must be deterministic: do_sample=True here would turn it
        # into beam sampling and make it inconsistent with the penalised beam
        # run below.
        dict(do_sample=False, num_beams=num_beams),
        # Beam search with an n-gram repetition penalty.
        dict(no_repeat_ngram_size=no_reapeat_ngrams, num_beams=num_beams),
        # Sampling with temperature.
        dict(do_sample=True, temperature=temperature),
        # Top-k sampling.
        dict(do_sample=True, top_k=top_k),
        # Top-k combined with top-p sampling.
        dict(do_sample=True, top_k=top_k, top_p=top_p),
    ]

    # Run every generation first, then decode, keeping the call order stable.
    output_lst = [
        model.generate(input_ids, max_length=max_length, **kwargs)
        for kwargs in strategy_kwargs
    ]
    decoded_samples = [tokenizer.decode(g[0], skip_special_tokens=True) for g in output_lst]

    return decoded_samples

def generate_top_p(text, model, tokenizer, max_length, top_p, top_k):
    """Sample one continuation of ``text`` using both ``top_k`` and ``top_p``.

    Encodes the prompt, calls ``model.generate`` with sampling enabled, and
    returns the first sequence decoded with special tokens skipped.
    """
    sampling_kwargs = {'do_sample': True, 'top_k': top_k, 'top_p': top_p}
    encoded_prompt = tokenizer.encode(text, return_tensors='pt')
    sampled = model.generate(encoded_prompt, max_length=max_length, **sampling_kwargs)
    return tokenizer.decode(sampled[0], skip_special_tokens=True)


In [None]:
# Run every decoding strategy on the same prompt; keyword arguments make it
# clear which number controls which setting.
samples = generate_all(
    "The weather is",
    model,
    tokenizer,
    max_length=50,
    num_beams=5,
    no_reapeat_ngrams=2,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(samples)


In [42]:
# Disabled alternative: top-k / top-p sampling for the same prompt.
#generate_top_p("The weather is", model, tokenizer, 50, 0.95, 50)
# Display the last entry of the hidden states returned for the prompt
# (presumably the final layer's states, one row per prompt token -- confirm).
get_hidden_states("The weather is", model, tokenizer)[-1]


tensor([[[-0.0301,  0.3668,  0.0901,  ..., -0.2221,  0.1273, -0.1497],
         [ 0.5815,  0.2135,  0.1955,  ...,  0.4937,  0.0897, -0.2369],
         [ 0.2250,  0.5953, -0.3460,  ...,  0.3857, -0.0740,  0.1617]]],
       grad_fn=<ViewBackward0>)

In [11]:
# Encode the prompt once; the generation cells below all reuse `input_ids`.
input_ids = tokenizer.encode('I was meaning to', return_tensors='pt')

In [None]:
# Generate text until the output length (which includes the context length) reaches 50.
# No sampling or beam arguments are passed, so generate() runs with its default
# decoding settings (stored as `greedy_output`).
greedy_output = model.generate(input_ids, max_length=50)

# Print a header followed by a 100-character rule, then the first returned
# sequence decoded with special tokens skipped.
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

In [None]:
# Beam search with 5 beams on the shared prompt.
output = model.generate(input_ids, max_length=50, num_beams=5)
print("Beam Search Output:")
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Same beam search with no_repeat_ngram_size=2 (the n-gram penalty named in the
# label). `output` is reassigned, so only this second result remains afterwards.
output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2)
print("N-Gram Penalty on Beam Search Output:")
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Seed the global torch RNG so the sampled continuations are reproducible when
# the notebook is re-run.
torch.manual_seed(42)

# NOTE(review): `greedy_output` is reused here for sampled sequences and
# overwrites the greedy result from the earlier cell; the name is kept so that
# existing references keep working.

# Sampling without an explicit temperature.
greedy_output = model.generate(input_ids, max_length=50, do_sample=True)
print("Sampling Output:")
# do_sample is a generate() option, not a decode() option, so it is not passed
# to decode().
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

# Sampling with a very high temperature (10.).
greedy_output = model.generate(input_ids, max_length=50, do_sample=True, temperature=10.)
print("Sampling Output:")
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

# Sampling with a low temperature (0.5).
greedy_output = model.generate(input_ids, max_length=50, do_sample=True, temperature=0.5)
print("Sampling Output:")
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))