In [2]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import torch
from bpemb import BPEmb
from tqdm import tqdm
import nltk


In [3]:
# import training data
dft_eng = pd.read_csv('../../data/dft_eng.csv')
dft_jap = pd.read_csv('../../data/dft_jap.csv')
dft_fin = pd.read_csv('../../data/dft_fin.csv')

# import validation data
dfv_eng = pd.read_csv('../../data/dfv_eng.csv')
dfv_jap = pd.read_csv('../../data/dfv_jap.csv')
dfv_fin = pd.read_csv('../../data/dfv_fin.csv')

#import word count
word_count = pd.read_csv('../../data/question_word_count.csv')

dft_eng.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,answer_start,answer_text,question_text_tokenized,document_plaintext_tokenized,answer_text_tokenized,labels
0,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': array([159]), 'answer_text': ...",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,[159],['1920s'],"['when', 'was', 'quantum', 'field', 'theory', ...","['quantum', 'field', 'theory', 'naturally', 'b...",['1920s'],1
1,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': array([610]), 'answer_text': ...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,[610],['Sully Prudhomme'],"['who', 'was', 'the', 'first', 'nobel', 'prize...","['the', 'nobel', 'prize', 'in', 'literature', ...","['sully', 'prudhomme']",1
2,When is the dialectical method used?,Dialectic,english,"{'answer_start': array([129]), 'answer_text': ...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,[129],['discourse between two or more people holding...,"['when', 'is', 'the', 'dialectical', 'method',...","['dialectic', 'or', 'dialectics', '(', 'greek'...","['discourse', 'between', 'two', 'or', 'more', ...",1
3,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': array([88]), 'answer_text': a...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,[88],['Sejong the Great'],"['who', 'invented', 'hangul', '?']","['hangul', 'was', 'personally', 'created', 'an...","['sejong', 'the', 'great']",1
4,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': array([0]), 'answer_text': ar...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,[0],"['Grasshoppers are plant-eaters, with a few sp...","['what', 'do', 'grasshoppers', 'eat', '?']","['grasshoppers', 'are', 'plant-eaters', ',', '...","['grasshoppers', 'are', 'plant-eaters', ',', '...",1


In [4]:
# Load english model with 25k word-pieces
bpemb_en = BPEmb(lang='en', dim=100, vs=25000)  

In [5]:
def get_bpemb_features(dataset, bpemb):
  # With bpemb we can tokenize and embed an entire document using .embed(x)
  X = [bpemb.embed(x).mean(0) for x in (dataset.document_plaintext)]
  y = dataset.labels
 
  return X,y

In [6]:
X_train,y_train = get_bpemb_features(dft_eng, bpemb_en)
X_test,y_test = get_bpemb_features(dfv_eng, bpemb_en)
lr_bpemb = LogisticRegression(penalty='l2', max_iter=1000, multi_class='multinomial')
lr_bpemb.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [7]:
preds_bpemb = lr_bpemb.predict(X_test)
preds_valid_bpemb = lr_bpemb.predict(X_test)

In [8]:
# BPEmb model 
report = classification_report(y_test, preds_bpemb, output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.708061,0.656566,0.681342,495.0
1,0.679849,0.729293,0.703704,495.0
accuracy,0.692929,0.692929,0.692929,0.692929
macro avg,0.693955,0.692929,0.692523,990.0
weighted avg,0.693955,0.692929,0.692523,990.0


Text generation

In [9]:
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint, output_hidden_states = True)

In [10]:
def generate_text(text, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def get_hidden_states(text, model, tokenizer):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output = model(input_ids)
    return output.hidden_states

def generate_all(text, model, tokenizer, max_length, num_beams, no_reapeat_ngrams, temperature, top_k, top_p):
    input_ids = tokenizer.encode(text, return_tensors='pt')

    greedy_output = model.generate(input_ids, max_length=max_length, do_sample=False)
    beam_search_output = model.generate(input_ids, max_length=max_length, do_sample=True, num_beams=num_beams)
    n_grams_output = model.generate(input_ids, max_length=max_length, no_repeat_ngram_size=no_reapeat_ngrams, num_beams=num_beams)
    sample_output = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=temperature)
    top_k_otput = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=top_k)
    top_p_output = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=top_k, top_p=top_p)

    output_lst = [greedy_output, beam_search_output, n_grams_output, sample_output, top_k_otput, top_p_output]
    decoded_samples = [tokenizer.decode(g[0], skip_special_tokens=True) for g in output_lst]

    return decoded_samples

def generate_top_p(text, model, tokenizer, max_length, top_p, top_k):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, do_sample=True,top_k=top_k, top_p=top_p)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_sample(text, model, tokenizer, max_length, temperature):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [11]:
samples = generate_top_p(dft_eng.question_text[0], model, tokenizer, 100, 0.9, 50)
print(samples)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When was quantum field theory developed?




The notion of a quantum field theory was first developed in 1947 with the creation of the first quantum field theory, but the following theories developed in the 1950s and 1960s and their publication, "Quantum Field Theory in the Early Twentieth Century" by John St. Louis, was a classic example of a very well understood theory.
St. Louis, in a letter from the physicist David Schindler, said:
"


In [12]:
#generate_top_p("The weather is", model, tokenizer, 50, 0.95, 50)
get_hidden_states("The weather is", model, tokenizer)[-1]


tensor([[[-0.0301,  0.3668,  0.0901,  ..., -0.2221,  0.1273, -0.1497],
         [ 0.5815,  0.2135,  0.1955,  ...,  0.4937,  0.0897, -0.2369],
         [ 0.2250,  0.5953, -0.3460,  ...,  0.3857, -0.0740,  0.1617]]],
       grad_fn=<ViewBackward0>)

In [13]:
input_ids = tokenizer.encode('I was meaning to', return_tensors='pt')

In [14]:
def get_sentence_perplexity(sentence, model, vocabulary, seq_len):
  states = (torch.zeros(lstm_layers, 1, lstm_dim).to(device),
              torch.zeros(lstm_layers, 1, lstm_dim).to(device))
  token_ids = [{'input_ids': bpemb_en.encode_ids(sentence)}]
  batch = collate_batch_bilstm(token_ids)
  loss_fn = torch.nn.CrossEntropyLoss()
  logits, states = model(batch[0].to(device), batch[1].to(device), states)

  target = batch[2].to(device)[:len(token_ids[0]['input_ids'])-1]
  loss = loss_fn(logits, target.reshape(-1))
  loss = loss.detach().cpu().numpy()
  return np.exp(loss)

In [15]:
from datasets import load_dataset

dataset = load_dataset("wikitext")

ValueError: Config name is missing.
Please pick one among the available configs: ['wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', 'wikitext-2-raw-v1']
Example of usage:
	`load_dataset('wikitext', 'wikitext-103-v1')`

In [16]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

Found cached dataset wikitext (C:/Users/Hallgrimur/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [17]:
"\n\n".join(dft_eng["question_text"])
encodings
device = "cpu"


In [None]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over input tokens.
        # Multiply it with trg_len to get the summation instead of average.
        # We will take average over all the tokens to get the true average
        # in the last step of this example.
        neg_log_likelihood = outputs.loss * trg_len

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

  0%|          | 1/562 [00:00<-1:58:43, -7.25it/s]


KeyboardInterrupt: 

tensor(39.2577)

Fine tuning models 

In [18]:
import transformers
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import math

In [19]:
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [20]:
tokenizer(dft_eng.document_plaintext[0])

{'input_ids': [24915, 388, 2214, 4583, 8752, 2540, 351, 262, 2050, 286, 31094, 12213, 11, 355, 262, 31094, 2214, 373, 262, 691, 1900, 15993, 2214, 355, 286, 262, 14062, 82, 3693, 23, 5974, 16], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
block_size = 128
def tokenize_function(examples):
    return tokenizer(examples["document_plaintext"])

def group_texts(examples):
    # Concatenate all texts.
    keys = ['attention_mask', 'input_ids']
    concatenated_examples = {k: sum(examples[k], []) for k in keys}
    total_length = len(concatenated_examples[list(keys)[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # this is needed as the used dataset is a subclass of ClassificationDataset, which requires label as a field...
    result["label"] = result["input_ids"].copy()
    result["labels"] = result["input_ids"].copy()
    return result

In [22]:
train = {'train': dft_eng.document_plaintext[x] for x in range(len(dft_eng.document_plaintext))}
train

{'train': 'The previous mayor, Bill Laforet faced a recall election in November 2018, after a resident group submitted in June a list of 5,000 petition signatures that they had collected calling for the action, in excess of the 25% needed to place the measure in front of voters.[85] In the November 2018 general election, Laforet was recalled from office and John Roth was elected mayor. The successful recall was the first in the county for at least 25 years.[86]'}

In [23]:
textlst = np.full((1, len(dft_eng.document_plaintext)), 'text').tolist()
train = {textlst[x]: dft_eng.document_plaintext[x] for x in range(len(dft_eng.document_plaintext))}
train

TypeError: unhashable type: 'list'

In [24]:
tokonizedt = [tokenizer(x) for x in dft_eng.question_text]
tokonizedv = [tokenizer(x) for x in dfv_eng.question_text]


In [25]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    max_steps=300
)

  return torch._C._cuda_getDeviceCount() > 0


In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokonizedt,
    eval_dataset=tokonizedv
)

max_steps is given, it will override any value given in num_train_epochs


In [27]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 990
  Batch size = 8


ValueError: expected sequence of length 11 at dim 1 (got 10)

In [28]:
train = load_dataset('tydiqa', 'primary_task')
#unsupervised_imdb_splits = unsupervised_imdb.train_test_split(test_size=0.01)
#print(unsupervised_imdb_splits.keys())
#print(unsupervised_imdb_splits['train'][0])
train

Found cached dataset tydiqa (C:/Users/Hallgrimur/.cache/huggingface/datasets/tydiqa/primary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148)


  0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
import pandas as pd 

df = pd.DataFrame.from_dict(train['train'][0:100000]) 
df

Unnamed: 0,passage_answer_candidates,question_text,document_title,language,annotations,document_plaintext,document_url
0,"{'plaintext_start_byte': [1, 660, 844, 1196, 1...",berapakah jenis ras yang ada didunia?,Ras manusia,indonesian,"{'passage_answer_candidate_index': [-1], 'mini...","\ntransl.\n\nRas (dari bahasa Prancis race, ya...",https://id.wikipedia.org/wiki/Ras%20manusia
1,"{'plaintext_start_byte': [1, 271, 995, 1763, 2...",2018年アメリカで一番治安の悪い州はどこ,デトロイト,japanese,"{'passage_answer_candidate_index': [-1], 'mini...",\n\n\nデトロイト（ /dɨˈtrɔɪt/）は、アメリカ合衆国ミシガン州南東部にある都市...,https://ja.wikipedia.org/wiki/%E3%83%87%E3%83%...
2,"{'plaintext_start_byte': [0, 208, 542, 891, 10...","Je,Ngamia anaweza kaa bila maji kwa muda gani?",Kuku,swahili,"{'passage_answer_candidate_index': [-1], 'mini...",\nKuku (Gallus gallus domesticus) ni ndege ana...,https://sw.wikipedia.org/wiki/Kuku
3,"{'plaintext_start_byte': [5, 401, 1185, 2103, ...",কম্পিউটার বিজ্ঞানের মোট কয়টি শাখা রয়েছে ?,বিজ্ঞান,bengali,"{'passage_answer_candidate_index': [-1], 'mini...",\n\n\n\n\n\n\n\nভৌত বিশ্বের যা কিছু পর্যবেক্ষণ...,https://bn.wikipedia.org/wiki/%E0%A6%AC%E0%A6%...
4,"{'plaintext_start_byte': [0, 395, 716, 1320, 3...",మెదక్ నగర విస్తీర్ణం ఎంత?,మెదక్ జిల్లా,telugu,"{'passage_answer_candidate_index': [-1], 'mini...",మెదక్ జిల్లా తెలంగాణ రాష్ట్రంలోని 31 జిల్లాలలో...,https://te.wikipedia.org/wiki/%E0%B0%AE%E0%B1%...
...,...,...,...,...,...,...,...
99995,"{'plaintext_start_byte': [1, 1142, 3214, 3713,...",మొట్టమొదటి కెమెరా పేరేమిటి ?,కెమెరా,telugu,"{'passage_answer_candidate_index': [-1], 'mini...",\nకెమెరా (ఆంగ్లం: Camera) అనగా స్థిర చిత్రాలను...,https://te.wikipedia.org/wiki/%E0%B0%95%E0%B1%...
99996,"{'plaintext_start_byte': [0, 249, 541, 1064, 1...",大和民族より前に日本列島に住んでいた民族はいる？,大和民族,japanese,"{'passage_answer_candidate_index': [37], 'mini...",\n\n大和民族（やまとみんぞく）は、日本列島の住民の大半を占める民族である。ほとんどが日本...,https://ja.wikipedia.org/wiki/%E5%A4%A7%E5%92%...
99997,"{'plaintext_start_byte': [0, 378, 457, 1348, 1...",كم عدد آيات سورة الحديد؟,سورة الحديد,arabic,"{'passage_answer_candidate_index': [0], 'minim...",\nسورة الحديد هي سورة مدنية عدد آياتها 29 وتر...,https://ar.wikipedia.org/wiki/%D8%B3%D9%88%D8%...
99998,"{'plaintext_start_byte': [0, 544, 675, 1140, 1...",Kuinka monta romaania Stephen King on kirjoitt...,Stephen King,finnish,"{'passage_answer_candidate_index': [-1], 'mini...",\n\n\nStephen Edwin King (s. 21. syyskuuta 194...,https://fi.wikipedia.org/wiki/Stephen%20King


In [37]:
df.loc[df['language'] == 'english']

Unnamed: 0,passage_answer_candidates,question_text,document_title,language,annotations,document_plaintext,document_url
28,"{'plaintext_start_byte': [2, 740, 1381, 1941, ...",When did the art deco movement begin?,Art Deco,english,"{'passage_answer_candidate_index': [-1], 'mini...","\n\nArt Deco, sometimes referred to as Deco, i...",https://en.wikipedia.org/wiki/Art%20Deco
50,"{'plaintext_start_byte': [5, 378, 956, 1342, 3...",Is Creole a pidgin of French?,French-based creole languages,english,"{'passage_answer_candidate_index': [1], 'minim...",\n\n\n\n\nPart of a series on theFrench langua...,https://en.wikipedia.org/wiki/French-based%20c...
65,"{'plaintext_start_byte': [7, 1637, 1938, 2380,...",When was quantum field theory developed?,Quantum field theory,english,"{'passage_answer_candidate_index': [12], 'mini...",\n\n\n\n\n\n\nQuantum field theoryFeynman diag...,https://en.wikipedia.org/wiki/Quantum%20field%...
76,"{'plaintext_start_byte': [2, 284, 580, 837, 10...",What was the highest value of the yen in 2018?,Banknotes of the Japanese yen,english,"{'passage_answer_candidate_index': [-1], 'mini...",\n\nThe banknotes of the Japanese yen are part...,https://en.wikipedia.org/wiki/Banknotes%20of%2...
87,"{'plaintext_start_byte': [3, 118, 357, 1045, 1...",Does plastic decompose at all?,Biodegradable plastic,english,"{'passage_answer_candidate_index': [0], 'minim...",\n\n\nBiodegradable plastics are plastics that...,https://en.wikipedia.org/wiki/Biodegradable%20...
...,...,...,...,...,...,...,...
99899,"{'plaintext_start_byte': [3, 517, 858, 1228, 1...",When was ultrasound first used in medicine?,Medical ultrasound,english,"{'passage_answer_candidate_index': [59], 'mini...",\n\n\nMedical ultrasound (also known as diagno...,https://en.wikipedia.org/wiki/Medical%20ultras...
99917,"{'plaintext_start_byte': [0, 171, 498, 868, 15...",Do steam locomotives have gears?,Geared steam locomotive,english,"{'passage_answer_candidate_index': [0], 'minim...",A geared steam locomotive is a type of steam l...,https://en.wikipedia.org/wiki/Geared%20steam%2...
99942,"{'plaintext_start_byte': [1, 459, 636, 1198, 1...",When was the West Virginia Mountaineers basket...,West Virginia Mountaineers men's basketball,english,"{'passage_answer_candidate_index': [4], 'minim...",\nThe West Virginia Mountaineers men's basketb...,https://en.wikipedia.org/wiki/West%20Virginia%...
99943,"{'plaintext_start_byte': [3, 595, 1443, 2355, ...",How long does it take solar wind to reach the ...,Solar wind,english,"{'passage_answer_candidate_index': [-1], 'mini...",\n\n\nThe solar wind is a stream of charged pa...,https://en.wikipedia.org/wiki/Solar%20wind
