<a href="https://colab.research.google.com/github/konductor000/Domane-adaptation/blob/main/domain_adaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers
!pip install scattertext
!pip install seqeval
!pip install neptune-client

In [None]:
import os
from getpass import getpass

import neptune.new as neptune

# Credentials must never be stored in the notebook: anything committed here is
# readable by everyone with access to the repository. The token is taken from
# the NEPTUNE_API_TOKEN environment variable, or prompted for interactively.
# NOTE(review): the token that used to be hard-coded in this cell has been
# published and should be revoked in the Neptune UI.
if not os.environ.get("NEPTUNE_API_TOKEN"):
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Neptune API token: ")

run = neptune.init(
    project="skorodumov000/ysda-hw5",
    api_token=os.environ["NEPTUNE_API_TOKEN"],
)

In [None]:
import datasets
import typing as tp

In [None]:
conll = datasets.load_dataset("conll2003")
wnut = datasets.load_dataset("wnut_17")

In [None]:
import inspect
inspect.getmro(type(conll))

In [None]:
print(conll.keys(), wnut.keys())

In [None]:
conll["test"].features

In [None]:
conll["test"].dataset_size, len(conll["test"])

In [None]:
conll["test"][50]

In [None]:
# let's view available tags
CONLL_NER_TAGS = conll['train'].features['ner_tags'].feature.names
print(CONLL_NER_TAGS)

In [None]:
wnut['test'].features

In [None]:
wnut["test"].dataset_size, len(wnut["test"])

In [None]:
print(wnut["test"][50])

In [None]:
WNUT_NER_TAGS = wnut['train'].features['ner_tags'].feature.names
print(WNUT_NER_TAGS)

In [None]:
from spacy import displacy

In [None]:
def ner_render(tokens: tp.Sequence[str], ner_tags: tp.Sequence[int],
               tags_list=CONLL_NER_TAGS, title: tp.Optional[str] = None, **kwargs):
    """Render a BIO-tagged sentence with displaCy's manual entity visualizer.

    Args:
        tokens: words of the sentence; they are joined with single spaces to
            form the displayed text.
        ner_tags: one index into ``tags_list`` per token.
        tags_list: tag names such as ``"O"``, ``"B-PER"``, ``"I-creative-work"``.
        title: optional title shown above the rendered sentence.
        **kwargs: accepted and ignored, so a whole dataset row can be passed
            with ``ner_render(**row)``.

    Character offsets are computed on the space-joined text. The entity type
    is everything after the first ``-``, so hyphenated types such as
    ``creative-work`` are kept whole. An ``I-`` tag that does not directly
    continue an entity opens a new span instead of failing or stretching an
    earlier span across untagged tokens.
    """
    pos = 0
    ents = []
    inside_entity = False
    for word, tag_ in zip(tokens, ner_tags):
        tag = tags_list[tag_]
        end = pos + len(word)
        # Split only once: "B-creative-work" -> ("B", "creative-work").
        prefix, _, label = tag.partition("-")
        if prefix == "B" or (prefix == "I" and not inside_entity):
            ents.append({
                "start": pos,
                "end": end,
                "label": label
            })
            inside_entity = True
        elif prefix == "I":
            ents[-1]["end"] = end
        else:
            inside_entity = False
        # +1 accounts for the space inserted between tokens by " ".join below.
        pos = end + 1
    displacy.render({
        "text": " ".join(tokens),
        "ents": ents,
        "title": title
    }, style="ent", manual=True)
            

In [None]:
conll["test"][50]

In [None]:
for test_id in [50, 200]:
    ner_render(**conll["test"][test_id], tags_list=CONLL_NER_TAGS, title = f'conll[{test_id}]')

In [None]:
for test_id in [100, 142]:
    ner_render(**wnut["test"][test_id], tags_list=WNUT_NER_TAGS, title = f'wnut[{test_id}]')

In [None]:
from itertools import chain
from collections import Counter

# Count every type of tag in CONLL and WNUT datasets:

conll_tag_counts = Counter()
wnut_tag_counts = Counter()

for data in conll["test"]:
    for label in data['ner_tags']:
        conll_tag_counts[CONLL_NER_TAGS[label]] += 1

for data in wnut["test"]:
    for label in data['ner_tags']:
        wnut_tag_counts[WNUT_NER_TAGS[label]] += 1

In [None]:
wnut_tag_counts

In [None]:
conll_tag_counts

In [None]:
label_mapping = {
    'O': 'O',
    'B-location': 'B-LOC',
    'I-location': 'I-LOC',
    'B-group': 'B-ORG',
    'B-corporation': 'B-ORG',
    'B-person': 'B-PER',
    'B-creative-work': 'B-MISC',
    'B-product': 'B-MISC',
    'I-person': 'I-PER',
    'I-creative-work': 'I-MISC',
    'I-corporation': 'I-ORG',
    'I-group': 'I-ORG',
    'I-product': 'I-MISC'
}

labelindexmapping = {WNUT_NER_TAGS.index(k):CONLL_NER_TAGS.index(v) for k, v in label_mapping.items()}
print(labelindexmapping)

In [None]:
def convert_label_sequence(example: tp.Dict[str, tp.Any], label_mapping: tp.Dict[str, str]) -> tp.Dict[str, tp.Any]:
    """Return a copy of ``example`` whose ``ner_tags`` are translated through ``label_mapping``.

    Every entry of ``example['ner_tags']`` is looked up in ``label_mapping``
    (a ``KeyError`` is raised for an unmapped tag). The input example is not
    modified; all other fields are copied over unchanged.
    """
    remapped = dict(**example)
    translated_tags = []
    for tag in example['ner_tags']:
        translated_tags.append(label_mapping[tag])
    remapped['ner_tags'] = translated_tags
    return remapped

In [None]:
converted_wnut = wnut.map(lambda x: convert_label_sequence(x, labelindexmapping))

In [None]:
for i in [1, 2, 3]:
    ner_render(**wnut["train"][i], tags_list=WNUT_NER_TAGS, title = f'wnut_train_[{i}]')

In [None]:
for i in [1, 2, 3]:
    ner_render(**converted_wnut["train"][i], tags_list=CONLL_NER_TAGS, 
               title = f'converted_wnut_train_[{i}]')

In [None]:
from IPython.display import HTML, IFrame
HTML("<style>.container { width:98% !important; }</style>")
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
import scattertext as st
import pandas as pd

In [None]:
conll_df = pd.DataFrame([{"text": " ".join(example["tokens"]), "ner": example["ner_tags"], "dataset": "conll"} for example in conll["train"]])
wnut_df = pd.DataFrame([{"text": " ".join(example["tokens"]), "ner": example["ner_tags"], "dataset": "wnut"} for example in converted_wnut["train"]])

In [None]:
df = pd.concat([conll_df, wnut_df])
df['parse'] = df.text.apply(st.whitespace_nlp_with_sentences)
corpus = st.CorpusFromParsedDocuments(df, category_col='dataset', parsed_col='parse') \
    .build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

In [None]:
with pd.option_context('mode.chained_assignment', None):
    # scattertext has pd.SettingWithCopyWarning in ScatterChart._add_term_freq_to_json_df
    html = st.produce_scattertext_explorer(
        corpus,
        category='conll', category_name='CONLL', not_category_name='WNUT',
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )

with open("difference.html", "w", encoding="utf-8") as outf:
    print(html, file=outf)

In [None]:
IFrame("difference.html", width=1200, height=1000)

In [None]:
import numpy as np
import sklearn

import matplotlib.pyplot as plt
%matplotlib inline

import torch 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

from tqdm.auto import tqdm

In [None]:
from transformers import (pipeline, 
        AutoModelForTokenClassification, AutoTokenizer, 
        BertForTokenClassification, BertTokenizer)

# Load pretrained model and tokenizer for English NER task (dslim/bert-base-NER)

checkpoint = 'dslim/bert-base-NER'
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
inspect.getmro(type(model))

In [None]:
model.config

In [None]:
model.classifier

In [None]:
model.bert

In [None]:
ner_render(**conll["test"][10], tags_list=CONLL_NER_TAGS)

In [None]:
encoded = tokenizer(conll["test"][10]["tokens"], is_split_into_words=True)

In [None]:
print(f"Original sentence:\n---\n{' '.join(conll['test'][10]['tokens'])}")

tokenized = " ".join([tokenizer.convert_ids_to_tokens(id_) for id_ in encoded["input_ids"]])
print(f"\nTokenized sentence:\n---\n{tokenized}")


In [None]:
def tokenize_and_preserve_tags(example: tp.Dict[str, tp.Any],
                               tokenizer: BertTokenizer,
                               label2id: tp.Dict[str, int],
                               tokenizer_params={}) -> tp.Dict[str, tp.Any]:
    """Tokenize a pre-split sentence and assign a label id to every word piece.

    Args:
        example: row with ``"tokens"`` (list of words) and ``"ner_tags"``
            (one index into ``CONLL_NER_TAGS`` per word).
        tokenizer: tokenizer called with ``is_split_into_words=True``.
        label2id: maps tag names (e.g. ``"B-LOC"``) to the label ids of the model.
        tokenizer_params: extra keyword arguments forwarded to the tokenizer.

    Returns:
        The tokenizer output, extended with every field of ``example`` plus
        ``"labels"`` (one label id per input id) and ``"text_labels"`` (the
        same labels as tag names).

    The first piece of a word receives the word's tag; following pieces of the
    same word receive the matching continuation label.
    """
    encoded = tokenizer(example["tokens"], is_split_into_words=True, **tokenizer_params)
    # Copy the original fields ("tokens", "ner_tags", ...) into the result.
    encoded.update(example)
    # conll2ner[k] is the model label id of the k-th CoNLL tag name.
    conll2ner = [label2id[i] for i in CONLL_NER_TAGS]
    id2label = {label2id[i]: i for i in label2id}
    
    raw_label_id = 0  # index of the word currently being reassembled
    encoded['labels'] = []
    curr_token = ''  # text of the current word collected from its pieces so far
    for i in encoded["input_ids"]:
        decoded = tokenizer.decode(i)
        if decoded in ['[CLS]', '[SEP]']:
            # Special tokens get label id 0.
            # NOTE(review): the notebook's checks expect this to be "O"; confirm
            # that label2id["O"] == 0 before using a different label map.
            encoded['labels'].append(0)
            continue

        # A regular piece is expected to come after at least one labelled token ([CLS]).
        assert len(encoded['labels']) != 0
        prev_label = encoded['labels'][-1]
        prev_label_text = id2label[prev_label]
        if decoded.startswith("##"):
            # Drop the continuation marker so that piece lengths add up to the word length.
            decoded = decoded[2:]
        
        if len(curr_token) == 0:
            # First piece of a word: take the word's own tag.
            encoded['labels'].append(conll2ner[example['ner_tags'][raw_label_id]])
        else:
            # Later piece of the same word: repeat an "O"/"I-*" label, otherwise use prev_label + 1.
            # NOTE(review): this assumes the id of "I-X" is the id of "B-X" plus one - TODO confirm
            # for label maps other than the one used in this notebook.
            encoded['labels'].append(prev_label if prev_label_text.startswith("O") or \
                                     prev_label_text.startswith("I") else prev_label + 1)
        
        curr_token += decoded
        # The word is complete once the collected pieces are as long as the original word.
        # NOTE(review): a piece whose decoded text does not match the source characters in
        # length (presumably e.g. an unknown-token placeholder) would break this
        # alignment - verify on real data.
        if len(curr_token) == len(encoded['tokens'][raw_label_id]):
            curr_token = ''
            raw_label_id += 1
        
        

    encoded['text_labels'] = [id2label[i] for i in encoded['labels']]
    
        
    assert len(encoded['labels']) == len(encoded["input_ids"])
    return encoded

In [None]:
test_sentence = "AL-AIN, United Arab Emirates 1996-12-06"
test_example = {"tokens": ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'], "ner_tags": [5, 0, 5, 6, 6, 0]}
test_result = tokenize_and_preserve_tags(test_example, tokenizer, model.config.label2id)

print(tokenizer.decode(test_result['input_ids']))

assert tokenizer.decode(test_result['input_ids']) == '[CLS] AL - AIN, United Arab Emirates 1996 - 12 - 06 [SEP]'

print(test_result['text_labels'])

assert test_result['text_labels'] == ['O'] + ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O'] + ["O"]

assert test_result['labels'] == [0, 7, 8, 8, 8, 0, 7, 8, 8, 0, 0, 0, 0, 0, 0]

In [None]:
test_sentence = "His name is Jerry Abrahamson"
test_example = {"tokens": test_sentence.split(" "), "ner_tags": [0, 0, 0, 1, 2]}
test_result = tokenize_and_preserve_tags(test_example, tokenizer, model.config.label2id)

assert tokenizer.decode(test_result['input_ids']) == '[CLS] His name is Jerry Abrahamson [SEP]'

                                     #CLS     His  name is    Jerry    Abraham   ##son      SEP
assert test_result['text_labels'] == ['O'] + ["O", "O", "O", "B-PER", "I-PER",  "I-PER"] + ["O"]

assert test_result['labels'] == [0, 0, 0, 0, 3, 4, 4, 0]

In [None]:
conll = conll.map(lambda x: tokenize_and_preserve_tags(x, tokenizer, model.config.label2id))

In [None]:
wnut = converted_wnut
wnut = wnut.map(lambda x: tokenize_and_preserve_tags(x, tokenizer, model.config.label2id))

In [None]:
conll.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'], output_all_columns=True)
wnut.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'], output_all_columns=True)

In [None]:
from collections import defaultdict
class PadSequence:
    """Collate function that stacks examples and pads selected columns.

    Args:
        padded_columns: names of the columns that hold variable-length tensors;
            they are padded to the longest example of the batch (batch first)
            and moved to ``device``.
        device: target device of the padded tensors.
        padding_values: optional ``{column: pad value}`` mapping. Columns that
            are not listed are padded with 0, which is the previous behaviour.
            Passing e.g. ``{"labels": -100}`` makes padded label positions
            distinguishable from label id 0.

    Calling the object with a list of examples (dicts) returns a plain ``dict``:
    padded columns become one tensor each, every other column stays a list with
    one entry per example. A plain dict is returned on purpose so that a
    mistyped column name raises ``KeyError`` instead of silently giving ``[]``.
    """

    def __init__(self, padded_columns, device=device, padding_values=None):
        self.padded_columns = set(padded_columns)
        self.device = device
        self.padding_values = dict(padding_values or {})

    def __call__(self, batch):
        # Transpose the list of examples into {column: [value per example]}.
        collated = defaultdict(list)
        for example in batch:
            for key, value in example.items():
                collated[key].append(value)

        # Only columns that are actually present in the batch can be padded.
        for key in self.padded_columns.intersection(collated):
            collated[key] = torch.nn.utils.rnn.pad_sequence(
                collated[key],
                batch_first=True,
                padding_value=self.padding_values.get(key, 0),
            ).to(self.device)
        return dict(collated)

In [None]:
conll_test_dataloader = torch.utils.data.DataLoader(conll["test"], batch_size=4, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))

In [None]:
# Smoke test: run the pretrained model on one collated batch.
test_batch = next(iter(conll_test_dataloader))

# The collate function already returns the padded columns on `device`.
# (`Tensor.to` is not in-place, so a loop calling `.to(device)` on the rows
# without keeping the result would have no effect anyway.)
model.to(device)

model_output = model(input_ids=test_batch["input_ids"],
                     token_type_ids=test_batch["token_type_ids"],
                     attention_mask=test_batch["attention_mask"],
                     labels=test_batch["labels"], return_dict=True)

In [None]:
class NamedEntityPredictor:
    """Thin inference wrapper around a token-classification model.

    Args:
        model: token-classification model.
        tokenizer: tokenizer belonging to ``model`` (stored, not used by ``predict``).
        id2label: optional mapping from label id to tag name; defaults to
            ``model.config.id2label``.
    """

    def __init__(self,
                 model: BertForTokenClassification,
                 tokenizer: BertTokenizer,
                 id2label: tp.Optional[tp.Dict[int, str]] = None):
        self.model = model
        self.tokenizer = tokenizer
        self.id2label = model.config.id2label if id2label is None else id2label

    def predict(self, batch: tp.Dict[str, tp.Any]):
        """Predict a tag sequence for every example of a padded batch.

        Args:
            batch: mapping with tensor columns ``input_ids``, ``token_type_ids``
                and ``attention_mask``; a tensor column ``labels`` is optional
                and only needed to obtain a loss.

        Returns:
            dict with ``predicted_labels`` (list of tag-name lists, padding
            positions removed), ``loss`` (``None`` when no labels were given)
            and ``logits``.
        """
        self.model.eval()

        # Use the batch that was passed in (not a notebook-level variable) and
        # keep the result of `.to`, which returns a new tensor.
        model_device = next(self.model.parameters()).device
        inputs = {
            key: batch[key].to(model_device)
            for key in ("input_ids", "token_type_ids", "attention_mask")
        }
        labels = batch["labels"] if "labels" in batch else None
        if isinstance(labels, torch.Tensor):
            inputs["labels"] = labels.to(model_device)

        with torch.no_grad():
            model_output = self.model(**inputs, return_dict=True)

        indices = torch.argmax(model_output.logits, dim=2).cpu().numpy()
        is_real_token = inputs["attention_mask"].cpu().numpy() == 1
        predicted_labels = [
            [self.id2label[int(id_)] for id_ in row[row_mask]]
            for row, row_mask in zip(indices, is_real_token)
        ]

        return {
            "predicted_labels": predicted_labels,
            "loss": model_output.loss,
            "logits": model_output.logits
        }

In [None]:
ner = NamedEntityPredictor(model.to(device), tokenizer)
test_prediction = ner.predict(test_batch)

assert test_prediction['predicted_labels'][2] == list(test_batch["text_labels"][2])

In [None]:
from seqeval.metrics import classification_report, f1_score


def test_model(model, to_print=False):
    """Evaluate ``model`` on the CoNLL and WNUT test splits.

    Args:
        model: token-classification model to evaluate (switched to eval mode).
        to_print: when true, print the seqeval classification report of both
            test sets.

    Returns:
        ``(predicted_labels, conll_f1, wnut_f1)`` where ``predicted_labels`` is
        a dict with keys ``"wnut_test"`` and ``"conll_test"`` holding one list
        of predicted tag names per example.
    """
    padded_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
    splits = {"conll_test": conll["test"], "wnut_test": wnut["test"]}

    model.eval()
    ner = NamedEntityPredictor(model, tokenizer)
    predicted_labels = {"wnut_test": [], "conll_test": []}
    true_labels = {}

    for split_name, split in splits.items():
        dataloader = torch.utils.data.DataLoader(
            split, batch_size=16, collate_fn=PadSequence(padded_columns))
        for batch in dataloader:
            predicted_labels[split_name].extend(ner.predict(batch)["predicted_labels"])
        # Build the reference tags once per split; they feed both the F1 score
        # and the optional report.
        true_labels[split_name] = [list(example["text_labels"]) for example in split]

    conll_f1 = f1_score(y_true=true_labels["conll_test"],
                        y_pred=predicted_labels["conll_test"])
    wnut_f1 = f1_score(y_true=true_labels["wnut_test"],
                       y_pred=predicted_labels["wnut_test"])

    if to_print:
        # The reports are only needed for printing, so they are computed on demand.
        conll_report = classification_report(y_true=true_labels["conll_test"],
                                             y_pred=predicted_labels["conll_test"])
        wnut_report = classification_report(y_true=true_labels["wnut_test"],
                                            y_pred=predicted_labels["wnut_test"])
        print(f"CONLL:\n {conll_report}")
        print(f"WNUT:\n {wnut_report}")

    return predicted_labels, conll_f1, wnut_f1

In [None]:
predicted_labels, conll_f1, wnut_f1 = test_model(model, to_print=True)

In [None]:
def get_sentence_embeddings(model, batch):
    """Return the encoder's first-position hidden state of every sequence.

    Args:
        model: model exposing its encoder as ``model.bert``; it is switched to
            eval mode.
        batch: mapping with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors.

    Returns:
        numpy array with one row per sequence, taken from position 0 of
        ``last_hidden_state``.
    """
    model.eval()
    with torch.no_grad():
        hidden_states = model.bert(input_ids=batch["input_ids"],
                                   token_type_ids=batch["token_type_ids"],
                                   attention_mask=batch["attention_mask"],
                                   return_dict=True)["last_hidden_state"]
    # Slice before leaving the device: only the first position is needed, so
    # there is no reason to copy the full hidden-state tensor to the CPU.
    return hidden_states[:, 0].cpu().numpy()

In [None]:
X = []
Y = []

conll_train_dataloader = torch.utils.data.DataLoader(conll["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))
wnut_train_dataloader = torch.utils.data.DataLoader(wnut["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))

for batch in tqdm(conll_train_dataloader):
    X.append(get_sentence_embeddings(model, batch))
    Y.extend([0] * len(batch["input_ids"]))
    
for batch in tqdm(wnut_train_dataloader):
    X.append(get_sentence_embeddings(model, batch))
    Y.extend([1] * len(batch["input_ids"]))

In [None]:
X = np.concatenate(X)
Y = np.array(Y)

In [None]:
dataset_classifier = sklearn.linear_model.LogisticRegression(max_iter=1000)
dataset_classifier.fit(X, Y)

In [None]:
wnut_test_scores = []

wnut_test_dataloader = torch.utils.data.DataLoader(wnut["test"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))
for batch in tqdm(wnut_test_dataloader):
    x = get_sentence_embeddings(model, batch)
    wnut_test_scores.append(dataset_classifier.predict_proba(x)[:,1])
    

In [None]:
wnut_test_scores = np.concatenate(wnut_test_scores)

In [None]:
plt.hist(wnut_test_scores)
plt.xlabel("WNUT score.")
plt.show()

In [None]:
score_indices = np.argsort(wnut_test_scores)

In [None]:
wnut_predicted_labels = np.array(predicted_labels["wnut_test"], dtype=object
                                )[np.argsort(wnut_test_scores)]

In [None]:
wnut_true_labels = np.array([list(example["text_labels"]) for example in wnut["test"]], dtype=object
                           )[np.argsort(wnut_test_scores)]

In [None]:
predicted_splits = np.array_split(wnut_predicted_labels, 5, )
true_splits = np.array_split(wnut_true_labels, 5)
score_splits = np.array_split(wnut_test_scores[np.argsort(wnut_test_scores)], 5)

In [None]:
print("score\tf1")
for scores, true_split, predicted_split in zip(score_splits, true_splits, predicted_splits):
    mean_score = np.mean(scores)
    f1 = f1_score(true_split, predicted_split)
    print(f"{mean_score:.3f}\t{f1:.3f}")

In [None]:
from transformers import get_scheduler
from tqdm.auto import tqdm
from datetime import datetime


def train_model(num_epochs, model, dataset, model_name='', optimizer=None):
    """Fine-tune ``model`` and log per-epoch metrics to the Neptune ``run``.

    Args:
        num_epochs: number of passes over ``dataset``.
        model: token-classification model to train.
        dataset: iterable of padded batches (a ``DataLoader``) with
            ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``labels``.
        model_name: prefix of the Neptune field names, e.g. ``"full/"``.
        optimizer: optimizer over the parameters of ``model``. When omitted,
            the notebook-level variable ``optimizer`` is used, as before.

    Returns:
        dict with the lists ``model_f1_conll``, ``model_f1_wnut`` and
        ``model_loss`` (mean training loss as a float), one entry per epoch.

    Raises:
        ValueError: if no optimizer is available, or if the optimizer does not
            hold any parameter of ``model`` (a stale optimizer left over from
            another cell would otherwise leave the model untrained without
            any error).
    """
    if optimizer is None:
        optimizer = globals().get("optimizer")
    if optimizer is None:
        raise ValueError("train_model needs an optimizer: pass `optimizer=` or define one first.")
    model_parameter_ids = {id(param) for param in model.parameters()}
    if not any(id(param) in model_parameter_ids
               for group in optimizer.param_groups for param in group["params"]):
        raise ValueError("The optimizer does not contain any parameter of the model being trained.")

    run["train/start"] = datetime.now()

    results = {
        'model_f1_conll': [],
        'model_f1_wnut': [],
        'model_loss': []
    }

    num_batches = len(dataset)
    num_training_steps = num_epochs * num_batches
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    progress_bar = tqdm(range(num_training_steps))
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        # test_model() puts the model into eval mode at the end of every
        # epoch, so training mode has to be restored each time.
        model.train()
        epoch_loss = 0.0
        for batch in dataset:
            outputs = model(input_ids=batch["input_ids"].to(model_device),
                            token_type_ids=batch["token_type_ids"].to(model_device),
                            attention_mask=batch["attention_mask"].to(model_device),
                            labels=batch["labels"].to(model_device),
                            return_dict=True)
            loss = outputs.loss
            loss.backward()
            # .item() detaches the value; summing the tensor itself would keep
            # autograd history alive and log tensors instead of numbers.
            epoch_loss += loss.item()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        mean_loss = epoch_loss / max(num_batches, 1)
        _, f1_conll, f1_wnut = test_model(model)
        results['model_f1_conll'].append(f1_conll)
        results['model_f1_wnut'].append(f1_wnut)
        results['model_loss'].append(mean_loss)

        # The field names say "accuracy" but the logged values are F1 scores;
        # the names are kept so existing Neptune charts stay comparable.
        run[f"train/{model_name}wnut_accuracy"].log(f1_wnut)
        run[f"train/{model_name}conll_accuracy"].log(f1_conll)
        run[f"train/{model_name}loss"].log(mean_loss)

    run["train/end"] = datetime.now()

    return results

In [None]:
num_epochs = 25

In [None]:
from transformers import AdamW

full_train_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
full_train_model.to(device)
optimizer = AdamW(full_train_model.parameters(), lr=5e-5)

In [None]:
#full_results = train_model(num_epochs, full_train_model, wnut_train_dataloader)

In [None]:
class_train_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
class_train_model.to(device)

# Freeze the whole BERT encoder so that only the classification head is trained.
# (No statement may touch `param` after the loop: the loop variable still points
# at the last encoder parameter and would switch its gradient back on.)
for param in class_train_model.bert.parameters():
    param.requires_grad = False

# Built after freezing, so the optimizer only tracks the trainable parameters.
optimizer = AdamW(
    [param for param in class_train_model.parameters() if param.requires_grad],
    lr=5e-5,
)

In [None]:
#class_results = train_model(num_epochs, class_train_model, wnut_train_dataloader)

In [None]:
def class_dataset(conll, wnut, n_splits=5, seed=None):
    """Select the CoNLL training sentences that look most like WNUT.

    Sentence embeddings of both training sets are computed with the
    notebook-level ``model``. ``n_splits`` logistic-regression domain
    classifiers (CoNLL = 0, WNUT = 1) are fitted, each on one shuffled slice of
    the pooled embeddings, and their WNUT probabilities for every CoNLL
    sentence are averaged.

    Args:
        conll: dataset dict with a tokenized ``"train"`` split.
        wnut: dataset dict with a tokenized ``"train"`` split.
        n_splits: number of slices / classifiers.
        seed: optional seed for the shuffle; ``None`` uses NumPy's global
            random state, as before.

    Returns:
        Indices into ``conll["train"]`` of the quarter of sentences with the
        highest mean WNUT probability, best first.
    """
    # Imported here because `import sklearn` alone does not load the submodule.
    from sklearn.linear_model import LogisticRegression

    padded_columns = ['input_ids', 'token_type_ids', 'attention_mask']
    conll_train_dataloader = torch.utils.data.DataLoader(
        conll["train"], batch_size=32, collate_fn=PadSequence(padded_columns))
    wnut_train_dataloader = torch.utils.data.DataLoader(
        wnut["train"], batch_size=32, collate_fn=PadSequence(padded_columns))

    # One forward pass per batch; the CoNLL embeddings are reused both as
    # training data and as the set to score.
    conll_X = np.concatenate(
        [get_sentence_embeddings(model, batch) for batch in tqdm(conll_train_dataloader)])
    wnut_X = np.concatenate(
        [get_sentence_embeddings(model, batch) for batch in tqdm(wnut_train_dataloader)])

    X = np.concatenate([conll_X, wnut_X])
    Y = np.concatenate([np.zeros(len(conll_X), dtype=int), np.ones(len(wnut_X), dtype=int)])

    if seed is None:
        idx = np.random.permutation(len(Y))
    else:
        idx = np.random.default_rng(seed).permutation(len(Y))
    X = X[idx]
    Y = Y[idx]

    preds = np.zeros(len(conll_X))
    for X_, Y_ in zip(np.array_split(X, n_splits), np.array_split(Y, n_splits)):
        dataset_classifier = LogisticRegression(max_iter=1000)
        dataset_classifier.fit(X_, Y_)
        preds += dataset_classifier.predict_proba(conll_X)[:, 1]

    # Mean over the n_splits classifiers (the ranking does not depend on the
    # constant, but the value is now a proper probability).
    preds /= n_splits

    arg_idx = preds.argsort()[::-1][:(len(conll_X) // 4)]

    return arg_idx

In [None]:
arg_idx = class_dataset(conll, wnut)

In [None]:
top_train_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
top_train_model.to(device)
optimizer = AdamW(top_train_model.parameters(), lr=5e-5)

# Dataset.select picks the rows lazily and keeps the torch format; converting
# the whole dataset to a NumPy object array first would load every example
# into memory.
conll_train_dataloader = torch.utils.data.DataLoader(
    conll["train"].select(np.asarray(arg_idx).tolist()),
    batch_size=32,
    collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))

num_epochs = 50

In [None]:
#train_model(num_epochs, top_train_model, conll_train_dataloader, model_name='UnStop50/')

In [None]:
proxy_train_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
proxy_train_model.to(device)
optimizer = AdamW(proxy_train_model.parameters(), lr=5e-5)

wnut_train_dataloader = torch.utils.data.DataLoader(wnut["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))

results = []
num_epochs = 30

In [None]:
# Confidence score of every WNUT training sentence: the mean margin between the
# best and the second-best logit over the tokens predicted as an entity
# (predicted label id != 0). Sentences without predicted entities score 0.
results = []

proxy_train_model.eval()
for batch in tqdm(wnut_train_dataloader):
    with torch.no_grad():
        logits = proxy_train_model(input_ids=batch["input_ids"],
                                   token_type_ids=batch["token_type_ids"],
                                   attention_mask=batch["attention_mask"],
                                   return_dict=True).logits

    top2 = logits.topk(2, dim=-1)
    margin = top2.values[..., 0] - top2.values[..., 1]
    # Padding positions carry no information and must not influence the score.
    is_entity = (top2.indices[..., 0] != 0) & batch["attention_mask"].bool()
    entity_count = is_entity.sum(dim=1).clamp(min=1)
    sentence_scores = (margin * is_entity).sum(dim=1) / entity_count
    results.extend(sentence_scores.cpu().tolist())


  0%|          | 0/107 [00:00<?, ?it/s]

In [None]:
# Keep the half of the WNUT training sentences with the highest confidence score.
idx = np.argsort(results)[::-1][:len(results) // 2]
# Dataset.select avoids materialising the whole dataset as a NumPy object array.
wnut_train_dataloader = torch.utils.data.DataLoader(
    wnut["train"].select(idx.tolist()),
    batch_size=32,
    collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask', 'labels']))

In [None]:
#train_model(num_epochs, proxy_train_model, wnut_train_dataloader, model_name='UnStop50%/')

In [None]:
# Fit the CoNLL-vs-WNUT domain classifier on the sentence embeddings of both
# training sets (label 0 = CoNLL, label 1 = WNUT).
X = []
Y = []
conll_X = []

conll_train_dataloader = torch.utils.data.DataLoader(conll["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask']))
wnut_train_dataloader = torch.utils.data.DataLoader(wnut["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'token_type_ids', 'attention_mask']))

print(len(conll_train_dataloader))

for batch in tqdm(conll_train_dataloader):
    # One forward pass per batch; the result is shared by both lists.
    conll_embeddings = get_sentence_embeddings(model, batch)
    X.append(conll_embeddings)
    conll_X.append(conll_embeddings)
    Y.extend([0] * len(batch["input_ids"]))
    
for batch in tqdm(wnut_train_dataloader):
    X.append(get_sentence_embeddings(model, batch))
    Y.extend([1] * len(batch["input_ids"]))

X = np.concatenate(X)
Y = np.array(Y)

idx = np.random.permutation(range(len(Y)))

X = X[idx]
Y = Y[idx]

n = 5
splitted_X = np.array_split(X, n)
splitted_Y = np.array_split(Y, n)

conll_X = np.concatenate(conll_X)
preds = np.zeros(len(conll_X))

dataset_classifier = sklearn.linear_model.LogisticRegression(max_iter=1000)
dataset_classifier.fit(X, Y)

In [None]:
!wget https://ysda-seminars.s3.eu-central-1.amazonaws.com/reddit_sample.json
reddit = datasets.load_dataset('json', data_files='reddit_sample.json')

In [None]:
def encode(example):
    """Tokenize a batch of Reddit rows for the BERT encoder.

    Each entry of ``example['words']`` is joined with spaces (an empty or
    missing entry becomes the empty string) and tokenized with truncation.
    ``token_type_ids`` is then overwritten with all ones, one per input id.
    """
    texts = [' '.join(words or ['']) for words in example['words']]
    out = tokenizer(texts, truncation='longest_first')
    # NOTE(review): every segment id is set to 1 here - confirm this is intended
    # and matches the segment ids used when embedding the other datasets.
    out['token_type_ids'] = [[1] * len(ids) for ids in out['input_ids']]

    return out

reddit = reddit.map(lambda example: encode(example), batched=True)

  0%|          | 0/500 [00:00<?, ?ba/s]

In [None]:
reddit.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'], output_all_columns=True)

In [None]:
reddit_train_dataloader = torch.utils.data.DataLoader(reddit["train"], batch_size=32, collate_fn=PadSequence(['input_ids', 'attention_mask', 'token_type_ids']))

In [None]:
# results = []
# res = 0

# model.eval()
# for batch in tqdm(reddit_train_dataloader):
#     with torch.no_grad():
#         res = model.bert(input_ids=batch["input_ids"],
#                           token_type_ids=batch["token_type_ids"],
#                           attention_mask=batch["attention_mask"],
#                           return_dict=True)["last_hidden_state"].cpu().numpy()[:,0]
#     results.append(dataset_classifier.predict_proba(get_sentence_embeddings(model, batch)))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
dir = 'drive/MyDrive/saves'

In [None]:
results = torch.load(dir + '/res.pt')

In [None]:
results = np.array(results)
results = np.concatenate(results[:, :, 1])

In [None]:
idx_res = np.argsort(results)[::-1][:10**5]

In [None]:
"""Masked LM training"""

In [None]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
text = "This is a great [MASK]."

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

In [None]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

In [None]:
!wget https://ysda-seminars.s3.eu-central-1.amazonaws.com/reddit_sample.json
reddit = datasets.load_dataset('json', data_files='reddit_sample.json')

reddit['train'] = reddit['train'].select(idx_res)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of Reddit rows for masked-language-model training.

    Args:
        examples: batch with a ``"words"`` column (one list of words per row).

    Returns:
        The tokenizer output; with a fast tokenizer it also carries
        ``"word_ids"`` (word index of every token), which whole-word masking
        needs.
    """
    # Rows may come with an empty or missing word list (the `encode` helper of
    # this notebook guards against the same case); treat them as empty text
    # instead of failing inside " ".join.
    result = tokenizer([" ".join(text or [""]) for text in examples["words"]])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = reddit.map(
    tokenize_function, batched=True, remove_columns=["words"]
)
tokenized_datasets

In [None]:
tokenizer.model_max_length

In [None]:
chunk_size = 128

In [None]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

In [None]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

In [None]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized texts and cut it into equal chunks.

    Args:
        examples: batch mapping each column name to a list of per-example lists.
        block_size: chunk length; defaults to the notebook-level ``chunk_size``.

    Returns:
        dict with the same columns, each a list of chunks of exactly
        ``block_size`` items (a shorter remainder at the end is dropped), plus
        a ``"labels"`` column that is a copy of ``"input_ids"``.
    """
    from itertools import chain

    if block_size is None:
        block_size = chunk_size
    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(lists, []) copies the growing list for every example.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

In [None]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Mask whole words at random and collate the features into a batch.

    For every feature the ``"word_ids"`` entry is removed and used to group
    token positions by word (positions whose word id is ``None`` belong to no
    group). Each group is selected with probability ``wwm_probability``; the
    tokens of a selected group are replaced in place by the mask token, and
    their original ids become the labels. All other label positions are set
    to -100. The features are modified in place and then passed to
    ``tf_default_data_collator``.
    """
    for feature in features:
        word_ids = feature.pop("word_ids")

        # word_groups[k] lists the token positions of the k-th word.
        word_groups = []
        previous_word = None
        for position, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            if word_id != previous_word:
                previous_word = word_id
                word_groups.append([])
            word_groups[-1].append(position)

        # One Bernoulli draw per word decides whether it is masked.
        selected = np.random.binomial(1, wwm_probability, (len(word_groups),))
        token_ids = feature["input_ids"]
        original_labels = feature["labels"]
        masked_labels = [-100] * len(original_labels)
        for group_index in np.where(selected)[0]:
            for position in word_groups[group_index.item()]:
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        feature["labels"] = masked_labels

    return tf_default_data_collator(features)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [None]:
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
for i in tf_train_dataset:
    break
print(len(i[1]))

In [None]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset)

In [None]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
# `model` is the masked-LM model at this point; `model.bert.paramerers.classifier`
# is not a valid attribute path and always raised AttributeError.
# NOTE(review): printing the layer overview is assumed to be what was wanted here.
model.summary()

In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

In [None]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
results