In [None]:
!pip3 install torch==1.2.0+cpu torchvision==0.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip install pytorch-pretrained-bert

# prepare training data

In [None]:
import pandas as pd

# text column is the feature column, label column is the target column.
# The file is opened in a `with` block so the handle is closed once pandas has
# parsed it (the original passed an open() result that was never closed).
with open('train_data.csv', 'r') as train_csv:
    train_data_raw = pd.read_csv(train_csv)

# Keep only the rows where both the text and the label are present.
train_data_pd = train_data_raw[train_data_raw['text'].notnull()
                               & train_data_raw['label'].notnull()]
print("train data:", train_data_pd.shape)

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel as written.
# `truncated_doc_icx` is not assigned anywhere in the visible notebook, and
# `train_data` is only assigned further down (the TensorDataset built for training).
# Presumably a leftover from an experiment measuring the share of truncated
# documents — confirm, then either restore its inputs or delete the cell.
len(truncated_doc_icx)/len(train_data)

In [None]:
import pandas as pd

# Build the four-column frame (id, label, alpha, text) that is later written to TSV.
# 'alpha' is a constant filler column; 'id' is a running row number.
n_train_rows = train_data_pd.shape[0]
train_df = pd.DataFrame({
    'id': range(n_train_rows),
    'label': train_data_pd['label'],
    'alpha': ['a'] * n_train_rows,
    'text': list(train_data_pd['text']),
})
# Labels are stored as integers.
train_df['label'] = train_df['label'].astype(int)
train_df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the dev set; the fixed seed makes the split repeatable.
train_df_bert, dev_df_bert = train_test_split(
    train_df,
    test_size=0.3,
    random_state=42,
)
print(train_df_bert.shape, dev_df_bert.shape)

In [None]:
import os

# make sure columns are in right order, as they are hard coded in the process code.
# `cols` is defined here because this is its first use when the notebook runs top to bottom.
cols = ['id', 'label', 'alpha', 'text']

# Create the target directory first; to_csv does not create missing directories.
os.makedirs('./data_pytorch/', exist_ok=True)

train_df_bert[cols].to_csv('./data_pytorch/train.tsv', sep='\t', index=False, header=False)
dev_df_bert[cols].to_csv('./data_pytorch/dev.tsv', sep='\t', index=False, header=False)

# extract features

In [None]:
from ml.models.relevancy_geography.v3_0_0.bert_utils import BinaryClassificationProcessor

# Directory holding the TSV files read by the processor.
DATA_DIR = './data_pytorch/'

processor = BinaryClassificationProcessor()

# Training examples as returned by the processor for DATA_DIR.
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)
print('train_examples_len:', train_examples_len)

# Label vocabulary, e.g. [0, 1] for binary classification.
label_list = processor.get_labels()
num_labels = len(label_list)
print('num_labels:', num_labels)

# Map each label to its position in label_list.
label_map = dict(zip(label_list, range(num_labels)))

In [None]:
from pytorch_pretrained_bert import BertTokenizer

# Bert pre-trained model selected in the list: bert-base-uncased,
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-uncased'

# Load pre-trained model tokenizer (vocabulary).
# An uncased checkpoint has a lower-case vocabulary, so the tokenizer must lower-case
# its input; the flag is set explicitly so it agrees with BERT_MODEL.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

In [None]:
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
from ml.models.relevancy_geography.v3_0_0.bert_utils import convert_example_to_feature

MAX_SEQ_LENGTH = 512 # maximum for bert model
OUTPUT_MODE = 'classification'

# One argument tuple per example, in the order passed to convert_example_to_feature.
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE)
                                 for example in train_examples]

# Leave two cores free, but never ask for fewer than one worker:
# Pool() raises ValueError when the process count is below 1 (machines with <= 2 cores).
process_count = max(1, cpu_count() - 2)
print(f'Preparing to convert {train_examples_len} examples..')
print(f'Spawning {process_count} processes..')
with Pool(process_count) as p:
    # imap yields results in input order, so train_features lines up with train_examples.
    train_features = list(tqdm(p.imap(convert_example_to_feature, train_examples_for_processing),
                               total=train_examples_len)
                         )

In [None]:
import pickle

# Persist the converted training features so they can be reloaded without re-running the conversion.
train_features_path = DATA_DIR + "train_features.pkl"
with open(train_features_path, "wb") as features_file:
    pickle.dump(train_features, features_file)

In [None]:
import pickle

DATA_DIR = './data_pytorch/'

# Reload the pickled training features written by the cell above.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
train_features_path = DATA_DIR + "train_features.pkl"
with open(train_features_path, "rb") as features_file:
    train_features = pickle.load(features_file)

# train

## build model

In [None]:
from pytorch_pretrained_bert import BertForSequenceClassification

# Name of the pre-trained checkpoint to start from.
BERT_MODEL = 'bert-base-uncased'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = './cache/'

# Number of output classes of the classification head.
num_labels = 2

# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    cache_dir=CACHE_DIR,
    num_labels=num_labels,
)

In [None]:
import torch

# `device` must exist before the model can be moved; when the notebook runs top to
# bottom this is the first cell that needs it, so it is defined here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
import math

from pytorch_pretrained_bert.optimization import BertAdam

# optimizer
TRAIN_BATCH_SIZE = 24
NUM_TRAIN_EPOCHS = 1
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-5
WARMUP_PROPORTION = 0.1

train_examples_len = len(train_features)

# Total number of optimizer.step() calls the training loop will make.
# The training DataLoader is built without drop_last, so a final partial batch is still
# trained on: batches per epoch is the ceiling of examples / batch size. The loop then
# steps once for every GRADIENT_ACCUMULATION_STEPS batches within an epoch.
batches_per_epoch = math.ceil(train_examples_len / TRAIN_BATCH_SIZE)
num_train_optimization_steps = (batches_per_epoch // GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS
print('num_train_optimization_steps', num_train_optimization_steps)

# Two parameter groups: biases and LayerNorm parameters are excluded from weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)

In [None]:
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
import logging

logger = logging.getLogger()
logger.info("***** Running training *****")
for template, value in (
    ("  Num examples = %d", train_examples_len),
    ("  Batch size = %d", TRAIN_BATCH_SIZE),
    ("  Num steps = %d", num_train_optimization_steps),
):
    logger.info(template, value)


def _train_feature_tensor(attribute, dtype):
    """Collect one attribute from every training feature into a tensor of the given dtype."""
    return torch.tensor([getattr(feature, attribute) for feature in train_features], dtype=dtype)


all_input_ids = _train_feature_tensor('input_ids', torch.long)
all_input_mask = _train_feature_tensor('input_mask', torch.long)
all_segment_ids = _train_feature_tensor('segment_ids', torch.long)

# Labels are integer class ids for classification and floats for regression.
OUTPUT_MODE = 'classification'
if OUTPUT_MODE == "classification":
    all_label_ids = _train_feature_tensor('label_id', torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = _train_feature_tensor('label_id', torch.float)

# Batches are drawn in random order during training.
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [None]:
from tqdm import trange
from tqdm.auto import tqdm
from torch.nn import CrossEntropyLoss, MSELoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NUM_TRAIN_EPOCHS is deliberately not re-assigned here: the optimizer's step budget
# (t_total) was computed from the value set in the optimizer cell, and overriding it
# in this cell could silently put the loop and the schedule out of sync.

# The loss function is the same for every batch, so build it once and reject an
# unsupported mode before any training work is done.
if OUTPUT_MODE == "classification":
    loss_fct = CrossEntropyLoss()
elif OUTPUT_MODE == "regression":
    loss_fct = MSELoss()
else:
    raise ValueError(f"Unsupported OUTPUT_MODE: {OUTPUT_MODE!r}")

model.train()

global_step = 0
nb_tr_steps = 0
tr_loss = 0
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    # Running totals are reset at the start of every epoch.
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # With labels=None the model returns logits; the loss is computed here.
        logits = model(input_ids, segment_ids, input_mask, labels=None)

        if OUTPUT_MODE == "classification":
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        else:
            loss = loss_fct(logits.view(-1), label_ids.view(-1))

        # Scale so that gradients accumulated over several batches average rather than sum.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        batch_loss = loss.item()
        print("\r%f" % batch_loss, end='')

        tr_loss += batch_loss
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        # Update the weights once every GRADIENT_ACCUMULATION_STEPS batches.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

## save model

In [None]:
import os

model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

# The name of the task to train. Defined before OUTPUT_DIR, which is built from it.
TASK_NAME = 'task_pytorch'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'./outputs/{TASK_NAME}/'
# torch.save does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

CONFIG_NAME = "bert_config.json"
WEIGHTS_NAME = "pytorch_model.bin"

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)

In [None]:
from pytorch_pretrained_bert import BertTokenizer

# Bert pre-trained model selected in the list: bert-base-uncased,
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-uncased'

# Load pre-trained model tokenizer (vocabulary).
# The checkpoint is uncased, so the tokenizer is told to lower-case its input.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
# Write the vocabulary next to the saved model so evaluation can load it from OUTPUT_DIR.
tokenizer.save_vocabulary(OUTPUT_DIR)

In [None]:
# Pack bert_config.json and pytorch_model.bin from ./outputs/task_pytorch into
# ./cache/task_pytorch.tar.gz; the evaluation section loads the model from that archive.
# NOTE(review): the paths are hard-coded rather than derived from TASK_NAME / CACHE_DIR,
# and ./cache is assumed to exist already — confirm when changing either constant.
!tar -cvzf ./cache/task_pytorch.tar.gz -C ./outputs/task_pytorch bert_config.json pytorch_model.bin

# evaluation

In [None]:
from pytorch_pretrained_bert import BertTokenizer
from ml.models.relevancy_geography.v3_0_0.bert_utils import BinaryClassificationProcessor

# Load pre-trained model tokenizer (vocabulary).
# The saved vocabulary comes from the uncased checkpoint, so input must be lower-cased.
# When loading from a file path the library cannot infer this from a model name, so
# do_lower_case has to be set explicitly to match how the training features were tokenized.
OUTPUT_DIR = f'./outputs/{TASK_NAME}/'
tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=True)

processor = BinaryClassificationProcessor()
eval_examples = processor.get_dev_examples(DATA_DIR)

label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

eval_examples_len = len(eval_examples)
print('number of examples for evaluation:', eval_examples_len)

MAX_SEQ_LENGTH = 512
OUTPUT_MODE = 'classification'
# Map each label to its position in label_list.
label_map = {label: i for i, label in enumerate(label_list)}
# One argument tuple per example, in the order passed to convert_example_to_feature.
eval_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in eval_examples]

In [None]:
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
from ml.models.relevancy_geography.v3_0_0.bert_utils import convert_example_to_feature

# Leave one core free, but never ask for fewer than one worker:
# Pool() raises ValueError when the process count is below 1 (single-core machines).
process_count = max(1, cpu_count() - 1)

print(f'Preparing to convert {eval_examples_len} examples..')
print(f'Spawning {process_count} processes..')
with Pool(process_count) as p:
    # imap yields results in input order, so eval_features lines up with eval_examples.
    eval_features = list(tqdm(p.imap(convert_example_to_feature, eval_examples_for_processing),
                              total=eval_examples_len)
                         )

In [None]:
# Load the fine-tuned model (weights) from the archive stored under the cache directory.
CACHE_DIR = './cache/'
CURR_BERT_MODEL = 'task_pytorch.tar.gz'

model_archive_path = CACHE_DIR + CURR_BERT_MODEL
curr_model = BertForSequenceClassification.from_pretrained(
    model_archive_path,
    cache_dir=CACHE_DIR,
    num_labels=len(label_list),
)

In [None]:
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# Turn the per-example feature fields into tensors, one row per evaluation example.
eval_input_id_rows = [feature.input_ids for feature in eval_features]
eval_input_mask_rows = [feature.input_mask for feature in eval_features]
eval_segment_id_rows = [feature.segment_ids for feature in eval_features]
eval_label_values = [feature.label_id for feature in eval_features]

all_input_ids = torch.tensor(eval_input_id_rows, dtype=torch.long)
all_input_mask = torch.tensor(eval_input_mask_rows, dtype=torch.long)
all_segment_ids = torch.tensor(eval_segment_id_rows, dtype=torch.long)

# Labels are integer class ids for classification and floats for regression.
if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor(eval_label_values, dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor(eval_label_values, dtype=torch.float)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Run prediction for full data, in the original example order.
EVAL_BATCH_SIZE = 8
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(
    eval_data,
    sampler=eval_sampler,
    batch_size=EVAL_BATCH_SIZE,
)

In [None]:
from sklearn.metrics import matthews_corrcoef, confusion_matrix

def get_eval_report(task_name, labels, preds):
    """Build a binary-classification report for one task.

    Args:
        task_name: Identifier stored under the "task" key of the report.
        labels: True class ids (0 or 1), one per example.
        preds: Predicted class ids (0 or 1), one per example.

    Returns:
        dict with keys "task", "mcc" (Matthews correlation coefficient),
        "tp", "tn", "fp" and "fn" (confusion-matrix counts).
    """
    mcc = matthews_corrcoef(labels, preds)
    # Fixing the label set keeps the matrix 2x2 even when only one class occurs in
    # labels and preds; without it the matrix collapses to 1x1 and the unpack fails.
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    return {
        "task": task_name,
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }

def compute_metrics(task_name, labels, preds):
    """Check that labels and preds have the same length, then return the evaluation report.

    Raises AssertionError when the two lengths differ.
    """
    n_preds, n_labels = len(preds), len(labels)
    assert n_preds == n_labels
    report = get_eval_report(task_name, labels, preds)
    return report

In [None]:
import numpy as np
import torch
from torch.nn import CrossEntropyLoss, MSELoss

# Inputs are moved to `device` below, so the freshly loaded model has to live on the
# same device; otherwise the forward pass fails whenever a GPU is selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
curr_model.to(device)
curr_model.eval()

# The loss function is the same for every batch, so build it once and reject an
# unsupported mode before any evaluation work is done.
if OUTPUT_MODE == "classification":
    loss_fct = CrossEntropyLoss()
elif OUTPUT_MODE == "regression":
    loss_fct = MSELoss()
else:
    raise ValueError(f"Unsupported OUTPUT_MODE: {OUTPUT_MODE!r}")

eval_loss = 0
nb_eval_steps = 0
# Per-batch logits are collected here and concatenated once after the loop,
# instead of re-copying the growing array on every batch.
batch_logits = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        logits = curr_model(input_ids, segment_ids, input_mask, labels=None)

    # create eval loss and other metric required by the task
    if OUTPUT_MODE == "classification":
        tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
    else:
        tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    batch_logits.append(logits.detach().cpu().numpy())

if nb_eval_steps == 0:
    raise ValueError("eval_dataloader produced no batches; nothing to evaluate")

# Mean of the per-batch losses.
eval_loss = eval_loss / nb_eval_steps
preds = np.concatenate(batch_logits, axis=0)
if OUTPUT_MODE == "classification":
    # Predicted class is the index of the largest logit.
    preds = np.argmax(preds, axis=1)
elif OUTPUT_MODE == "regression":
    preds = np.squeeze(preds)

In [None]:
import os

result = compute_metrics(TASK_NAME, all_label_ids.numpy(), preds)

result['eval_loss'] = eval_loss

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'./reports/{TASK_NAME}_evaluation_report/'
# open(..., "w") does not create missing directories.
os.makedirs(REPORTS_DIR, exist_ok=True)
output_eval_file = os.path.join(REPORTS_DIR, "eval_results_add_sample_holdout.txt")
with open(output_eval_file, "w") as writer:
    logger.info("***** Eval results *****")
    # One "key = value" line per metric, both logged and written to the report file.
    for key, value in result.items():
        logger.info("  %s = %s", key, str(value))
        writer.write("%s = %s\n" % (key, str(value)))

In [None]:
# Import from the public `sklearn.metrics` namespace; `sklearn.metrics.classification`
# is a private module path that newer scikit-learn releases no longer provide.
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision, recall, F-score and support on the dev set.
precision_recall_fscore_support(all_label_ids.numpy(), preds)

## holdout data

In [None]:
import pandas as pd

holdout_data_raw = pd.read_csv('holdout_data.csv')

# Keep a row when `.str.len()` of its text is non-null and its label is non-null.
holdout_has_text = holdout_data_raw['text'].str.len().notnull()
holdout_has_label = holdout_data_raw['label'].notnull()
holdout_data_pd = holdout_data_raw[holdout_has_text & holdout_has_label]
print("holdout data:", holdout_data_pd.shape)

In [None]:
# Build the four-column frame (id, label, alpha, text) for the holdout set.
# 'alpha' is a constant filler column; 'id' is a running row number.
n_holdout_rows = holdout_data_pd.shape[0]
holdout_bert = pd.DataFrame({
    'id': range(n_holdout_rows),
    'label': holdout_data_pd['label'],
    'alpha': ['a'] * n_holdout_rows,
    'text': list(holdout_data_pd['text']),
})
# Labels are stored as integers.
holdout_bert['label'] = holdout_bert['label'].astype(int)
print(holdout_bert.shape)

In [None]:
# Column order of the TSV file: id, label, alpha, text.
cols = ['id', 'label', 'alpha', 'text']

# Tab-separated, without a header row and without the index column.
holdout_bert.loc[:, cols].to_csv(
    './data_pytorch/holdout.tsv',
    sep='\t',
    header=False,
    index=False,
)