<a href="https://colab.research.google.com/github/harry-at-cogwrite/describe_data/blob/main/172_evidence_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install environment

In [None]:
!pip install transformers
!pip install sentencepiece

Upload data to Colab

In [None]:
# After running this cell (Shift+Enter), click "Choose Files" and select 'tr.csv'.
# The return value of files.upload() is kept in `uploaded`, but it is not
# referenced again in the cells shown here.
from google.colab import files
uploaded = files.upload()

Read data

In [None]:
import pandas as pd
# Load the uploaded CSV; later cells read its `irsen_text` and `evid_s` columns.
df = pd.read_csv("tr.csv")

In [None]:
# The first 7000 rows of the CSV form the training pool.
texts = df.irsen_text.values[:7000].tolist()
labels = df.evid_s.values[:7000].tolist()
# The row at position 4487 carries a wrong label, so it is dropped from both
# lists (6999 examples remain when the file has at least 7000 rows).
texts = [text for position, text in enumerate(texts) if position != 4487]
labels = [label for position, label in enumerate(labels) if position != 4487]
len(texts)

Using GPU

In [None]:
import numpy as np
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Prepare data

In [None]:
# Split the training pool into a training part and a validation part.
# test_size=0.1 holds out 10% for validation; random_state=2 makes the split repeatable.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score
from sklearn.metrics import confusion_matrix
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, random_state=2, test_size=0.1)

In [None]:
# Load the pretrained XLNet tokenizer ("xlnet-base-cased") and encode both splits.
# padding=True and truncation=True are passed without an explicit max_length.
# NOTE(review): BertTokenizerFast and RobertaTokenizer are imported but not used
# in the cells shown here.
from transformers import BertTokenizerFast,RobertaTokenizer
from transformers import XLNetTokenizer, XLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
# Torch dataset that pairs tokenizer encodings with their labels.
class bertDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and a parallel label sequence.

    `encodings` is a mapping from field name to a per-example sequence of values;
    `labels` holds one label per example. Each item is a dict of tensors with one
    entry per encoding field plus a final 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels; these two datasets are later
# passed to the Trainer as train_dataset / eval_dataset.
train_dataset = bertDataset(train_encodings, train_labels)
val_dataset = bertDataset(val_encodings, val_labels)

Prepare model

In [None]:
# Metric callback handed to the Trainer; its values are reported during training.
def compute_metrics(p):
    """Score one evaluation pass.

    `p` unpacks into (prediction scores, true labels). The predicted class of
    each row is the argmax of its scores along axis 1.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers.items()}

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch import nn

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # where run outputs are written
    output_dir='./results',
    # optimisation schedule
    learning_rate=2e-05,
    weight_decay=0.01,
    warmup_steps=400,
    num_train_epochs=5,
    # per-device batch sizes for training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    # logging / evaluation cadence and final model selection
    logging_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

# Pretrained XLNet sequence-classification model, moved to the selected device.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased").to(device)

# Early stopping is not enabled: no callbacks are passed to the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Predict 

In [None]:
model.eval()  # put the model in evaluation mode before predicting
import torch.nn.functional as F


def predict(content):
    """Classify one sentence with the fine-tuned model.

    Parameters
    ----------
    content : str
        The sentence to classify; it is tokenized with the notebook's `tokenizer`.

    Returns
    -------
    tuple
        ``(probability, label)``: the softmax score of class index 1 and the
        argmax class index, both read from the first row of the model output.
    """
    inputs = tokenizer(content,
                       padding='max_length',
                       truncation=True, return_tensors="pt")
    # Move the encoded inputs to the device selected for the model.
    ids = inputs["input_ids"].to(device)
    idt = inputs["token_type_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: disable autograd so no computation graph is built
    # (and kept in memory) for every predicted sentence.
    with torch.no_grad():
        outputs = model(ids, token_type_ids=idt, attention_mask=mask)

    logits = outputs[0]
    probabilities = F.softmax(logits, dim=-1)
    # One row of class scores per input; pick the highest-scoring class.
    predicted = torch.argmax(logits.view(-1, model.num_labels), dim=1)
    return probabilities.cpu().detach().numpy()[0][1], predicted.cpu().numpy()[0]

Prepare test data

In [None]:
# Every row from position 7000 onward is used as test data.
sentences = df.irsen_text.values[7000:].tolist()
real = df.evid_s.values[7000:].tolist()
len(sentences)

In [None]:
pre = []      # predicted label for each test sentence
pre_pro = []  # predicted probability (first value returned by predict) for each test sentence
for sentence in sentences:
    probability, label = predict(sentence)
    pre.append(label)
    pre_pro.append(probability)

Show result

In [None]:
# Per-class F1 / recall / precision (average=None) and overall accuracy on the test data.
print('\n'.join([
    'f1:' + str(f1_score(real, pre, average=None)),
    'recall:' + str(recall_score(real, pre, average=None)),
    'precision:' + str(precision_score(real, pre, average=None)),
    'accuracy:' + str(accuracy_score(real, pre)),
]) + '\n')

In [None]:
# Confusion matrix of the true test labels (`real`) against the predictions (`pre`).
# NOTE(review): scikit-learn's convention is rows = true class, columns = predicted class — confirm.
confusion_matrix(real, pre)

Show probability graph

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Distribution of the predicted class-1 probability, split by true label.
# The arrays are built once and reused for both histograms.
hist_scores = np.array(pre_pro)
hist_truth = np.array(real)

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.hist(hist_scores[hist_truth == 1], color="darkred", bins="scott", alpha=.5,
        edgecolor="red", label="evidence (label 1)")
ax.hist(hist_scores[hist_truth == 0], color="darkgreen", bins="scott", alpha=.5,
        edgecolor="green", label="non-evidence (label 0)")
# Labels and a legend so the figure can be read on its own.
ax.set_xlabel("predicted probability of class 1")
ax.set_ylabel("number of test sentences")
ax.set_title("Predicted probability by true label")
ax.legend();

In [None]:
# How trustworthy are high scores? `pre_pro` holds the predicted probability of
# class 1 and `real` the true labels (1 = evidence, 0 = non-evidence).
# The arrays are converted once and split by true label.
test_scores = np.array(pre_pro)
test_truth = np.array(real)
evid_scores = test_scores[test_truth == 1]
nevid_scores = test_scores[test_truth == 0]

# Samples scoring >= 0.8, split by true label.
num80_nevid = np.sum(nevid_scores >= 0.8)
num80_evid = np.sum(evid_scores >= 0.8)
print('num of samples have score more than 0.8 but are non-evidence : '+ str(num80_nevid))
# Fixed label: this count is over real == 1, i.e. true evidence.
print('num of samples have score more than 0.8 are evidence : '+ str(num80_evid))
# Share of true evidence among all samples scoring >= 0.8.
confi80 = str(round(num80_evid/(num80_evid + num80_nevid),4))
print('we are '+ confi80 +' confident that the sentence is evidence if the score is higher than .8')

# Samples with 0.6 <= score < 0.8: count(>= 0.6) minus count(>= 0.8).
num6080_nevid = np.sum(nevid_scores >= 0.6) - num80_nevid
num6080_evid = np.sum(evid_scores >= 0.6) - num80_evid
print('num of samples have score more than 0.6 and less than 0.8 but are non-evidence :'+ str(num6080_nevid))
print('num of samples have score more than 0.6 and less than 0.8 are evidence :'+ str(num6080_evid))
confi6080 = num6080_evid/(num6080_evid + num6080_nevid)
print('we are '+ str(round(confi6080,4)) +' confident that the sentence is evidence if the score is higher than .6 and less than .8')

In [None]:
# How trustworthy are low scores? `pre_pro` holds the predicted probability of
# class 1 and `real` the true labels (1 = evidence, 0 = non-evidence).
test_scores = np.array(pre_pro)
test_truth = np.array(real)
evid_scores = test_scores[test_truth == 1]
nevid_scores = test_scores[test_truth == 0]

# Samples scoring <= 0.2, split by true label. The messages now describe the
# threshold and the classes that are actually counted.
num20_nevid = np.sum(nevid_scores <= 0.2)
num20_evid = np.sum(evid_scores <= 0.2)
print('num of samples have score less than 0.2 and are non-evidence : '+ str(num20_nevid))
print('num of samples have score less than 0.2 but are evidence : '+ str(num20_evid))
# Share of true non-evidence among all samples scoring <= 0.2.
confi20 = str(round(num20_nevid/(num20_nevid + num20_evid),4))
# Fixed: report confi20 (this cell's value), not confi80 from the high-score cell.
print('we are '+ confi20 +' confident that the sentence is non-evidence if the score is lower than .2')

# Samples with 0.2 < score <= 0.4: count(<= 0.4) minus count(<= 0.2).
num2040_nevid = np.sum(nevid_scores <= 0.4) - num20_nevid
num2040_evid = np.sum(evid_scores <= 0.4) - num20_evid
print('num of samples have score more than 0.2 and less than 0.4 but are non-evidence :'+ str(num2040_nevid))
print('num of samples have score more than 0.2 and less than 0.4 are evidence :'+ str(num2040_evid))
confi2040 = num2040_nevid/(num2040_evid + num2040_nevid)
print('we are '+ str(round(confi2040,4)) +' confident that the sentence is non-evidence if the score is higher than .2 and less than .4')

In [None]:
# List every test sentence whose prediction differs from its true label;
# the suffix appended to the sentence reflects the true label.
for sentence, truth, guess in zip(sentences, real, pre):
    if truth != guess:
        suffix = ' claim' if truth else ' noclaim'
        print(sentence + suffix)