In [1]:
import argparse
import constant as config
import torch
from util.dataset import read_dataset, sampling_dataset, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from trainer import Trainer
import random
import torch
import numpy as np
# from util.augment import *

In [2]:
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

In [3]:
!nvidia-smi

Wed Jul 12 10:30:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| 49%   41C    P8    38W / 370W |    318MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [4]:
dataset = list(zip(['one', 'two', 'three'], [1, 2 , 3]))
print(dataset.pop(0))

('one', 1)


In [5]:
import pandas as pd

set_seed(42)
fold = 10
# percentage = "10_percent"
# train_path = f'../spin-off/dataset/fold_{fold}/10-fold_original_{percentage}/algocite_utilize_dataset_train_fold_{fold}.csv'
# test_path = f'../spin-off/dataset/fold_{fold}/10-fold_original_{percentage}/algocite_utilize_dataset_test_fold_{fold}.csv'
# val_path = f'../spin-off/dataset/fold_{fold}/10-fold_original_{percentage}/algocite_utilize_dataset_val_fold_{fold}.csv'
train_path = f'../dataset/10-fold_labeled_increasing_relabel_ver3/algocite_utilize_dataset_train_fold_{fold}.csv'
test_path = f'../dataset/10-fold_labeled_increasing_relabel_ver3/algocite_utilize_dataset_test_fold_{fold}.csv'
val_path = f'../dataset/10-fold_labeled_increasing_relabel_ver3/algocite_utilize_dataset_val_fold_{fold}.csv'
unlabel_path = f'../spin-off/algocitecontexts_unlabeled_10000_random_new1.csv'

# Read dataset
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_val = pd.read_csv(val_path)
df_unlabeled = pd.read_csv(unlabel_path)

# df_unlabeled.dropna(inplace=True)

train_texts, train_labels = df_train['CITATIONS_CONTEXTS'].values, df_train['USAGE_LABELS'].values
test_texts, test_labels = df_test['CITATIONS_CONTEXTS'].values, df_test['USAGE_LABELS'].values
val_texts, val_labels = df_val['CITATIONS_CONTEXTS'].values, df_val['USAGE_LABELS'].values
unlabeled_texts, unlabeled_labels = df_unlabeled['CONTENTS'].values, df_unlabeled['LABELS'].values

labeled_data = list(zip(train_texts, train_labels))
unlabeled_data = list(zip(unlabeled_texts, unlabeled_labels))
dev_data = list(zip(val_texts, val_labels))

print('labeled num {}, unlabeled num {}, valid num {}'.format(len(labeled_data), len(unlabeled_data), len(dev_data)))

labeled num 7927, unlabeled num 10000, valid num 986


In [6]:
# Tokenizing 
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', do_lower_case=True)

# encoding = { 
#     'NOTUTILIZE': 0,
#     'UTILIZE': 1,
# }

encoding = { 
    'EXTEND': 0,
    'MENTION': 1,
    'NOTALGO': 2,
    'USE': 3
}

unlabeled_encoding = {
    'UNK_UNK': 0
}

labeled_texts = [data[0] for data in labeled_data]
labeled_labels = [data[1] for data in labeled_data]
labeled_labels = [encoding[key] for key in labeled_labels]
labeled_encodings = tokenizer(labeled_texts, 
                              add_special_tokens=True, 
                              max_length=256,
                              truncation=True, 
                              padding='max_length',
                              return_attention_mask=True
                             )
labeled_dataset = Dataset(labeled_encodings, labeled_labels)

dev_texts = [data[0] for data in dev_data]
dev_labels = [data[1] for data in dev_data]
dev_labels = [encoding[key] for key in dev_labels]
dev_encodings = tokenizer(dev_texts, 
                          add_special_tokens=True, 
                          max_length=256,
                          truncation=True, 
                          padding='max_length',
                          return_attention_mask=True
                         )
dev_dataset = Dataset(dev_encodings, dev_labels)

test_texts = list(test_texts)
test_labels = [encoding[key] for key in test_labels]
test_encodings = tokenizer(test_texts, 
                           add_special_tokens=True, 
                           max_length=256,
                           truncation=True, 
                           padding='max_length',
                           return_attention_mask=True
                          )
test_dataset = Dataset(test_encodings, test_labels)

# We keep the label of unlabeled data to track for accuracy of pseudo-labeling
unlabeled_texts = [data[0] for data in unlabeled_data]
unlabeled_labels = [data[1] for data in unlabeled_data]
unlabeled_labels = [unlabeled_encoding[key] for key in unlabeled_labels]
unlabeled_encodings = tokenizer(unlabeled_texts, 
                                add_special_tokens=True, 
                                max_length=256,
                                truncation=True, 
                                padding='max_length',
                                return_attention_mask=True
                               )
unlabeled_dataset = Dataset(unlabeled_encodings, unlabeled_labels)

In [7]:
# for i in labeled_dataset:
#     print(i['labels'])
#     break

label_int = [data['labels'].item() for data in labeled_dataset]
print(label_int)

[3, 1, 1, 1, 1, 1, 3, 1, 0, 1, 1, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 2, 3, 3, 3, 2, 3, 1, 1, 1, 2, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 2, 3, 2, 3, 1, 0, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 0, 1, 3, 3, 2, 1, 1, 2, 3, 3, 2, 1, 3, 1, 1, 3, 1, 1, 1, 1, 2, 0, 3, 3, 3, 1, 3, 1, 3, 3, 2, 1, 3, 1, 1, 1, 1, 1, 0, 3, 1, 3, 1, 3, 2, 3, 2, 3, 2, 1, 1, 3, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 3, 2, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 2, 3, 1, 2, 0, 3, 3, 1, 1, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 1, 0, 3, 3, 3, 1, 1, 1, 3, 1, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 1, 0, 1, 3, 1, 1, 0, 3, 3, 1, 2, 2, 1, 1, 3, 1, 3, 1, 1, 3, 1, 3, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 2, 1, 3, 0, 2, 1, 3, 2, 1, 3, 1, 3, 3, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 3, 3, 0, 1, 1, 3, 1, 3, 1, 3, 1, 3, 3, 1, 3, 1, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 1, 1, 0, 3, 3, 

In [8]:
from collections import Counter
import numpy as np

count = Counter(label_int)
class_count = np.array([count.get(idx) for idx in range(len(encoding))])
weight = 1./class_count
print(weight)
samples_weight = np.array([weight[t] for t in label_int])
samples_weight = torch.from_numpy(samples_weight)
print(samples_weight)

[0.00210526 0.00022774 0.00068871 0.0006215 ]
tensor([0.0006, 0.0002, 0.0002,  ..., 0.0006, 0.0006, 0.0006],
       dtype=torch.float64)


In [9]:
# Build model 
model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 
                                                      num_labels=config.class_num, 
                                                      output_attentions=False, 
                                                      output_hidden_states=False)

# Criterion & optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8) #or AdamW
# total_steps = len(labeled_dataset) * epochs

# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps=0,
#                                             num_training_steps=total_steps
#             )

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [10]:
# Init Trainer
save_path = './test/'
trainer = Trainer(config, model, optimizer, save_path, dev_dataset, test_dataset)

# Initial training (supervised leraning)
trainer.initial_train(labeled_dataset)

initial train module
train_epoch 0
Training Loss per 1000 steps: 1.4241783618927002
Training Accuracy per 1000 steps: 6.25
Training Loss Epoch: 0.6912925983028065
Training Accuracy Epoch: 74.32824523779487
Validation Loss per 1000 steps: 4.138816833496094
Validation Accuracy per 1000 steps: 40.625
Validation Loss Epoch: 5.1617597382395495
Validation Accuracy Epoch: 20.28397565922921
Test Loss per 1000 steps: 4.212571144104004
Test Accuracy per 1000 steps: 40.625
Test Loss Epoch: 5.718214758804867
Test Accuracy Epoch: 10.97424412094065
train_epoch 1
Training Loss per 1000 steps: 4.918692588806152
Training Accuracy per 1000 steps: 28.125
Training Loss Epoch: 0.7245903590453728
Training Accuracy Epoch: 75.21130314116311
Validation Loss per 1000 steps: 3.3611509799957275
Validation Accuracy per 1000 steps: 40.625
Validation Loss Epoch: 4.138680005986845
Validation Accuracy Epoch: 24.746450304259636
Validation loss decreased (inf --> 4.138680).  
train_epoch 2
Training Loss per 1000 steps: 

In [11]:
# load checkpoint 
checkpoint_path = trainer.sup_path +'/checkpoint.pt'
checkpoint = torch.load(checkpoint_path)

del model, optimizer, trainer.model, trainer.optimizer
model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 
                                                      num_labels=config.class_num, 
                                                      output_attentions=False, 
                                                      output_hidden_states=False).to(config.device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

trainer.model = model
trainer.optimizer = optimizer

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [12]:
trainer.test_loader

<torch.utils.data.dataloader.DataLoader at 0x7fcdb184c220>

In [13]:
# from evaluator import Evaluator

# # eval supervised trained model 
# Evaluator.evaluate(model, test_loader, is_test=True)

In [14]:
# self-training
trainer.self_train(labeled_dataset, unlabeled_dataset)

psudo-label 0/0


2023-07-12 10:34:40.115733: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-12 10:34:40.231263: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-12 10:34:40.687308: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-07-12 10:34:40.687353: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

labeled 7927 unlabeled 10000
pseudo-labeled 0
After updated -> labeled 7927 unlabeled 10000
outer_epoch 0 inner_epoch 0
train_epoch 0
Training Loss per 1000 steps: 4.018381595611572
Training Accuracy per 1000 steps: 28.125
Training Loss Epoch: 0.6426773996152464
Training Accuracy Epoch: 76.69988646398386
Validation Loss per 1000 steps: 3.9169087409973145
Validation Accuracy per 1000 steps: 40.625
Validation Loss Epoch: 4.256488610539706
Validation Accuracy Epoch: 28.093306288032455
Test Loss per 1000 steps: 4.029982566833496
Test Accuracy per 1000 steps: 37.5
Test Loss Epoch: 4.670109042099544
Test Accuracy Epoch: 21.16461366181411
outer_epoch 0 inner_epoch 1
train_epoch 1
Training Loss per 1000 steps: 4.780878067016602
Training Accuracy per 1000 steps: 28.125
Training Loss Epoch: 0.5081275421712969
Training Accuracy Epoch: 83.5498927715403
Validation Loss per 1000 steps: 3.441494941711426
Validation Accuracy per 1000 steps: 43.75
Validation Loss Epoch: 4.096371051044233
Validation Acc

In [15]:
# eval semi-supervised trained model 
checkpoint_path = trainer.ssl_path +'/checkpoint.pt'
checkpoint = torch.load(checkpoint_path)

del model, optimizer, trainer.model, trainer.optimizer
model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 
                                                      num_labels=config.class_num, 
                                                      output_attentions=False, 
                                                      output_hidden_states=False).to(config.device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

trainer.model = model
trainer.optimizer = optimizer

# trainer.evaluator.evaluate(trainer.model, trainer.test_loader, is_test=True)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [16]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        ids = batch['input_ids'].to(config.device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(config.device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(config.device, dtype=torch.long)
        targets = batch['labels'].to(config.device, dtype=torch.long)

        # Compute logits
        with torch.no_grad():
            outputs = model(ids, attention_mask)
            logits = outputs.logits
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [17]:
test_dataloader = DataLoader(test_dataset, **config.test_params)
probs = bert_predict(model, test_dataloader)

In [18]:
# import pickle

# fileObj = open(f'../results/self-training_finetuning_scibert_original_costsensitive_probs_fold_{fold}.obj', 'wb')
# pickle.dump(probs,fileObj)
# fileObj.close()

In [19]:
import numpy as np

# preds = np.where(probs[:, 1] > 0.5, 1, 0)
preds = np.argmax(probs, axis=1)

In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

usage_class_names = ['MENTION', 'NOTALGO', 'USE', 'EXTEND']
utilize_class_names = ['NOTUTILIZE', 'UTILIZE']

y_test_encoded = test_labels
accuracy = accuracy_score(y_test_encoded, preds)
precision = precision_score(y_test_encoded, preds, average='macro')
recall = recall_score(y_test_encoded, preds, average='macro')
f1 = f1_score(y_test_encoded, preds, average='macro')

print("Accuracy: %.5f%%" % (accuracy*100))
print("Precision Score: %.5f" % (precision*100))
print("Recall Score: %.5f" % (recall*100))
print("F1 Score: %.5f" % (f1*100))

print(classification_report(y_test_encoded, preds, target_names=usage_class_names, digits=3))

Accuracy: 44.90482%
Precision Score: 52.24900
Recall Score: 59.00267
F1 Score: 42.52735
              precision    recall  f1-score   support

     MENTION      0.266     0.791     0.398        67
     NOTALGO      0.867     0.380     0.528       550
         USE      0.725     0.281     0.405       178
      EXTEND      0.232     0.908     0.369        98

    accuracy                          0.449       893
   macro avg      0.522     0.590     0.425       893
weighted avg      0.724     0.449     0.477       893



In [21]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def evaluate_roc(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    
    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [22]:
import matplotlib.pyplot as plt

# evaluate_roc(probs, y_test_encoded)

In [23]:
inv_encoding = {v: k for k, v in encoding.items()}
pred_label = [inv_encoding[key] for key in preds]

In [24]:
df_test_context = df_test.iloc[:, :3].copy()
df_test_labels = df_test[['USAGE_LABELS']].copy()
df_test_new = pd.concat([df_test_context, df_test_labels], axis=1)
df_test_new

Unnamed: 0,contextID,citationID,CITATIONS_CONTEXTS,USAGE_LABELS
0,202719,213441,ploitation of dependencies among facts. We pla...,MENTION
1,429629,460179,"ic context free language [24], and all such la...",MENTION
2,507299,541772,ch that routing delay is minimized. 5 Implemen...,USE
3,777941,830816,ch defeat applicable link layer compression me...,MENTION
4,834166,891644,ative probability of non-default 5 curve. In o...,USE
...,...,...,...,...
888,106711021,98946060,ommitment is distinguished from jobssatisfacti...,NOTALGO
889,147628218,129938559,or countries thatshttp://www.fin.ee/index.php?...,NOTALGO
890,43035149,41462937,x an integer r ? 2 and let A = e ?i 2r . The q...,NOTALGO
891,101745998,95273024,ate the transcriptional effects of liganded NR...,NOTALGO


In [25]:
df_test_new['PRED_LABELS'] = pred_label

In [26]:
df_test_new

Unnamed: 0,contextID,citationID,CITATIONS_CONTEXTS,USAGE_LABELS,PRED_LABELS
0,202719,213441,ploitation of dependencies among facts. We pla...,MENTION,USE
1,429629,460179,"ic context free language [24], and all such la...",MENTION,MENTION
2,507299,541772,ch that routing delay is minimized. 5 Implemen...,USE,USE
3,777941,830816,ch defeat applicable link layer compression me...,MENTION,USE
4,834166,891644,ative probability of non-default 5 curve. In o...,USE,USE
...,...,...,...,...,...
888,106711021,98946060,ommitment is distinguished from jobssatisfacti...,NOTALGO,NOTALGO
889,147628218,129938559,or countries thatshttp://www.fin.ee/index.php?...,NOTALGO,NOTALGO
890,43035149,41462937,x an integer r ? 2 and let A = e ?i 2r . The q...,NOTALGO,USE
891,101745998,95273024,ate the transcriptional effects of liganded NR...,NOTALGO,NOTALGO


In [27]:
df_test_new.to_csv(f'../Self-Training_Fine-Tuning_SciBERT_USAGE_new_Fold_{fold}.csv', index=False)

In [28]:
# new_dataset = {label:[] for label in range(2)}
# print(new_dataset)

In [29]:
# def pseudo_labeling(unlabeled_dataset, confidence_threshold):
#     unlabeled_loader = DataLoader(unlabeled_dataset, **config.unlabeled_params)
#     model.eval()
#     new_dataset = {label:[] for label in range(2)}

#     with torch.no_grad():
#         for _, batch in enumerate(unlabeled_loader):
#             ids = batch['input_ids'].to(config.device, dtype=torch.long)
#             attention_mask = batch['attention_mask'].to(config.device, dtype=torch.long)
#             token_type_ids = batch['token_type_ids'].to(config.device, dtype=torch.long)
#             targets = batch['labels'].to(config.device, dtype=torch.long)

#             outputs = model(ids, attention_mask, token_type_ids, labels=targets)
#             loss, logits = outputs[0], outputs[1]
#             confidences = torch.softmax(logits, dim=-1)
#             big_val, big_idx = torch.max(confidences.data, dim=-1)

#             for text_id, label, conf_val, target in zip(ids, big_idx, big_val, targets):
#                 pred_label, conf_val, target = label.item(), conf_val.item(), target.item()
#                 if conf_val >= confidence_threshold:
#                     new_dataset[pred_label].append((text_id, pred_label, target))

#     num_of_min_dataset = 987654321
#     for label, dataset in new_dataset.items():
#         if num_of_min_dataset > len(dataset):
#             num_of_min_dataset = len(dataset)

#     for label in new_dataset.keys():
#         new_dataset[label] = new_dataset[label][:num_of_min_dataset]

#     total, correct = 0, 0
#     balanced_dataset = []
#     for label in new_dataset.keys():
#         balanced_dataset.extend(new_dataset[label][:num_of_min_dataset])

#     for data in balanced_dataset:
#         text_id, pred_label, target = data[0], data[1], data[2]
#         if pred_label == target:
#             correct+=1
#         total+=1

#     print('pseduo-label {}/{}'.format(correct, total))
#     return balanced_dataset

In [30]:
# new_dataset = pseudo_labeling(unlabeled_dataset, confidence_threshold=0.9)

In [31]:
df_train

Unnamed: 0,contextID,citationID,CITATIONS_CONTEXTS,UTILIZE_LABELS,CUE_WORDS,CUE_HUMAN,CUE_SUBJECT,CUE_QUANTITY,CUE_FREQUENCY,CUE_TENSE,...,CUE_ALGOCLASS,CUE_USE,CUE_EXTEND,CUE_PROPOSE,CUE_EXPLAIN,CUE_ALGOKEY,CUE_DOCELKEY,CUE_PREP,USAGE_LABELS,CUE_WORDS_2
0,73793,73896,"ic is a function of, we need to monitor its si...",UTILIZE,fig. of procedure in describe algo pseudocode ...,we,algorithm we,,,,...,,,,describe,describe shown,pseudocode algorithm procedure,pseudocode algorithm procedure fig.,in of,USE,shown describe
1,81011,81256,ty restraints were included for bases that wer...,NOTUTILIZE,for at of procedure extend us in structure alg...,,algorithm,,,,...,calculation,using,extended,,,algorithm procedure,algorithm procedure,for in of,MENTION,extended using
2,124172,125234,ontradictory. Figure 19 shows an error that ar...,NOTUTILIZE,figure show in presented algo into routine pre...,,figure algorithm,,,,...,,,,presented,presented shows,algorithm,figure algorithm,on in into,MENTION,shows presented
3,152509,156833,each algorithm. Future work is to formalize an...,NOTUTILIZE,for of in us based on on following use develop...,we,algorithm we,,,,...,,,,,,algorithm,algorithm,on of,MENTION,
4,165264,171552,of batch processing. When the actual time seri...,NOTUTILIZE,for proposed of measure in us use algo between...,,algorithm,,,,...,,,,proposed,,algorithm,algorithm,between for of when in,MENTION,proposed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7922,201652323,239174775,algorithms/parameters which is then followed b...,UTILIZE,of scheme use our differ techniques algo we sc...,we our,paper we our,,,,...,analyses,use,,present,present,,,for in of,USE,use present
7923,201650768,239172222,"f components, the communication scheme for the...",UTILIZE,for of in propose algo system proposes scheme ...,we,algorithm we,,,,...,,,,proposes,,algorithm,algorithm,for in of,USE,proposes
7924,201647360,239167273,al nonconvex problem (14) is solvable. It is e...,UTILIZE,at in problem develop algo method on us using ...,,algorithm,,,,...,problem,using,,,,algorithm,algorithm,,USE,using
7925,201633780,239146636,ed and used the Gonzalez-Perez et al. (2012) b...,UTILIZE,for performance of in us on use our algo compa...,we our,algorithm we our,,,,...,,,,,,algorithm,algorithm,for of,USE,


In [32]:
df_unlabeled

Unnamed: 0,contextID,citationID,CONTENTS,LABELS
0,81092137,83882792,regret minimization algorithms. .1.1 RELATED ...,UNK_UNK
1,29076091,28364380,"f the mean curvature flow, for which one subtr...",UNK_UNK
2,5255081,5080973,"density 3. In [7, 35], the authors proved lin...",UNK_UNK
3,40181031,41787608,terationsandavoidinghalosofunusedspacearound m...,UNK_UNK
4,87451901,91615254,by Buchin et al. [8]. Despite extensive resea...,UNK_UNK
...,...,...,...,...
9995,97580257,104838051,"ise variance, resulting in better weight estim...",UNK_UNK
9996,82013109,84967007,nalysis software is limited to multi-body syst...,UNK_UNK
9997,196206171,231257845,r that cansbe used to identify uniquely the re...,UNK_UNK
9998,191997292,225056136,"the Genetic Algorithm[2],sParticle Swarm Algor...",UNK_UNK


In [33]:
df_check = pd.concat([df_test[['contextID', 'citationID']].copy(), df_unlabeled[['contextID', 'citationID']].copy()], axis=0)
df_check

Unnamed: 0,contextID,citationID
0,202719,213441
1,429629,460179
2,507299,541772
3,777941,830816
4,834166,891644
...,...,...
9995,97580257,104838051
9996,82013109,84967007
9997,196206171,231257845
9998,191997292,225056136


In [34]:
df_check.drop_duplicates()

Unnamed: 0,contextID,citationID
0,202719,213441
1,429629,460179
2,507299,541772
3,777941,830816
4,834166,891644
...,...,...
9995,97580257,104838051
9996,82013109,84967007
9997,196206171,231257845
9998,191997292,225056136
