In [2]:
import os
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

torch.cuda.set_device(1)                            #　指定gpu1
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

EPOCHS = 1
MAX_TOKEN_COUNT = 128
BATCH_SIZE = 32

In [3]:
from torch.cuda.amp import autocast as autocast
from torch.cuda.amp import GradScaler as GradScaler

In [4]:
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
# 设置随机数种子
setup_seed(20)

In [5]:
# train_df=pd.read_feather("./autodl-nas/major.feather")
test_df = pd.read_feather("Test.feather")


In [8]:
LABEL_COLUMNS_3 = test_df.columns[11:-3]

In [9]:
LABEL_COLUMNS_3

Index(['A41D', 'A62B', 'A41B', 'D06N', 'A42B', 'A43B', 'D06B', 'A41F', 'E03D',
       'A47K',
       ...
       'Y02D', 'F24V', 'H04T', 'G16B', 'G16C', 'G16Z', 'G21J', 'G16Y', 'G06J',
       'E99Z'],
      dtype='object', length=664)

In [10]:
class PatentDataset(Dataset):
        
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: BertTokenizer,
        max_token_len: int = 512,
        test= False
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.test = test
        
    
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        query = data_row.publication_title.lower()
        
        if not self.test:
            comment_text = data_row['major claim']
            label_3 = data_row[LABEL_COLUMNS_3]
#             label_2 = data_row[LABEL_COLUMNS_2]
#             label_1 = data_row[LABEL_COLUMNS_1]
            
            hard_negative = train_df.sample(1) 
            while str(hard_negative.cpc_sections) == str(data_row.cpc_sections):
                hard_negative = train_df.sample(1)
                
            hard_negative = hard_negative.iloc[0]['major claim']
            
            encoding = self.tokenizer(
                [query,comment_text,hard_negative],
                max_length=self.max_token_len,
                padding="max_length",
                truncation=True,
                add_special_tokens=True, # [CLS] & [SEP]
                return_token_type_ids=False,
                return_attention_mask=True, #attention_mask
                return_tensors='pt',
            )

            
        else:
            comment_text = data_row['major claim']
            label_3 = data_row[LABEL_COLUMNS_3]
            
            encoding = self.tokenizer.encode_plus(
                [query,comment_text],
                max_length=self.max_token_len,
                padding="max_length",
                truncation=True,
                add_special_tokens=True, # [CLS] & [SEP]
                return_token_type_ids=False,
                return_attention_mask=True, #attention_mask
                return_tensors='pt',
            )
        

        
        if not self.test:
            return dict(
#             comment_text=comment_text,
            input_ids = encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
#             label_1=torch.FloatTensor(label_1),
#             label_2=torch.FloatTensor(label_2),
            label_3=torch.FloatTensor(label_3),
                
        )
        else:
            return dict(
#                 comment_text=comment_text,
                input_ids = encoding["input_ids"],
                attention_mask=encoding["attention_mask"],
                label_3=torch.FloatTensor(label_3)
            )
        

In [8]:
train_dataset = PatentDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,drop_last = True)

In [9]:
Bert_model = BertModel.from_pretrained(BERT_MODEL_NAME,return_dict = True).to(device)
Tags_df = pd.read_feather("Tags.feather")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
class Tags_datasets(Dataset):
    
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: BertTokenizer,
        max_token_len: int = 512
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
    
    def __len__(self):
        return len(self.data)
    
        
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        labels = data_row.Label
        tags = data_row.Tags
        
        encoding = self.tokenizer.encode_plus(
            tags,
            max_length=self.max_token_len,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        
        return dict(
            index=index,
            label = labels,
            input_ids=encoding['input_ids'].flatten(),
            attention_mask = encoding['attention_mask'].flatten()
        )

In [11]:
Tags_dataset = Tags_datasets(data = Tags_df,tokenizer = tokenizer, max_token_len = MAX_TOKEN_COUNT)
Tags_dataloader = DataLoader(Tags_dataset, batch_size = 1)

In [12]:
tags_embeddings = {}

for step,batch in tqdm(enumerate(Tags_dataloader),total = len(Tags_dataloader)):
    label = batch['label'][0]
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    with torch.no_grad():
        output = Bert_model(input_ids,attention_mask = attention_mask)
    tag_embeddings = output.pooler_output.detach().cpu()
    tags_embeddings[label] = tag_embeddings



  0%|          | 0/674 [00:00<?, ?it/s]

In [13]:
tags_embedding = np.concatenate([tags_embeddings[key] for key in LABEL_COLUMNS_3],axis = 0)
tags_embedding = torch.tensor(tags_embedding).to(device)

In [11]:
class BertNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True) #load the pretrained bert model
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        outputs = outputs.last_hidden_state
        
        return outputs

In [15]:
class Tags_attention(nn.Module):
    def __init__(self,n_classes:int, labels_embeddings: torch.Tensor):
        super().__init__()
        self.criterion = nn.BCEWithLogitsLoss()
        self.Weight_1 = nn.Linear(n_classes,1)
        self.dropout = nn.Dropout(0.1)
        self.batch_labels_embeddings = labels_embeddings.expand(BATCH_SIZE,n_classes,768)
        self.batch_labels_embeddings = self.batch_labels_embeddings.transpose(1, 2) # batch_size * 768 * 664
        self.batch_labels_embeddings.requires_grad = False
        
    def forward(self, outputs):
        # label-wise attention
        att = torch.bmm(outputs,self.batch_labels_embeddings)
        att = F.softmax(att,dim=1)
        joint_labels_output = torch.bmm(outputs.transpose(1,2),att)
        joint_labels_output = F.relu(self.Weight_1(joint_labels_output).squeeze(2))

        return joint_labels_output

In [16]:
class ConLoss(nn.Module):
    
    def __init__(self, temperature = 0.1):
        super().__init__()
        self.temperature = temperature
        
    def forward(self, query, tags_positive,tags_negative):
        pos_sim = F.cosine_similarity(query.unsqueeze(1),tags_positive.unsqueeze(0),dim=2)
#         print(pos_sim)
        neg_sim = F.cosine_similarity(query.unsqueeze(1),tags_negative.unsqueeze(0),dim=2)
#         print(neg_sim)
        pos_exp_logits = torch.exp(pos_sim)
        neg_exp_logits = torch.exp(neg_sim)
        
        loss = torch.log(pos_exp_logits.sum(1,keepdim=True)+neg_exp_logits.sum(1,keepdim=True)) - pos_sim.diag().unsqueeze(1)
        
        loss = torch.mean(loss)
        
        return loss

In [17]:
BERT = BertNetwork().to(device)
tags_attention = Tags_attention(len(LABEL_COLUMNS_3),tags_embedding).to(device)
conloss = ConLoss().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:

N_EPOCHS = EPOCHS

steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 10
warmup_steps, total_training_steps

optimizer = AdamW([
                {'params': BERT.parameters()},
                {'params': tags_attention.parameters()}],
                lr=5e-4
            )

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps
)

In [19]:
# function to train the model
def train():
    
    now=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    best_valid_loss = float('inf')
    BERT.train()
    tags_attention.train()

    total_loss = 0 

    scaler = GradScaler()
    
    for step,batch in tqdm(enumerate(train_dataloader),total=len(train_dataloader),desc="Train"):
        
#         if step%5000 == 0 and step!=0:
#             valid_loss,_,_ = evaluate(val_dataloader)
#             if valid_loss < best_valid_loss:
#                 best_valid_loss = valid_loss
#                 torch.save(model.state_dict(), f"./model/Classfication_Baseline_model{now}.pt")

        input_ids = batch['input_ids'].transpose(0,1).to(device)
        attention_mask = batch['attention_mask'].transpose(0,1).to(device)

        query_input_ids = input_ids[0]
        positive_input_ids = input_ids[1]
        negative_input_ids = input_ids[2]

        query_attention_mask = attention_mask[0]
        positive_attention_mask = attention_mask[1]
        negative_attention_mask = attention_mask[2]


        with autocast():
            query = BERT(query_input_ids,query_attention_mask)[:,0]
            positive = BERT(positive_input_ids,positive_attention_mask)[:,0]
            negative = BERT(negative_input_ids,negative_attention_mask)[:,0]

            tags_positive = tags_attention(positive)
            tags_negative = tags_attention(negative) 

            loss = conloss(query,positive,negative)

        optimizer.zero_grad()
        if step%10 == 0 :
            print(f"step: {step} loss: {loss}")
        # add on to the total loss
        total_loss = total_loss + loss.float().item()
        
        # scale the loss 
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(BERT.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(tags_attention.parameters(), 1.0)
        # update parameters
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        
    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss

In [33]:
fo = open("untitled.txt")

In [34]:
for line in fo:
    print(line)
    

sdfsdf

dsf

sdfsf

sdfs


In [22]:
train()

Train:   0%|          | 0/62484 [00:00<?, ?it/s]

step: 0 loss: 4.167180061340332
step: 10 loss: 4.165933609008789
step: 20 loss: 4.1711344718933105
step: 30 loss: 4.185024261474609
step: 40 loss: 4.177688121795654


KeyboardInterrupt: 