In [None]:
import pandas as pd
# One generated-abstracts CSV per CPC subclass, read in a fixed order so that
# df1..df6 hold G06F, G06K, G06N, G06Q, G06T and G06V respectively.
abstract_csv_paths = (
    'Generated_Abstracts_G06F.csv',
    'Generated_Abstracts_G06K.csv',
    'Generated_Abstracts_G06N.csv',
    'Generated_Abstracts_G06Q.csv',
    'Generated_Abstracts_G06T.csv',
    'Generated_Abstracts_G06V.csv',
)
df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in abstract_csv_paths)

In [None]:
import pandas as pd
# Stack the six per-subclass frames into one frame with a fresh 0..n-1 index.
merged_df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
# Shuffle every row. The fixed random_state keeps the row order (and therefore
# the written CSV) the same on every rerun with the same inputs.
shuffled_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)
# index=False: do not write the positional index as an extra column.
shuffled_df.to_csv('merged_G06.csv', index=False)

In [None]:
# Reload the merged, shuffled dataset from the CSV written to disk.
merged_csv_path = 'merged_G06.csv'
df = pd.read_csv(merged_csv_path)

In [None]:
# Preview the first rows of the merged dataset to check its columns.
df.head()

Unnamed: 0,patent_id,patent_abstract,patent_title,claim_text,cpc_subclass,generated_abstract
0,11224782,"Systems, methods, and computer-readable media ...",Physical activity monitoring and motivating wi...,A method of operating an electronic device co...,G06Q,The present disclosure relates to a method f...
1,11232342,"An RFID tag includes a booster antenna, a feed...",RFID tag and method for manufacturing RFID tag,"A method for manufacturing an RFID tag, compr...",G06K,The present invention relates to a method fo...
2,11216497,The disclosure relates to an artificial intell...,Method for processing language information and...,"A method for operating an electronic device, ...",G06F,The present invention relates to a method fo...
3,11217636,A display device includes a display module. An...,Display device,A display device comprising: a display module...,G06V,The present invention relates to a display d...
4,11216488,Embodiments of present disclosure discloses sy...,Method and system for managing applications in...,A method for managing applications in an elec...,G06N,The present patent claim relates to a method...


In [None]:
# (number of rows, number of columns) of the merged dataset.
df.shape

(6000, 6)

In [None]:
# The abstract column used to be addressed as 'patent_abstract' followed by a
# stray tab character, which only resolves if the CSV header carries that exact
# whitespace. Selecting from a copy whose header names are stripped works for
# both spellings and leaves `df` itself untouched.
df_stripped_headers = df.rename(columns=lambda name: str(name).strip())
texts = df_stripped_headers['patent_abstract'].tolist()
label = df_stripped_headers['cpc_subclass'].tolist()

In [None]:
# Integer class id for each CPC subclass present in the dataset.
subclass_to_label = {
    'G06F': 0,
    'G06K': 1,
    'G06N': 2,
    'G06Q': 3,
    'G06T': 4,
    'G06V': 5
}

subclass_values = df['cpc_subclass'].tolist()

# Fail fast on unknown or missing subclasses. Silently mapping them to -1 would
# add a spurious class to set(labels) and hand the model an invalid class index.
unknown_subclasses = sorted({str(value) for value in subclass_values if value not in subclass_to_label})
if unknown_subclasses:
    raise ValueError(f"Unexpected cpc_subclass values: {unknown_subclasses}")

labels = [subclass_to_label[value] for value in subclass_values]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Hold out 20% of the examples for validation; the fixed random_state keeps the
# split the same from run to run.
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenised with the same truncation and padding settings.
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)


class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence of class ids indexed the same way.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example ``idx`` to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are stored as int64 tensors under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


# Seed PyTorch's global RNG so the shuffled training order (and any randomly
# initialised weights) are repeatable across runs.
torch.manual_seed(42)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Size the classifier from the label map rather than from the labels that
# happen to be present: a class missing from the data would otherwise shrink
# the output layer and leave the larger class ids out of range.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(subclass_to_label))
model.to(device)

num_epochs = 5
optimizer = AdamW(model.parameters(), lr=5e-5)
# Linear decay with no warmup, over one scheduler step per training batch.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)


def train(epoch, model, dataloader, optimizer, scheduler):
    """Run one training pass over ``dataloader``.

    Args:
        epoch: Epoch index; used only in the progress message.
        model: Module called as ``model(**batch)``; its result must expose ``.loss``.
        dataloader: Iterable yielding dicts of tensors (one dict per batch).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
    """
    # Send batches to the device the model's weights are on, instead of relying
    # on a notebook-level `device` global that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for i, batch in enumerate(dataloader):
        optimizer.zero_grad()
        batch = {k: v.to(target_device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Report the loss of every tenth batch.
        if i % 10 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Fine-tune for the configured number of epochs, one full pass per iteration.
for epoch in range(num_epochs):
    train(
        epoch=epoch,
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
    )

def evaluate(model, dataloader):
    """Predict over ``dataloader`` and return a text classification report.

    Args:
        model: Module called as ``model(**batch)``; its result must expose ``.logits``.
        dataloader: Iterable yielding dicts of tensors that include a ``'labels'`` entry.

    Returns:
        The value of ``classification_report(..., output_dict=False)`` computed
        from the collected true labels and argmax predictions.
    """
    # Send batches to the device the model's weights are on, instead of relying
    # on a notebook-level `device` global that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions, true_labels = [], []
    for batch in dataloader:
        batch = {k: v.to(target_device) for k, v in batch.items()}
        # No gradients are needed for inference.
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(batch['labels'].tolist())
    return classification_report(true_labels, predictions, output_dict=False)

# Score the fine-tuned model on the validation split and show the report.
val_report = evaluate(model, val_loader)
print(val_report)