In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [None]:
!pip install nltk

In [None]:
import random
import nltk
from nltk.corpus import wordnet

# Download the WordNet corpus that the synonym-replacement augmentation
# queries through `wordnet.synsets`, plus the "omw-1.4" resource.
nltk.download("wordnet")
nltk.download("omw-1.4")

In [None]:
# Synonym Replacement
def _first_distinct_synonym(word):
    """Return the first WordNet lemma name for ``word`` that differs from it, or None.

    Underscores in the lemma name are turned into spaces so the result can be
    dropped into running text. The comparison is case-insensitive so that a
    lemma identical to the word itself is never counted as a synonym.
    """
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != word.lower():
                return candidate
    return None


def synonym_replacement(text, n=2):
    """Return ``text`` with randomly chosen words swapped for a WordNet synonym.

    Args:
        text: Input string; it is split into words on whitespace.
        n: Number of replacement attempts. Each attempt picks one word at
            random (with repetition), and every occurrence of the picked word
            is replaced, so the number of distinct words changed can be
            smaller than ``n``.

    Returns:
        The words re-joined with single spaces. Text that contains no words
        (empty or whitespace-only) yields an empty string.
    """
    words = text.split()
    if not words:
        # random.choice raises IndexError on an empty sequence.
        return ""
    new_words = words.copy()
    for _ in range(n):
        word = random.choice(words)
        replacement = _first_distinct_synonym(word)
        # Words with no usable synonym are left untouched.
        if replacement is not None:
            new_words = [replacement if w == word else w for w in new_words]
    return " ".join(new_words)

# Sentence Shuffling
def shuffle_sentences(text):
    """Return ``text`` with its sentences in random order.

    Sentences are the pieces separated by ". ". Trailing whitespace is
    dropped, and a terminal period is detached before shuffling and put back
    at the very end afterwards; otherwise the period would travel with the
    last sentence and produce ".." in the middle of the result while leaving
    the new last sentence unterminated.

    Args:
        text: Input string.

    Returns:
        The shuffled text. A terminal period is present in the result exactly
        when the (right-stripped) input had one.
    """
    body = text.rstrip()
    ends_with_period = body.endswith(".")
    if ends_with_period:
        body = body[:-1]
    sentences = body.split(". ")
    random.shuffle(sentences)
    shuffled = ". ".join(sentences)
    return shuffled + "." if ends_with_period else shuffled


**Data Preprocessing**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.utils import resample

# Seed Python's RNG so the random augmentation choices below are repeatable,
# in line with the fixed random_state used for resampling and splitting.
random.seed(42)

# Train Data
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Datasets/training.csv")

# A missing title or abstract would make the concatenated text NaN, which the
# string-based augmentation functions cannot process, so treat it as empty.
train_df["title"] = train_df["title"].fillna("").str.lower()
train_df["abstract"] = train_df["abstract"].fillna("").str.lower()

# Merge title + abstract
train_df["text"] = train_df["title"] + " " + train_df["abstract"]

# Ensure category labels are converted to integers
label_map = {label: idx for idx, label in enumerate(train_df["category"].unique())}
train_df["label"] = train_df["category"].map(label_map)

# Check class distribution before oversampling
print("Before Oversampling:")
print(train_df["category"].value_counts(normalize=True))

# Stratified 80-20 split, done BEFORE oversampling. Resampling with
# replacement first and splitting afterwards would put copies of the same
# row on both sides of the split and inflate every validation metric.
# The validation set therefore keeps the original class proportions.
# Note: stratification needs at least two rows per category.
train_split_df, val_split_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"],
)


def _augment_text(text):
    """Return ``text`` unchanged about half the time, otherwise augmented.

    When augmenting, synonym replacement or sentence shuffling is picked
    with equal probability.
    """
    if random.random() > 0.5:
        choice = random.choice(["synonym", "shuffle"])
        if choice == "synonym":
            return synonym_replacement(text)
        return shuffle_sentences(text)
    return text


# Find the largest category count (within the training split only)
max_size = train_split_df["category"].value_counts().max()

# Oversample all training classes to match the largest category and apply augmentation
df_balanced = []
for category in train_split_df["category"].unique():
    df_subset = train_split_df[train_split_df["category"] == category]
    df_upsampled = resample(df_subset, replace=True, n_samples=max_size, random_state=42)
    augmented_data = df_upsampled.copy()
    # One pass over the column instead of a per-row .iloc assignment.
    augmented_data["text"] = [_augment_text(t) for t in df_upsampled["text"]]
    df_balanced.append(augmented_data)

# Combine balanced and augmented dataset
balanced_df = pd.concat(df_balanced)

# Shuffle dataset
balanced_df = balanced_df.sample(frac=1, random_state=42)

# Check new class distribution
print("\nAfter Oversampling:")
print(balanced_df["category"].value_counts(normalize=True))

train_texts, train_labels = balanced_df["text"], balanced_df["label"]
val_texts, val_labels = val_split_df["text"], val_split_df["label"]

# Check category distribution in training set
print("Training set distribution:")
print(pd.Series(train_labels).value_counts(normalize=True))

# Check category distribution in validation set
print("\nValidation set distribution:")
print(pd.Series(val_labels).value_counts(normalize=True))

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
val_dataset = Dataset.from_dict({"text": val_texts.tolist(), "label": val_labels.tolist()})

print(f"Training set: {len(train_dataset)} samples")
print(f"Validation set: {len(val_dataset)} samples")
print("Dataset Loaded Successfully")


**Fine-tuning pretrained model**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to fine-tune from.
# Alternative checkpoint: "microsoft/deberta-v3-large"
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v1"

# Tokenizer and classifier come from the same checkpoint. The classification
# head is sized to the number of categories in label_map, and mismatched
# weight shapes between checkpoint and requested model are ignored on load.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    ignore_mismatched_sizes=True,
)

print("Model loaded for fine-tuning")

In [None]:
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: Batch mapping with a "text" entry, as handed over by
            ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens kept per example; longer inputs
            are truncated. The default of 512 is assumed to match the
            checkpoint's position limit (confirm against the model config if
            a different checkpoint is used).

    Returns:
        The tokenizer output for the batch.
    """
    # The limit is explicit so truncation does not depend on whether the
    # tokenizer config declares a maximum length. No static padding is done
    # here: the Trainer is given the tokenizer and pads each batch to its
    # longest member, which avoids padding every example to the full limit.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

print("Tokenization complete")

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations; reporting integrations are switched off.
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Schedule and batch sizing: 2 examples per device, gradients
    # accumulated over 4 steps.
    num_train_epochs=6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Mixed-precision training.
    fp16=True,
)

# Wire model, data and tokenizer into the Trainer.
# NOTE(review): no compute_metrics is passed here; accuracy and F1 are only
# computed in the separate evaluation cell after training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Run fine-tuning
trainer.train()

**Evaluate the model on the validation dataset**

In [None]:
import numpy as np
import evaluate

# Metric objects loaded once and reused by compute_metrics.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-item sequence of model logits and reference labels.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predicted_ids, references=gold)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=gold, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

In [None]:
# Run the trained model over the validation split.
raw_eval_preds = trainer.predict(tokenized_val_dataset)
logits = raw_eval_preds.predictions
labels = raw_eval_preds.label_ids

# Score the predictions with accuracy and weighted F1.
metrics = compute_metrics((logits, labels))
print("Computed Metrics:", metrics)

**Save model**

In [None]:
# Save model & tokenizer into one directory. The path is defined once so the
# two save calls and the confirmation message cannot drift apart.
model_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/fine_tuned_model1"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print(f"Fine-tuned model saved to: {model_path}")


**Test the model on a sample text to verify its integrity before predicting on the test file**

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved checkpoint, reusing the
# tokenizer object that is already in memory.
classifier = pipeline(
    "text-classification",
    model="/content/drive/MyDrive/Colab Notebooks/Datasets/fine_tuned_model1",
    tokenizer=tokenizer,
)

# Sample input for the sanity check.
# Shorter alternative: "Quantum entanglement is key to secure communication."
text = "The present invention involves a quantum computing structure, comprising: one or more logical qubits, which is encoded into a plurality of superconducting qubits; and each of the logical qubits comprises at least one operating qubit and at least one ancilla qubit. Also provided is a method of quantum computing, comprising: performing encoded quantum computing operations with logical qubits that are encoded into superconducting operating qubits and superconducting ancilla qubits. The present invention further involves a method of error correction for a quantum computing structure comprising: presenting a plurality of logical qubits, each of which comprises an operating physical qubit and an ancilla physical qubit, wherein the logical states of the plurality of logical qubits are formed from a tensor product of the states of the operating and ancilla qubits; and wherein the states of the ancilla physical qubits are suppressed; and applying strong pulses to the grouping of logical qubits."
prediction = classifier(text)

print("Predicted Category:", prediction)

# Report how many output classes the in-memory model is configured with.
num_labels = model.config.num_labels
print(f"Number of labels: {num_labels}")



**Predict on the test file**

In [None]:
# Load the fine-tuned model
model_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/fine_tuned_model1"
# With no top_k / return_all_scores argument the pipeline returns only the
# best-scoring label per input.
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)

# Load test dataset
test_file_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/testing.csv"  # Update with actual path
test_df = pd.read_csv(test_file_path)

# Stop right away if the input is unusable; merely printing would let the
# cell run on and fail later with an unrelated KeyError.
if not {"title", "abstract"}.issubset(test_df.columns):
    raise ValueError("Error: Ensure your CSV file contains 'title' and 'abstract' columns.")

# Build the model input the same way the training text was built:
# lowercased title + " " + lowercased abstract, with missing values as "".
test_texts = (
    test_df["title"].fillna("").astype(str).str.lower()
    + " "
    + test_df["abstract"].fillna("").astype(str).str.lower()
).tolist()

# One batched pass over all texts (each text is classified exactly once).
# Inputs longer than 512 tokens are truncated instead of being fed through
# at full length.
if test_texts:
    predictions = classifier(test_texts, batch_size=16, truncation=True, max_length=512)
else:
    predictions = []

# Convert `LABEL_X` to an integer, e.g. `LABEL_1` -> 1
predicted_ids = [int(p["label"].split("_")[-1]) for p in predictions]

# Reverse the dictionary.
# NOTE: label_map comes from the preprocessing cell, which must have been run
# in this session on the same training data the model was fine-tuned on.
label_map_reverse = {v: k for k, v in label_map.items()}

# Map numeric labels back to their category names (unknown ids become empty)
# and attach the confidence of the predicted label.
test_df = test_df.assign(
    predicted_category=[label_map_reverse.get(i) for i in predicted_ids],
    confidence=[p["score"] for p in predictions],
)

# Save file with predictions
output_file = "/content/drive/MyDrive/Colab Notebooks/Datasets/Kuan_Lin_testing.csv"
test_df.to_csv(output_file, index=False)
print(f"Predictions saved to: {output_file}")

**Double check format and output**

In [None]:
# Re-read the predictions file written by the prediction cell
# (Kuan_Lin_testing.csv) so the check inspects the file that was just saved.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Datasets/Kuan_Lin_testing.csv")
print(df["predicted_category"].head())