In [None]:
import os
import torch
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict
import json 
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [None]:
MODEL_NAME = "gpt2"  
OUTPUT_DIR = "./finetuned-gpt2"
DATASET_NAME = "Manageengine"  
NUM_TRAIN_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-5
WARMUP_STEPS = 500
MAX_LENGTH = 512
SAVE_STEPS = 10000
SEED = 42

In [None]:
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [None]:
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
dataset = load_dataset(DATASET_NAME)

In [9]:
# import requests
# from bs4 import BeautifulSoup
# import re
# import json

# url = "https://www.manageengine.com/products/desktop-central/faq.html"
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
# }
# response = requests.get(url, headers=headers)
# if response.status_code != 200:
#     print("Failed to fetch the page")
#     exit()

# soup = BeautifulSoup(response.text, "html.parser")

# faqs = []
# faq_sections = soup.find_all("div", class_="accordion_in")

# for section in faq_sections:
#     question = section.find("div", class_="acc_head").text.strip()
#     question = re.sub(r"^\d+\.\s*", "", question)
#     answer = section.find("div", class_="acc_content").text.strip()
#     faqs.append({"question": question, "answer": answer})



# with open("faqs.json", "w", encoding="utf-8") as f:
#     json.dump(faqs, f, indent=4, ensure_ascii=False)

# print("FAQs scraped successfully and saved to faqs.json")


FAQs scraped successfully and saved to faqs.json


In [12]:
def create_dataset_from_qa_pairs(file_path, train_ratio=0.8):
    with open(file_path, 'r', encoding='utf-8') as f:
        qa_pairs = json.load(f)
    df = pd.DataFrame(qa_pairs)
    df['text'] = df.apply(lambda row: f"Question: {row['question']}\nAnswer: {row['answer']}", axis=1)
    train_size = int(len(df) * train_ratio)
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:]
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset
    })
    return dataset_dict


In [None]:
dataset = create_dataset_from_qa_pairs("faqs.json")

In [None]:
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        padding = "max_length",
        truncation = True,
        max_length = MAX_LENGTH
    )

In [None]:
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]  
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False 
)

In [None]:
trainer_arguments = TrainingArguments(
    output_dir = OUTPUT_DIR,
    overwrite_output_dir = True,
    num_train_epochs = 3,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    warmup_steps = WARMUP_STEPS,
    learning_rate = LEARNING_RATE,
    weight_decay = 0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=SAVE_STEPS,
    evaluation_strategy="steps",
    eval_steps=SAVE_STEPS,
    seed=SEED,
    fp16=torch.cuda.is_available(),
)

In [None]:
trainer = Trainer(
    model = model,
    args = trainer_arguments,
    data_collator = data_collator,
   train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

In [None]:
trainer.train()

In [None]:
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

In [18]:
import numpy as np
def apply_rope(x, pos, dim):
    """
    Applies RoPE (Rotary Positional Embeddings) to an input vector.
    
    Args:
        x: Input tensor of shape (seq_len, dim)
        pos: Position indices (seq_len,)
        dim: Embedding dimension (must be even)
    
    Returns:
        Tensor of shape (seq_len, dim) with RoPE applied.
    """
    # Ensure the dimension is even for rotation
    assert dim % 2 == 0, "Embedding dimension must be even for RoPE"

    # Compute theta (rotation angles) for each dimension
    theta = 1.0 / (10000 ** (2 * (np.arange(dim // 2) / dim)))
    print(theta)

    # Compute angles for each position
    angles = np.outer(pos, theta)  # Shape: (seq_len, dim//2)

    # Compute sin and cos for the rotation matrix
    sin_angles = np.sin(angles)
    cos_angles = np.cos(angles)

    # Split input tensor into real and imaginary parts
    x_real, x_imag = np.split(x, 2, axis=-1)  # Each of shape (seq_len, dim/2)

    # Apply rotation
    x_rotated = np.concatenate([x_real * cos_angles - x_imag * sin_angles,
                                x_real * sin_angles + x_imag * cos_angles], axis=-1)

    return x_rotated


In [19]:
# Example sequence of 5 tokens with embedding size 4 (must be even)
seq_len = 5
dim = 4  # Must be even
x = np.random.rand(seq_len, dim)  # Random embeddings

# Define positions (0 to seq_len-1)
positions = np.arange(seq_len)
print(x)
print(positions)
# Apply RoPE
x_rope = apply_rope(x, positions, dim)

# Print results
print("Original Embeddings:\n", x)
print("\nRoPE Transformed Embeddings:\n", x_rope)


[[0.8892339  0.98800979 0.96360525 0.61355999]
 [0.52276888 0.35692841 0.18013395 0.01959338]
 [0.20788909 0.53627561 0.14015751 0.1703878 ]
 [0.86538056 0.51496165 0.00778109 0.79358106]
 [0.09406633 0.47596445 0.42949415 0.56032595]]
[0 1 2 3 4]
[1.   0.01]
Original Embeddings:
 [[0.8892339  0.98800979 0.96360525 0.61355999]
 [0.52276888 0.35692841 0.18013395 0.01959338]
 [0.20788909 0.53627561 0.14015751 0.1703878 ]
 [0.86538056 0.51496165 0.00778109 0.79358106]
 [0.09406633 0.47596445 0.42949415 0.56032595]]

RoPE Transformed Embeddings:
 [[ 0.8892339   0.98800979  0.96360525  0.61355999]
 [ 0.13087573  0.35671464  0.53722163  0.02316162]
 [-0.21395725  0.53276083  0.13070691  0.18107852]
 [-0.85781833  0.49092607  0.11441929  0.80867051]
 [ 0.26355639  0.45317667 -0.35192575  0.57891125]]
