In [1]:
import huggingface_hub
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader


In [2]:


# Load the dataset
df = pd.read_csv('/kaggle/input/nvidia-documentation-question-and-answer-pairs/NvidiaDocumentationQandApairs.csv')

# Basic exploration to understand the dataset
print("Dataset size:", df.shape)
print("Columns in the dataset:", df.columns)
print("First few rows of the dataset:")

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

df = df.dropna()
df

Dataset size: (7108, 3)
Columns in the dataset: Index(['Unnamed: 0', 'question', 'answer'], dtype='object')
First few rows of the dataset:

Missing values in each column:
Unnamed: 0    0
question      0
answer        0
dtype: int64


Unnamed: 0.1,Unnamed: 0,question,answer
0,0,What is Hybridizer?,Hybridizer is a compiler from Altimesh that en...
1,1,How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express p...
2,2,What are some parallelization patterns mention...,The text mentions using parallelization patter...
3,3,How can you benefit from accelerators without ...,You can benefit from accelerators' compute hor...
4,4,What is an example of using Hybridizer?,An example in the text demonstrates using Para...
...,...,...,...
7103,7103,What is the focus of the GTC event in 2015?,The focus of the GTC event in 2015 is GPU code...
7104,7104,How were the main changes made to the code for...,"The main changes included merging kernels, reg..."
7105,7105,What are some key fields in the cudaDeviceProp...,"Some key fields include name, memoryClockRate,..."
7106,7106,What did changing the kernel approach achieve ...,Changing the kernel approach reduced the itera...


In [3]:
import nltk
#nltk.download('all-nltk')

# Initialize the WordNet lemmatizer
#lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    tokens = word_tokenize(text)
    # Lemmatization (optional)
    #lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join the tokens into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Preprocess both questions and answers
df['question'] = df['question'].apply(preprocess_text)
df['answer'] = df['answer'].apply(preprocess_text)

# Preprocess and split the dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # 80% train, 20% test
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)  # Of the 80% train, split into 75% train and 25% val

# Now your data is ready for model training
print("Preprocessing complete. Dataset is ready for training.")


Preprocessing complete. Dataset is ready for training.


In [4]:
# Load pre-trained DistilGPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")


# Set padding token
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        
        # Tokenize inputs and labels
        encoded_question = self.tokenizer.encode_plus(question, truncation=True, max_length=self.max_length, return_tensors="pt")
        encoded_answer = self.tokenizer.encode_plus(answer, truncation=True, max_length=self.max_length, return_tensors="pt")
        
        # Pad sequences to the same length
        input_ids = torch.cat([encoded_question['input_ids'], torch.zeros((1, self.max_length - encoded_question['input_ids'].shape[1]), dtype=torch.long)], dim=1)
        labels = torch.cat([encoded_answer['input_ids'], torch.zeros((1, self.max_length - encoded_answer['input_ids'].shape[1]), dtype=torch.long)], dim=1)
        
        return input_ids.squeeze(0), labels.squeeze(0)

# Prepare your datasets
train_dataset = QADataset(train_df['question'].tolist(), train_df['answer'].tolist(), tokenizer, max_length=128)
val_dataset = QADataset(val_df['question'].tolist(), val_df['answer'].tolist(), tokenizer, max_length=128)
test_dataset = QADataset(test_df['question'].tolist(), test_df['answer'].tolist(), tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [6]:
def evaluate(model, val_loader, device):
    model.eval()  # Put the model in evaluation mode
    total_correct = 0
    total_samples = 0
    
    # No gradient needed for evaluation, which saves memory and computations
    with torch.no_grad():
        for batch in val_loader:
            # Move each tensor within the batch to the specified device
            input_ids = batch[0].to(device)

            # Compute model outputs
            outputs = model(input_ids=input_ids)
            logits = outputs.logits

            # Calculate accuracy
            predicted_labels = torch.argmax(logits, dim=-1)
            total_correct += (predicted_labels == batch[1].to(device)).sum().item()
            total_samples += input_ids.size(0)

    # Calculate accuracy
    accuracy = total_correct / total_samples if total_samples > 0 else 0
    
    return accuracy


In [7]:
# Fine-tuning parameters
epochs = 3
learning_rate = 5e-5

# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}")

    # Evaluation on validation data after each epoch
    acc = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{epochs}, Validation accuracy: {acc:.4f}")


Epoch 1/3, Train Loss: 2.1137
Epoch 1/3, Validation accuracy: 97.6575
Epoch 2/3, Train Loss: 2.1035
Epoch 2/3, Validation accuracy: 97.6575
Epoch 3/3, Train Loss: 2.1035
Epoch 3/3, Validation accuracy: 97.6575


In [8]:

accuracy=evaluate(model, test_loader, device)
print(f" accuracy: {accuracy:.4f}")



 accuracy: 97.0837


In [9]:
import pandas as pd

# Load JSON data into DataFrame
df = pd.read_json('/kaggle/input/qaeval/evaluation_data.json')
# Flatten the data since the structure has nested elements under 'data'
df = pd.json_normalize(df['data'])
#print(df.T.head())
# Reshape the DataFrame
questions = []
correct_answers = []
promt="answer with only the right correct option's letter"
for index, row in df.iterrows():
    question = row['Question']
    options = [row['Options.A'], row['Options.B'], row['Options.C'], row['Options.D']]
    correct_answer = row['CorrectAnswer']

    for option in options:
        questions.append(f"{question} {option}{promt}")
        correct_answers.append(correct_answer)

new_data = {
    'Question': questions,
    'CorrectAnswer': correct_answers
}

new_df = pd.DataFrame(new_data)
df=new_df
df['Question'] = df['Question'].apply(preprocess_text)
df['CorrectAnswer'] = df['CorrectAnswer'].apply(preprocess_text)
df.head(5)


Unnamed: 0,Question,CorrectAnswer
0,what is the primary benefit of nvidias tensorr...,c
1,what is the primary benefit of nvidias tensorr...,c
2,what is the primary benefit of nvidias tensorr...,c
3,what is the primary benefit of nvidias tensorr...,c
4,how does nvidias cuda programming model contri...,b


In [10]:

# Prepare your datasets
eval_dataset = QADataset(df['Question'].tolist(), df['CorrectAnswer'].tolist(), tokenizer, max_length=128)

eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

# Evaluate the model
accuracy = evaluate(model, eval_loader, device)
print(f"Accuracy: {accuracy:.4f}%")


Accuracy: 95.6607%


In [11]:
# Save the fine-tuned model
model.save_pretrained("/kaggle/working/fine_tuned_distilGPT")