# Fine-tuning model for interview question answering

This notebook is an experiment in fine-tuning a model for interview question answering. It draws on several sources of interview questions and answers, and the goal is to fine-tune a model so that it can answer such questions.

## Requirements:
This code was written on a local machine with an Nvidia GTX 1660 Ti (6 GB) GPU, but you can also run it for free on Google Colab.

 - Anaconda
 - Nvidia CUDA Toolkit 11.1
 - Jupyter Notebook

## Links to the sources:

Sources for interview questions and answers:
- [https://github.com/sudheerj/angular-interview-questions](https://github.com/sudheerj/angular-interview-questions)
- [https://github.com/sudheerj/javascript-interview-questions](https://github.com/sudheerj/javascript-interview-questions)
- [https://github.com/sudheerj/reactjs-interview-questions](https://github.com/sudheerj/reactjs-interview-questions)
- [https://github.com/aershov24/full-stack-interview-questions](https://github.com/aershov24/full-stack-interview-questions)

## Model for paraphrasing:
We also use the following model for paraphrasing:
- [https://huggingface.co/google/flan-t5-small](https://huggingface.co/google/flan-t5-small)

## Model for fine-tuning:
And the following models for fine-tuning:
- [https://huggingface.co/databricks/dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b)
- [https://huggingface.co/google/flan-t5-small](https://huggingface.co/google/flan-t5-small)

In [None]:
# Install dependencies and libraries including CUDA for PyTorch
!pip install datasets markdown beautifulsoup4
!pip install torch torchvision torchaudio --index-url https: // download.pytorch.org/whl/cu117
!pip install transformers pandas accelerate nvidia-ml-py3 datasets nltk

## Create question and answer dataset from interviewing questions and answers

This code is used to create a dataset from the sources above. It parses the markdown files and creates a dataset in JSON format.
Most of the questions are `h3` or `h4` headings, and each answer is the content that follows its question heading.

In [None]:
import os

import markdown
import pandas as pd
from bs4 import BeautifulSoup

# Markdown files from the sudheerj repositories (one file per framework).
sudheerj_paths = [
    os.path.join('..', 'data', 'interview', 'sudheerj', md_name)
    for md_name in (
        'angular-interview-questions.md',
        'javascript-interview-questions.md',
        'reactjs-interview-questions.md',
    )
]

# Markdown file from the aershov24 repository.
aershov24_paths = [
    os.path.join(
        '..', 'data', 'interview', 'aershov24',
        'full-stack-interview-questions.md',
    ),
]

# Extract questions and answers from markdown files
def parse_files(md_files, question_selector):
    """Extract question/answer pairs from markdown files.

    Each file is rendered to HTML. Every element matching
    ``question_selector`` is treated as a question, and the sibling
    elements that follow it, up to the next question heading, form its
    answer.

    Args:
        md_files: Iterable of paths to UTF-8 encoded markdown files.
        question_selector: Tag name of the question headings (e.g. ``"h3"``).
            It is used both as a CSS selector and as the tag name that ends
            an answer, so it must be a bare tag name for answers to be
            delimited correctly.

    Returns:
        A ``pandas.DataFrame`` with the string columns ``question`` and
        ``answer``, one row per question found (empty if none were found).
    """
    rows = []
    for md_file in md_files:
        # Only the read needs the file handle; parse after it is closed.
        with open(md_file, "r", encoding="utf-8") as file:
            md_content = file.read()

        soup = BeautifulSoup(markdown.markdown(md_content), "html.parser")

        for question in soup.select(question_selector):
            answer_elements = []
            sibling = question.find_next_sibling()

            # Collect everything up to the next question heading.
            while sibling is not None and sibling.name != question_selector:
                answer_elements.append(str(sibling))
                sibling = sibling.find_next_sibling()

            # Name the parser explicitly: otherwise BeautifulSoup warns and
            # picks whichever parser happens to be installed.
            answer = BeautifulSoup(''.join(answer_elements).strip(), "html.parser")

            rows.append({
                'question': question.text.strip(),
                'answer': answer.text.strip(),
            })

    # Build the frame once instead of concatenating inside the loop,
    # which copies all previous rows on every iteration.
    return pd.DataFrame(rows, columns=['question', 'answer'])


# The two sources mark their questions with different heading levels.
sudheerj_df = parse_files(md_files=sudheerj_paths, question_selector="h3")
aershov24_df = parse_files(md_files=aershov24_paths, question_selector="h4")

# Merge both sources into one frame and write it out as JSON records.
combine_df = pd.concat((sudheerj_df, aershov24_df), ignore_index=True)
combine_df.to_json(
    path_or_buf=os.path.join('..', 'datasets', 'interview', 'interview_questions.json'),
    orient='records',
)
combine_df.tail()

In [None]:
from tqdm.auto import tqdm
import random
import os
import pandas as pd
import nltk
from nltk.corpus import wordnet

# Fetch the WordNet corpus used for the synonym lookups below.
nltk.download('wordnet')

# Reload the question/answer pairs written by the parsing cell.
combine_df = pd.read_json(
    path_or_buf=os.path.join('..', 'datasets', 'interview', 'interview_questions.json'),
    orient='records',
)

def get_synonyms(word):
    """Return the WordNet synonyms of ``word``.

    Lemma names are collected from every synset of ``word``. Multi-word
    lemmas, which WordNet joins with underscores, are returned with spaces
    so they can be dropped straight into a sentence. The word itself is
    excluded (case-insensitively) so that every returned entry is a real
    replacement.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when WordNet has no
        synset for ``word`` or no lemma other than the word itself.
    """
    original = word.casefold()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.casefold() != original:
                synonyms.add(candidate)
    # Sorted so the result does not depend on set iteration order.
    return sorted(synonyms)

def replace_with_synonym(sentence, max_replacements=2):
    """Randomly swap up to ``max_replacements`` words for a synonym.

    The sentence is split on whitespace. While the replacement budget is not
    exhausted, each word has a 50% chance of being looked up, and it is
    replaced by a randomly chosen synonym when the lookup finds any. The
    words are re-joined with single spaces.
    """
    remaining = max_replacements
    result = []

    for token in sentence.split():
        # The coin is only flipped while there is budget left.
        if remaining > 0 and random.random() < 0.5:
            candidates = get_synonyms(token)
            if candidates:
                token = random.choice(candidates)
                remaining -= 1
        result.append(token)

    return ' '.join(result)

def generate_augmented_row(row):
    """Create one augmented (question, answer) pair from a dataset row.

    One of three variants is chosen at random:

    * 25%: augmented question, original answer
    * 25%: original question, augmented answer
    * 50%: both augmented

    The previous ``rand < 0.25 / rand > 0.25 / else`` chain could only reach
    the "both augmented" branch when ``rand`` was exactly 0.25, so that
    variant was effectively never produced.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``'question'`` and
            ``'answer'`` string entries.

    Returns:
        A ``(question, answer)`` tuple of strings.
    """
    question = row['question']
    answer = row['answer']

    rand = random.random()
    # Augment only the side(s) actually returned; no work is thrown away.
    if rand < 0.25:
        return replace_with_synonym(question), answer
    if rand < 0.5:
        return question, replace_with_synonym(answer)
    return replace_with_synonym(question), replace_with_synonym(answer)

def generate_augmented_rows(df):
    """Lazily yield one augmented (question, answer) pair per row of ``df``.

    Progress is reported through a tqdm bar labelled 'Augmenting'.
    """
    progress = tqdm(df.iterrows(), total=len(df), desc='Augmenting')
    for _index, record in progress:
        yield generate_augmented_row(record)

# Combine the original pairs with two independent augmentation passes, then
# drop incomplete rows and exact duplicates (an augmentation pass can return
# a pair unchanged when no synonym was substituted).
# A stray extra pass whose DataFrame was built and immediately discarded has
# been removed; it only cost time.
augmented_df = pd.concat(
    [
        combine_df,
        pd.DataFrame(generate_augmented_rows(df=combine_df), columns=['question', 'answer']),
        pd.DataFrame(generate_augmented_rows(df=combine_df), columns=['question', 'answer']),
    ],
    ignore_index=True
).dropna().drop_duplicates(subset=['question', 'answer'], keep='first', ignore_index=True)

# Persist the augmented dataset for the fine-tuning cell.
augmented_df.to_json(os.path.join('..', 'datasets', 'interview', 'interview_questions_augmented.json'), orient='records')

print(f'Original dataset size: {len(combine_df)}')
print(f'Augmented dataset size: {len(augmented_df)}')
augmented_df.tail()

In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from accelerate import Accelerator

# Number of passes over the training split.
num_train_epochs = 4

accelerator = Accelerator()
# Load the augmented question/answer pairs written by the augmentation cell.
data = pd.read_json(os.path.join('..', 'datasets', 'interview', 'interview_questions_augmented.json'), orient='records')
# 90% of the rows (fixed seed) go to training; the remaining rows are the validation split.
# NOTE(review): the augmented file holds original rows plus synonym-swapped variants of them,
# so near-duplicates of a training row can presumably land in the validation split - confirm
# whether that leakage is acceptable.
train_data = data.sample(frac=0.9, random_state=42)
val_data = data.drop(train_data.index)

# Wrap the pandas frames as Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Base checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# NOTE(review): a tokenizer is not a model/optimizer/dataloader, so this call is presumably
# a pass-through - verify against the accelerate documentation.
tokenizer = accelerator.prepare(tokenizer)

def preprocess_data(batch):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Questions are prefixed with ``"question: "`` and answers with
    ``"answer: "``. Both are truncated/padded to 512 tokens.

    Args:
        batch: Mapping with ``"question"`` and ``"answer"`` lists of strings
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` arrays.
    """
    input_texts = ["question: " + example for example in batch["question"]]
    target_texts = ["answer: " + example for example in batch["answer"]]
    input_tokenized = tokenizer(input_texts, truncation=True, max_length=512, padding="max_length", return_tensors="np")
    target_tokenized = tokenizer(target_texts, truncation=True, max_length=512, padding="max_length", return_tensors="np")

    # Padding positions in the labels must be -100 so the loss ignores them;
    # leaving the pad token id in place trains the model to predict padding.
    labels = target_tokenized.input_ids.copy()
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        "input_ids": input_tokenized.input_ids,
        "attention_mask": input_tokenized.attention_mask,
        "labels": labels,
    }


# Tokenize both splits in batches with preprocess_data.
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Load the pretrained checkpoint together with its configuration.
config = T5Config.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

# NOTE(review): the model is handed to Seq2SeqTrainer below, which presumably handles device
# placement itself - confirm that preparing it with accelerate first is needed.
model = accelerator.prepare(model)


# Evaluate and checkpoint once per epoch, with a batch size of 1 per device for both
# training and evaluation.
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs",
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Collator that assembles the tokenized examples into seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the fine-tuned model and its tokenizer side by side so that the
# comparison cell can load both from "output/model".
trainer.save_model("output/model")
tokenizer.save_pretrained("output/model")

In [None]:
def generate_answer(question, model, tokenizer, max_length=128):
    """Generate an answer to ``question`` with a seq2seq model.

    The question is prefixed with ``"question: "`` (the same prompt format
    used during fine-tuning), encoded, and decoded with 4-beam search. A
    leading ``"answer: "`` prefix is removed from the generated text.

    Args:
        question: The question text.
        model: Model exposing ``eval()`` and ``generate()``; if it has a
            ``device`` attribute the inputs are moved to that device.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        max_length: Maximum length of the generated sequence.

    Returns:
        The decoded answer string without the ``"answer: "`` prefix.
    """
    model.eval()
    input_text = "question: " + question
    input_tokens = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Keep inputs on the same device as the model so generation also works
    # when the model lives on a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        input_tokens = input_tokens.to(device)

    # Only the best beam is used, so ask for a single returned sequence
    # instead of generating four and discarding three.
    output_tokens = model.generate(
        input_tokens,
        max_length=max_length,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        num_beams=4,
        num_return_sequences=1,
    )
    answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # Strip only the leading training prefix; the same text appearing inside
    # the answer itself is part of the answer and must be kept.
    prefix = "answer: "
    if answer.startswith(prefix):
        answer = answer[len(prefix):]
    return answer

# Ask the same question to the base checkpoint and to the fine-tuned model
# saved under "output/model", then print both answers for comparison.
question = "What is the difference between AngularJS and Angular?"

# Baseline: the unmodified pretrained checkpoint.
source_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
source_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

source_answer = generate_answer(question, source_model, source_tokenizer)

# Fine-tuned model and tokenizer written by the training cell.
tuned_model = T5ForConditionalGeneration.from_pretrained("output/model")
tuned_tokenizer = T5Tokenizer.from_pretrained("output/model")

tuned_answer = generate_answer(question, tuned_model, tuned_tokenizer)

# The trailing comments record the output observed on a previous run.
print(f"Question: {question}") # Question: What is the difference between AngularJS and Angular?
print(f"Source answer: {source_answer}") # Source answer: AngularJS AngularJS may refer to:
print(f"Tuned answer: {tuned_answer}") # Tuned answer: AngularJS is a JavaScript language that can be used to build web applications. It's also known as AngularJS or AngularJS. The main difference between AngularJS and AngularJS is that it uses the same syntax as AngularJS. For example, you can use AngularJS instead of AngularJS.
