### About
#### Generate sample questions for each Q&A record using an LLM (Gemini).

### Import necessary packages and libraries

In [11]:
pip install -qq -U google-generativeai 

Note: you may need to restart the kernel to use updated packages.


In [81]:

import ast
import re
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm

### Get data

In [16]:
# Remote JSON file holding the Q&A dataset used throughout this notebook.
QNA_DATA_URL = 'https://raw.githubusercontent.com/hariprasath-v/Nnet101_Assistant/refs/heads/main/data/nnet_101_qna_with_id.json'
qna_df = pd.read_json(QNA_DATA_URL)

In [56]:
# One dict per record with the keys 'id', 'question', 'tags' and 'answer'.
# Built from `qna_df` (the frame loaded from the JSON file) so the cell runs on a fresh kernel.
data_dict = qna_df[['id','question','tags','answer']].to_dict(orient='records')

### Gemini API configuration

In [89]:
import google.generativeai as genai
import os

# Read the API key from the environment rather than from a variable that no cell
# defines, and fail early with a clear message when it is missing. Keeping the key
# out of the notebook also prevents it from leaking into saved cells or outputs.
secret_value = os.environ.get("GOOGLE_API_KEY")
if not secret_value:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this notebook.")

genai.configure(api_key=secret_value)

### Build the prompt used to generate sample questions

In [138]:
# Prompt template for question generation. It is filled with str.format, so the
# {question}, {answer} and {tags} placeholders must match keys of the record dict.
# NOTE(review): the worked example labels its fields ("Question:", "Answer:", "Tags:")
# while the real record is injected without labels; confirm whether labelling the
# placeholders would make the model's output format more consistent.
build_prompt = """
Generate a list of five clear, concise questions based on the provided data.

Each question should:
* Be straightforward and easy to understand.
* Use as few words as possible.
* Be relevant to the given data.

**Example:**

**Data:**
Question: What is the capital of France?
Answer: Paris
Tags: geography|country|city

**Output:**
['What is the largest city in France?', 'When was the French Revolution?', 'What is the official language of France?', 'Who is the current president of France?', 'What is the most famous landmark in France?']

**Data:**

{question}
{answer}
{tags}

**Output:**
"""


### Define model

In [110]:
# Gemini model handle used by generate_questions.
GEMINI_MODEL_NAME = "models/gemini-1.0-pro"
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

In [None]:
def generate_questions(doc):
    """Fill the prompt template with one record and return the model's raw text reply.

    Args:
        doc: Record dict; its keys are passed to ``build_prompt.format``, so it
            must provide at least 'question', 'answer' and 'tags'.

    Returns:
        The ``.text`` of the response from ``model.generate_content``, unparsed.
    """
    filled_prompt = build_prompt.format(**doc)
    return model.generate_content(filled_prompt).text

### Sample questions

In [370]:
data_dict[0]

{'id': 'f55240b8',
 'question': 'How to choose the number of hidden layers and nodes in a feedforward neural network?',
 'tags': 'model-selection|neural-networks',
 'answer': "**Network Configuration in Neural Networks**\n\n**Standardization**\nThere is no single standardized method for configuring networks. However, guidelines exist for setting the number and type of network layers, as well as the number of neurons in each layer.\n\n**Initial Architecture Setup**\nBy following specific rules, one can establish a competent network architecture. This involves determining the number and type of neuronal layers and the number of neurons within each layer. This approach provides a foundational architecture but may not be optimal.\n\n**Iterative Tuning**\nOnce the network is initialized, its configuration can be iteratively tuned during training. Ancillary algorithms, such as pruning, can be used to eliminate unnecessary nodes, optimizing the network's size and performance.\n\n**Network Lay

In [371]:
print(build_prompt.format(**data_dict[0]))



Generate a list of five clear, concise questions based on the provided data.

Each question should:
* Be straightforward and easy to understand.
* Use as few words as possible.
* Be relevant to the given data.

**Example:**

**Data:**
Question: What is the capital of France?
Answer: Paris
Tags: geography|country|city

**Output:**
['What is the largest city in France?', 'When was the French Revolution?', 'What is the official language of France?', 'Who is the current president of France?', 'What is the most famous landmark in France?']

**Data:**

How to choose the number of hidden layers and nodes in a feedforward neural network?
**Network Configuration in Neural Networks**

**Standardization**
There is no single standardized method for configuring networks. However, guidelines exist for setting the number and type of network layers, as well as the number of neurons in each layer.

**Initial Architecture Setup**
By following specific rules, one can establish a competent network archit

In [372]:
generate_questions(data_dict[0])

"['How do you determine the number of hidden layers in a neural network?', 'How many neurons should each layer of a neural network have?', 'What type of neurons should be used in a neural network?', 'How do you optimize the configuration of a neural network?', 'What is the purpose of pruning in a neural network?']"

In [373]:
len(data_dict)

500

### Generate sample questions for every record using Gemini

In [None]:
# Maps document id -> raw model reply (text, not yet parsed into a list).
results = {}
# Wrap the list itself rather than the enumerate object: tqdm can then read
# len(data_dict) and display the total, percentage and ETA.
for idx, doc in enumerate(tqdm(data_dict, desc="Generating questions")):
    doc_id = doc['id']
    # Skip ids that already have a reply (e.g. duplicated ids in the data).
    if doc_id in results:
        continue
    questions = generate_questions(doc)
    results[doc_id] = questions
    # Pause for 60 seconds after every 15 iterations; no pause is needed once
    # the final record has been processed.
    if (idx + 1) % 15 == 0 and (idx + 1) < len(data_dict):
        print(f"Pausing for 60 seconds...|Completed: {len(results)}|Remaining: {len(data_dict)-len(results)}")
        time.sleep(60)
#total time to generate questions: 45:37

### Save the raw generated questions

In [144]:
import pickle

# Persist the untouched model replies so the cleaning steps below can be
# re-run without calling the API again.
raw_path = 'gemini_llm_generated_questions_raw.pkl'
with open(raw_path, 'wb') as raw_file:
    pickle.dump(results, raw_file)

### Load data

In [145]:
# Reload the raw replies from disk. Only unpickle files this notebook wrote itself:
# pickle.load can execute arbitrary code when given an untrusted file.
with open('gemini_llm_generated_questions_raw.pkl', 'rb') as raw_file:
    loaded_dict = pickle.load(raw_file)

In [406]:
loaded_dict['f55240b8']

'1. How do I choose the number of hidden layers in a neural network?\n2. How many nodes should I use in each hidden layer?\n3. When should I use pruning to optimize network configuration?\n4. What is the relationship between the number of nodes in the input, hidden, and output layers?\n5. How can I determine the optimal network size for my problem?'

In [377]:
# One row per document: its id and the raw (still unparsed) model reply.
llm_question_sample = pd.DataFrame(
    list(loaded_dict.items()),
    columns=['id', 'questions'],
)

#### The generated questions do not come back in a consistent list format. Let's identify the formats and parse each one into a Python list.

### Unique question starting patterns

In [378]:
# The first two characters of each reply are enough to tell the output formats apart.
llm_question_sample['questions'].map(lambda reply: reply[:2]).unique()

array(['1.', "['", '- ', '[\n', '* '], dtype=object)

In [380]:
# Sample reply in the numbered-list format ('1. ...').
# The trailing [0] is an index-label lookup on the filtered Series, not a position.
llm_question_sample[llm_question_sample['questions'].str.startswith('1.')]['questions'][0]

'1. How do I choose the number of hidden layers in a neural network?\n2. How many nodes should I use in each hidden layer?\n3. When should I use pruning to optimize network configuration?\n4. What is the relationship between the number of nodes in the input, hidden, and output layers?\n5. How can I determine the optimal network size for my problem?'

In [383]:
# Sample reply in the single-line Python-list format ("['...', ...]").
# The trailing [2] is an index-label lookup on the filtered Series, not a position.
llm_question_sample[llm_question_sample['questions'].str.startswith("['")]['questions'][2]

"['What are keys, queries, and values in attention mechanisms?', 'How does attention retrieve candidate matches?', 'How did Bahdanau et al. (2015) calculate alpha?', 'How did Vaswani et al. (2017) improve the efficiency of alpha calculation?', 'What are the different sources of queries, keys, and values in attention mechanisms?']"

In [385]:
# Sample reply in the dash-bullet format ('- ...').
# The trailing [85] is an index-label lookup on the filtered Series, not a position.
llm_question_sample[llm_question_sample['questions'].str.startswith('- ')]['questions'][85]

'- What is the purpose of skip connections in ResNet?\n- How do skip connections help with the vanishing/exploding gradient problem?\n- What happens to gradients during backpropagation through skip connections?\n- How does the highway connection in ResNet enhance learning?\n- What are the advantages of using skip connections over traditional feedforward networks?'

In [387]:
# Sample reply in the multi-line Python-list format ('[\n ...').
# The trailing [158] is an index-label lookup on the filtered Series, not a position.
llm_question_sample[llm_question_sample['questions'].str.startswith('[\n')]['questions'][158]

"[\n 'What is permutation invariance?',\n 'How does permutation invariance affect neural networks?',\n 'Why is permutation invariance important for convolutional networks?',\n 'How does permutation invariance relate to image recognition tasks?',\n 'What are the benefits of using permutation invariant models?'\n]"

In [390]:
# Sample reply in the star-bullet format ('* ...').
# The trailing [335] is an index-label lookup on the filtered Series, not a position.
llm_question_sample[llm_question_sample['questions'].str.startswith('* ')]['questions'][335]

'* What is the difference between pooling and subsampling?\n* How do pooling layers in CNNs perform subsampling?\n* What is the benefit of using pooling layers for subsampling?\n* Why are pooling operations essential in CNNs?\n* How do pooling operations preserve important characteristics in images?'

### Function that recognises each output pattern and converts the reply into a list

In [391]:
def _parse_list_literal(text):
    """Parse a reply that looks like a Python list literal into a real list."""
    try:
        # Well-formed replies need no repair at all.
        return ast.literal_eval(text)
    except (SyntaxError, ValueError):
        pass
    # An apostrophe is structural only when it opens an item (right after '[', ','
    # or whitespace) or closes one (right before ',' or ']'). Escape every other
    # apostrophe so possessives and contractions such as "What's" survive intact.
    escaped = re.sub(r"(?<![\[,\s])'(?!\s*[,\]])", r"\\'", text)
    try:
        return ast.literal_eval(escaped)
    except (SyntaxError, ValueError):
        # Last resort: lossy clean-up that deletes in-word apostrophes (together
        # with the character before them) so that the literal becomes parseable.
        return ast.literal_eval(re.sub(r"(?<=\w).?'\s?(?=\w)", '', text))


def clean_text(text):
    """Convert one raw model reply into a list of question strings.

    Recognised formats, chosen by the first characters of the reply:
      * numbered list      ('1. ...', one question per line)
      * Python list literal ("['...', ...]", '["...", ...]' or '[\\n ...')
      * bullet list        ('- ...' or '* ...', one question per line)

    Blank lines in the line-based formats are ignored.

    Args:
        text: Raw reply; it is converted with ``str`` first.

    Returns:
        A list of questions for a recognised format, otherwise the reply
        itself as a string.

    Raises:
        SyntaxError or ValueError: if a list-literal reply cannot be parsed
            even after the apostrophe repairs.
    """
    text = str(text)  # Ensure text is a string
    if text.startswith('1.'):
        # Remove the leading "<number>." from each non-blank line.
        return [
            re.sub(r'^\d+\.\s*', '', line.strip())
            for line in text.split('\n')
            if line.strip()
        ]
    if text.startswith(("['", '["', '[\n')):
        return _parse_list_literal(text)
    if text.startswith(('- ', '* ')):
        # lstrip treats its argument as a set of characters: it removes any
        # leading run of the bullet character and spaces.
        marker = text[:2]
        return [
            line.lstrip(marker).strip()
            for line in text.split('\n')
            if line.strip()
        ]
    # If none of the conditions match, return as is
    return text

In [392]:
# Parse every raw reply with clean_text (overwrites the 'questions' column in place).
llm_question_sample['questions'] = llm_question_sample['questions'].apply(clean_text)

### Check the distinct list lengths (every record should have five questions)

In [393]:
# Distinct number of parsed questions per document.
llm_question_sample['questions'].map(len).unique()

array([5])

###  Convert dataframe to dictionary

In [408]:
# Rebinds `results` to a list with one {'id', 'questions'} dict per row of the frame.
results = llm_question_sample.to_dict('records')

### Create dictionary with id as key and questions as value

In [422]:
# Map each document id to its parsed list of questions.
parsed_results = {row['id']: row['questions'] for row in results}

### Save cleaned data

In [426]:
# Persist the id -> parsed questions mapping as a pickle file.
cleaned_path = 'gemini_llm_generated_questions_cleaned.pkl'
with open(cleaned_path, 'wb') as cleaned_file:
    pickle.dump(parsed_results, cleaned_file)

### Index the source records by id

In [418]:
# Lookup table from document id to its full source record.
doc_index = {}
for record in data_dict:
    doc_index[record['id']] = record

### Create a list of (question, tags, document id) tuples, one per generated question

In [420]:
# Flatten to one (question, tags, document id) tuple per generated question;
# the tags are taken from the source record of the same id.
final_results = [
    (question, doc_index[doc_id]['tags'], doc_id)
    for doc_id, questions in parsed_results.items()
    for question in questions
]

### Create dataframe

In [427]:
# Ground-truth table: each generated question with its tags and source document id.
ground_truth_columns = ['question', 'tags', 'document']
ground_truth_data = pd.DataFrame(data=final_results, columns=ground_truth_columns)

In [428]:
ground_truth_data.shape

(2500, 3)

In [429]:
# Write the ground-truth table to CSV without the positional index column.
ground_truth_data.to_csv(path_or_buf='ground-truth-data.csv', index=False)

In [431]:
ground_truth_data.head()

Unnamed: 0,question,tags,document
0,How do I choose the number of hidden layers in...,model-selection|neural-networks,f55240b8
1,How many nodes should I use in each hidden layer?,model-selection|neural-networks,f55240b8
2,When should I use pruning to optimize network ...,model-selection|neural-networks,f55240b8
3,What is the relationship between the number of...,model-selection|neural-networks,f55240b8
4,How can I determine the optimal network size f...,model-selection|neural-networks,f55240b8
