In [36]:
# Import general packages
import io
import json
import os
import re

import pandas as pd

# Point the Google client libraries at the service-account key file.
# A `!export VAR=...` line runs in a throw-away subshell, so the variable never
# reaches the kernel process; writing to os.environ sets it for this process.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'book-to-quiz-7558e7ee5aca.json'

# Vertex AI location, GCP project id and the GCS bucket that stores quiz text
LOCATION = "us-central1"
PROJECT = 'book-to-quiz'
BUCKET = 'book-to-quiz-question-bank'

# Generative model name passed to GenerativeModel
MODEL = "gemini-1.5-flash-001"

In [3]:
# Imports the Google Cloud client library
from google.cloud import storage

def gcs_read(bucket_name, blob_name):
    """Return the content of a GCS blob as a list of lines.

    Args:
        bucket_name: Name of the bucket that holds the blob.
        blob_name: Name of the blob inside that bucket.

    Returns:
        The result of `readlines()` on the blob opened in "r" mode.
    """
    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    with blob.open("r") as reader:
        content_lines = reader.readlines()
    return content_lines
    
def gcs_write(bucket_name, blob_name, content):
    """Write `content` to a GCS blob using file-like IO.

    Args:
        bucket_name: Name of the destination bucket.
        blob_name: Name of the blob to write inside that bucket.
        content: Data handed to `write()` on the blob opened in "w" mode.
    """
    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    with blob.open("w") as writer:
        writer.write(content)

In [4]:
# Import Vertex AI packages
import vertexai
from vertexai.generative_models import GenerativeModel
import vertexai.preview.generative_models as generative_models

def generate(p_text, g_config, s_settings):
    """Stream the model's answer to a prompt, yielding one text piece at a time.

    This is a generator function: Vertex AI is initialised and the request is
    made only once the returned generator is first iterated.

    Args:
        p_text: Prompt text sent to the model.
        g_config: Generation config forwarded as `generation_config`.
        s_settings: Safety settings forwarded as `safety_settings`.

    Yields:
        The `.text` of each streamed response.
    """
    vertexai.init(project=PROJECT, location=LOCATION)
    stream = GenerativeModel(MODEL).generate_content(
        [p_text],
        generation_config=g_config,
        safety_settings=s_settings,
        stream=True,
    )
    for chunk in stream:
        yield chunk.text

In [5]:
# Quiz requirements sent to the model, one requirement per line.
# The "- Total questions: ", "- Topic: " and "- Difficulty: " lines are parsed
# again further down to build the output file name, so keep their format.
prompt = "\n".join([
    "Generate quiz with these requirements:",
    "- Total questions: 50.",
    "- Topic: AWS.",
    "- Difficulty: Hard.",
    "- Types: true/false, single correct answer, multiple correct answers.",
    "- Indent questions with number.",
    "- Indent choices and true/false with upper letter.",
    '- If more than 2 correct answers questions, remind  "(select [exact number] apply)" before choice A.',
    "- In each question, total correct answers is less than total choices.",
    "- Choices consist multiple technically complicated steps.",
    "- Show correct choices at the end of each question.",
    "- No markdown, plain text.",
    "- Group by type.",
])

# Sampling / length settings forwarded to `generate_content`
generation_config = dict(
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# Every listed harm category uses the same blocking threshold
safety_settings = {
    category: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    for category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

In [6]:
# Build the response generator; `generate` is a generator function, so its
# body does not run until the generator is iterated
generator = generate(
    p_text=prompt,
    g_config=generation_config,
    s_settings=safety_settings,
)

In [7]:
# Concatenate every streamed piece into one string.
# The generator is single-use: re-create it before re-running this cell,
# otherwise the join yields an empty string.
generate_text = ''.join(generator)

In [8]:
# Use print() for a readable preview: `generate_text` is one whole string, not a list of lines
print(generate_text)

## True/False

1. A Lambda function can be triggered by an S3 bucket notification event, but only if the bucket is configured to allow public access.
    A. True
    B. False
    **Correct Answer: B. False**

2. Amazon EC2 instances can be launched with a pre-configured security group that allows all incoming traffic.
    A. True
    B. False
    **Correct Answer: B. False**

3. Amazon RDS for PostgreSQL supports both read replicas and multi-AZ deployments for high availability.
    A. True
    B. False
    **Correct Answer: A. True**

4. AWS Lambda functions can be directly invoked using the AWS CLI, but not through the AWS Management Console.
    A. True
    B. False
    **Correct Answer: B. False**

5. Amazon S3 offers server-side encryption using only AWS KMS.
    A. True
    B. False
    **Correct Answer: B. False**

6. Amazon ElastiCache can be used as a caching layer for both Amazon DynamoDB and Amazon RDS.
    A. True
    B. False
    **Correct Answer: A. True**

7. An Amazon S

In [13]:
def _prompt_field(prompt_lines, label):
    """Return the value that follows `label` on the first line containing it.

    Args:
        prompt_lines: Iterable of prompt lines (trailing newlines allowed).
        label: Literal prefix such as '- Topic: '.

    Returns:
        The text after `label`, with trailing newline and '.' characters removed.

    Raises:
        ValueError: If no line contains `label`.
    """
    for prompt_line in prompt_lines:
        if label in prompt_line:
            # Split on the label itself instead of a hard-coded slice offset
            return prompt_line.split(label, 1)[1].rstrip('\n.')
    raise ValueError(f'Prompt has no line containing {label!r}')


# Read the prompt (not the generated text) as a list of lines
lines = io.StringIO(prompt).readlines()

# Topic: lower-cased, spaces replaced by underscores
topic = _prompt_field(lines, '- Topic: ').lower().replace(' ', '_')
print('Topic:', topic)

# Difficulty: lower-cased
difficulty = _prompt_field(lines, '- Difficulty: ').lower()
print('Difficulty:', difficulty)

# Size: number of questions, kept as text
size = _prompt_field(lines, '- Total questions: ')
print('Size:', size)

# Combine topic, difficulty and size into .txt file name
file_name = f'{topic}_{difficulty}_{size}.txt'
print('File name:', file_name)

Topic: aws
Difficulty: hard
Size: 50
File name: aws_hard_50.txt


In [136]:
# Store the generated quiz text in the bucket under the derived file name
WRITE_BLOB = file_name
gcs_write(bucket_name=BUCKET, blob_name=WRITE_BLOB, content=generate_text)

In [16]:
# Read the stored quiz text back from the bucket as a list of lines
READ_BLOB = file_name
data = gcs_read(bucket_name=BUCKET, blob_name=READ_BLOB)

# display() suits `data` because it is already a list of lines
display(data)

['## True/False Questions\n',
 '\n',
 '1. You can use AWS Lambda to create a serverless application that runs on a specific EC2 instance.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct Answer: B. False**\n',
 '\n',
 '2. AWS CloudFormation allows you to define and manage your AWS resources using a declarative language, making infrastructure as code possible.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct Answer: A. True**\n',
 '\n',
 '3. AWS S3 offers a built-in feature called "server-side encryption" that encrypts data at rest.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct Answer: A. True**\n',
 '\n',
 '4. AWS IAM roles can be used to grant temporary access to AWS resources without having to manage user credentials.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct Answer: A. True**\n',
 '\n',
 '5. AWS Lambda functions can directly access data in an Amazon RDS database without any configuration or security considerations.\n',
 '    A. True\n',
 '  

In [22]:
def parse_quiz_lines(lines):
    """Group raw quiz lines into parallel lists of question and answer text.

    A question starts on a line whose first word begins with a digit and ends
    with '.', and runs until a line whose first word is 'A.'. That line starts
    the answer text, which runs until the next question starts.

    Blank or whitespace-only lines and '##' headings are skipped, and '**'
    markers are removed. Text before the first question is ignored. A trailing
    question that never reaches its 'A.' line (e.g. truncated output) is
    dropped so both lists keep the same length.

    Args:
        lines: Iterable of text lines, each usually ending with a newline.

    Returns:
        dict with keys 'question' and 'answer', each a list of strings.
    """
    parsed = {'question': [], 'answer': []}

    # True while the lines being read still belong to a question
    in_question = False

    for raw_line in lines:
        # Strip '**' style around 'Correct answer(s)'
        line = raw_line.replace('**', '')
        stripped = line.strip()

        # Skip blank / whitespace-only lines and headings
        if not stripped or stripped.startswith('##'):
            continue

        # First word of the line
        head = stripped.split()[0]

        if not in_question and head[0].isnumeric() and head[-1] == '.':
            # New question
            parsed['question'].append(line)
            in_question = True
        elif in_question and head == 'A.':
            # First choice: the question is complete, the answer text starts
            parsed['answer'].append(line)
            in_question = False
        elif in_question:
            # Continuation of the unfinished question
            parsed['question'][-1] += line
        elif parsed['answer']:
            # Continuation of the unfinished answer
            parsed['answer'][-1] += line
        # Otherwise: text before the first question, nothing to attach it to

    if in_question:
        # Last question has no choices; keep the two lists aligned
        parsed['question'].pop()

    return parsed


dict_data = parse_quiz_lines(data)

# Convert `dict_data` to a DataFrame
df0 = pd.DataFrame(dict_data)

print('Number of questions:', len(df0))
print('Columns:', df0.columns.values)

Number of questions: 50
Columns: ['question' 'answer']


In [23]:
# View rows from position 15 onward; the slice end (60) may exceed the frame length,
# in which case iloc simply stops at the last row
df0.iloc[15:60]

Unnamed: 0,question,answer
15,16. AWS CodeDeploy automates application deplo...,A. True\n B. False\n Correct Answer:...
16,17. AWS Organizations allows you to centrally ...,A. True\n B. False\n Correct Answer:...
17,18. AWS CloudTrail records API calls made to A...,A. True\n B. False\n Correct Answer:...
18,19. AWS WAF provides a web application firewal...,A. True\n B. False\n Correct Answer:...
19,20. AWS Shield is a managed DDoS protection se...,A. True\n B. False\n Correct Answer:...
20,21. Which of the following AWS services is use...,A. AWS Route 53\n B. AWS VPC\n C. AW...
21,22. Which of the following AWS services is a f...,A. Amazon Redshift\n B. Amazon DynamoDB...
22,23. Which of the following AWS services is use...,A. AWS Auto Scaling\n B. AWS Elastic Be...
23,24. Which of the following AWS services is use...,A. AWS CloudWatch\n B. AWS CloudTrail\n...
24,25. Which of the following AWS services is use...,A. AWS VPC\n B. AWS Direct Connect\n ...


In [24]:
# Work on an independent copy so `df0` keeps the raw parsed text
df1 = df0.copy(deep=True)

In [25]:
# Remove the leading "<number>. " prefix from every question
df1['question'] = df1['question'].str.replace(
    pat=r'^\d{0,4}\.[ ]',
    repl='',
    regex=True,
)

In [26]:
# Preview the questions after the leading numbers were stripped
df1

Unnamed: 0,question,answer
0,You can use AWS Lambda to create a serverless ...,A. True\n B. False\n Correct Answer:...
1,AWS CloudFormation allows you to define and ma...,A. True\n B. False\n Correct Answer:...
2,"AWS S3 offers a built-in feature called ""serve...",A. True\n B. False\n Correct Answer:...
3,AWS IAM roles can be used to grant temporary a...,A. True\n B. False\n Correct Answer:...
4,AWS Lambda functions can directly access data ...,A. True\n B. False\n Correct Answer:...
5,AWS EBS volumes can be attached to multiple EC...,A. True\n B. False\n Correct Answer:...
6,AWS Auto Scaling can automatically adjust the ...,A. True\n B. False\n Correct Answer:...
7,AWS DynamoDB is a fully managed NoSQL database...,A. True\n B. False\n Correct Answer:...
8,AWS Kinesis is a fully managed streaming servi...,A. True\n B. False\n Correct Answer:...
9,AWS Elastic Beanstalk supports deploying and m...,A. True\n B. False\n Correct Answer:...


In [27]:
# Independent copy of `df1` that will receive the split choice columns
df_2 = df1.copy(deep=True)

In [29]:
def convert_index(capital):
    """Return the zero-based index of an uppercase choice letter ('A' -> 0, 'B' -> 1, ...)."""
    return ord(capital) - ord('A')


def choices_split(answer):
    """
    Split the `answer` text of one question into incorrect and correct choices.

    `answer` is expected to hold one choice per line, each introduced by an
    uppercase letter and '. ' ("A. ...", "B. ..."), followed by
    "Correct Answer: " or "Correct Answers: " and a comma-separated list of
    the correct letters (optionally followed by the choice text).

    Args:
        answer: Choices plus correct-answer note as a single string.

    Returns:
        dict with keys 'incorrect' and 'correct', each a list of choice texts
        in their original order.

    Raises:
        ValueError: If `answer` has no 'Correct Answer(s): ' marker.
    """
    # Index 0 is all choices, index 1 is the correct-answer note
    split_all = re.split(r'Correct Answer[s]*: ', answer)
    if len(split_all) < 2:
        raise ValueError(f"No 'Correct Answer(s): ' marker found in: {answer!r}")
    choices_text, correct_text = split_all[0], split_all[1]

    # A piece counts as a choice letter only when it starts with a capital that
    # is not the first letter of a longer word. This skips empty pieces and
    # words produced by commas inside the choice text (e.g. ", DynamoDB").
    correct_index = set()
    for item in correct_text.split(','):
        letter = re.match(r'[A-Z](?![A-Za-z0-9])', item.strip())
        if letter:
            correct_index.add(convert_index(letter.group()))

    # Split only on markers at the start of a line, so a capital letter followed
    # by '. ' inside a choice ("Use AWS IAM. Then ...") does not start a new one.
    # Index 0 is the text before choice A, so pass.
    choices = re.split(r'^\s*[A-Z]\.[ ]', choices_text, flags=re.MULTILINE)[1:]
    # Strip right side of choice text
    choices = [choice.rstrip() for choice in choices]

    incorrect = [choice for index, choice in enumerate(choices) if index not in correct_index]
    correct = [choice for index, choice in enumerate(choices) if index in correct_index]

    return {'incorrect': incorrect,
            'correct': correct}

In [30]:
# Try the function on two rows (positions 20 and 21) before using it on everything
df_test = df_2['answer'].iloc[20:22].apply(choices_split)

In [31]:
# Show the incorrect choices first, then the correct ones as the cell result
incorrect_preview = df_test.str['incorrect']
correct_preview = df_test.str['correct']
display(incorrect_preview)
correct_preview

20         [AWS Route 53, AWS VPC, AWS Direct Connect]
21    [Amazon DynamoDB, Amazon S3, Amazon ElastiCache]
Name: answer, dtype: object

20    [AWS NAT Gateway]
21    [Amazon Redshift]
Name: answer, dtype: object

In [32]:
# Apply the function to the full dataset once and reuse the result for both
# columns instead of parsing every answer twice
split_choices = df_2['answer'].apply(choices_split)
df_2['incorrect'] = split_choices.str['incorrect']
df_2['correct'] = split_choices.str['correct']

In [33]:
# Preview both choice lists of the row labelled 40
display(
    df_2.loc[40, 'incorrect'],
    df_2.loc[40, 'correct'],
)

['AWS IAM', 'AWS CloudTrail']

['AWS Security Groups', 'AWS Network Load Balancer']

In [34]:
# Columns kept in the final table, in display order
new_order = ['question', 'incorrect', 'correct']

# Make a copy of `df_2` restricted to those columns
df_clean = df_2.loc[:, new_order].copy()

# Preview the row labelled 45: both choice lists, then the whole row
display(df_clean.loc[45, 'incorrect'])
display(df_clean.loc[45, 'correct'])
df_clean.iloc[45]

['AWS Trusted Advisor']

['AWS IAM', 'AWS CloudTrail', 'AWS Config']

question     (Select 3 answers) Which three of the followin...
incorrect                                [AWS Trusted Advisor]
correct                  [AWS IAM, AWS CloudTrail, AWS Config]
Name: 45, dtype: object

In [35]:
# Preview the whole final table (columns: question, incorrect, correct)
df_clean

Unnamed: 0,question,incorrect,correct
0,You can use AWS Lambda to create a serverless ...,[True],[False]
1,AWS CloudFormation allows you to define and ma...,[False],[True]
2,"AWS S3 offers a built-in feature called ""serve...",[False],[True]
3,AWS IAM roles can be used to grant temporary a...,[False],[True]
4,AWS Lambda functions can directly access data ...,[True],[False]
5,AWS EBS volumes can be attached to multiple EC...,[True],[False]
6,AWS Auto Scaling can automatically adjust the ...,[False],[True]
7,AWS DynamoDB is a fully managed NoSQL database...,[False],[True]
8,AWS Kinesis is a fully managed streaming servi...,[False],[True]
9,AWS Elastic Beanstalk supports deploying and m...,[False],[True]


In [37]:
# Swap the trailing '.txt' of the blob name for '.json' to get the export name
export_file = f"{READ_BLOB[:-len('.txt')]}.json"

# Write `df_clean` to that JSON file
df_clean.to_json(export_file)

In [38]:
# Load the exported JSON file back into a DataFrame to check the round trip
df_json = pd.read_json(path_or_buf=export_file)

In [39]:
# Preview the DataFrame that was read back from the exported JSON file
df_json

Unnamed: 0,question,incorrect,correct
0,You can use AWS Lambda to create a serverless ...,[True],[False]
1,AWS CloudFormation allows you to define and ma...,[False],[True]
2,"AWS S3 offers a built-in feature called ""serve...",[False],[True]
3,AWS IAM roles can be used to grant temporary a...,[False],[True]
4,AWS Lambda functions can directly access data ...,[True],[False]
5,AWS EBS volumes can be attached to multiple EC...,[True],[False]
6,AWS Auto Scaling can automatically adjust the ...,[False],[True]
7,AWS DynamoDB is a fully managed NoSQL database...,[False],[True]
8,AWS Kinesis is a fully managed streaming servi...,[False],[True]
9,AWS Elastic Beanstalk supports deploying and m...,[False],[True]


In [40]:
# Load json file in conventional way; the context manager closes the file
# handle, and the explicit encoding avoids the platform-dependent default.
# NOTE: this rebinds `data`, which earlier held the list of quiz lines.
with open(export_file, encoding='utf-8') as f:
    data = json.load(f)

# Preview one question: outer key is the column name, inner key the row label as a string
data['question']['15']

'AWS CodeDeploy automates application deployments to EC2 instances, making it easier to deploy and update your applications.\n'