In [55]:
# Import general packages
import io
import json
import os
import re

import pandas as pd

# Point the Google client libraries at the service-account key file.
# This must be set on the kernel process itself: a `!export ...` shell escape
# only changes a short-lived subshell, so the Python clients never see it.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'book-to-quiz-7558e7ee5aca.json'

# Vertex AI location and project, GCS bucket for the question bank,
# and the generative model name used by generate()
LOCATION = "us-central1"
PROJECT = 'book-to-quiz'
BUCKET = 'book-to-quiz-question-bank'
MODEL = "gemini-1.5-flash-001"

In [56]:
# Imports the Google Cloud client library
from google.cloud import storage

def gcs_read(bucket_name, blob_name, j_load=False):
    """
    Fetch the contents of a GCS object through its file-like interface.

    By default the object is read as text and returned as a list of lines
    (as produced by readlines()). Pass `j_load=True` to parse the object
    with json.load() and return the decoded value instead.
    """
    source = storage.Client().bucket(bucket_name).blob(blob_name)
    with source.open("r") as handle:
        content = json.load(handle) if j_load else handle.readlines()
    return content
    
def gcs_write(bucket_name, blob_name, content):
    """Store `content` as a GCS object, writing through its file-like interface."""
    target = storage.Client().bucket(bucket_name).blob(blob_name)
    with target.open("w") as sink:
        sink.write(content)

In [57]:
# Import Vertex AI packages
import vertexai
from vertexai.generative_models import GenerativeModel
import vertexai.preview.generative_models as generative_models

def generate(p_text, g_config, s_settings):
    """
    Stream a model response, yielding the text of each chunk as it arrives.

    `p_text` is the prompt text, `g_config` the generation config and
    `s_settings` the safety settings; all three are forwarded to
    generate_content(). Being a generator, nothing is sent until the first
    item is requested.
    """
    vertexai.init(project=PROJECT, location=LOCATION)
    chunks = GenerativeModel(MODEL).generate_content(
        [p_text],
        generation_config=g_config,
        safety_settings=s_settings,
        stream=True,
    )
    yield from (chunk.text for chunk in chunks)

In [58]:
prompt = """Generate quiz with these requirements:
- Total questions: 50.
- Topic: AWS.
- Difficulty: Hard.
- Types: true/false, single correct answer, multiple correct answers.
- Indent questions with number.
- Indent choices and true/false with upper letter.
- If more than 2 correct answers questions, remind  \"(select [exact number] apply)\" before choice A.
- In each question, total correct answers is less than total choices.
- Choices consist multiple technically complicated steps.
- Show correct choices at the end of each question.
- No markdown, plain text.
- Group by type."""

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

# Apply the same blocking threshold to each of the four harm categories
safety_settings = {
    category: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    for category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

In [59]:
# Create the response generator; no request is made until it is iterated
generator = generate(
    p_text=prompt,
    g_config=generation_config,
    s_settings=safety_settings,
)

In [60]:
# Drain the generator and concatenate every streamed chunk into one string
generate_text = ''.join(generator)

In [61]:
# Use print() for a readable preview: generate_text is one whole string, not a list of lines
print(generate_text)

## True/False

1. You can use Amazon S3 to store data for a relational database like MySQL.
A. True
B. False
**Correct: B. False**

2. Amazon EC2 instances can be launched in multiple Availability Zones within a single AWS Region.
A. True
B. False
**Correct: A. True**

3. Amazon CloudFront can be used to deliver content from Amazon S3 to users around the world with low latency.
A. True
B. False
**Correct: A. True**

4. Amazon Elastic Beanstalk supports deploying applications built using .NET, PHP, Java, Python, Ruby, Node.js, and Go.
A. True
B. False
**Correct: A. True**

5. Amazon DynamoDB is a fully managed NoSQL database service that supports both document and key-value data models.
A. True
B. False
**Correct: A. True**

6. Amazon SQS is a message queuing service that can be used to decouple applications and ensure that messages are delivered reliably.
A. True
B. False
**Correct: A. True**

7. AWS Lambda is a serverless compute service that allows you to run code without provisionin

In [62]:
def _prompt_field(prompt_lines, label):
    """
    Return the text that follows `label` on the first line containing it,
    with any trailing '.' and newline characters removed.

    Raises ValueError if no line contains `label`.
    """
    for line in prompt_lines:
        if label in line:
            # Split on the label itself so the offset can never drift from the label text
            return line.split(label, 1)[1].rstrip('\n.')
    raise ValueError(f'Prompt has no line containing {label!r}')


# Read the prompt (not the generated text) as lines
lines = io.StringIO(prompt).readlines()

# Extract topic: lower-cased, spaces replaced with underscores
topic = _prompt_field(lines, '- Topic: ').lower().replace(' ', '_')
print('Topic:', topic)

# Extract difficulty
difficulty = _prompt_field(lines, '- Difficulty: ').lower()
print('Difficulty:', difficulty)

# Extract size
size = _prompt_field(lines, '- Total questions: ')
print('Size:', size)

# Combine topic, difficulty and size into .txt file name
file_name = f'{topic}_{difficulty}_{size}.txt'
print('File name:', file_name)

Topic: aws
Difficulty: hard
Size: 50
File name: aws_hard_50.txt


In [63]:
# Name of the GCS object that will hold the raw generated text
WRITE_BLOB = file_name

# Upload the generated quiz text to the bucket under that name
gcs_write(BUCKET, WRITE_BLOB, generate_text)

In [64]:
# Name of the GCS object to read back (the same object that was just written)
READ_BLOB = file_name

# Read the object as a list of lines (default mode of gcs_read)
data = gcs_read(BUCKET, READ_BLOB)

# Use display() because data is already a list of lines
display(data)

['## True/False\n',
 '\n',
 '1. You can use Amazon S3 to store data for a relational database like MySQL.\n',
 'A. True\n',
 'B. False\n',
 '**Correct: B. False**\n',
 '\n',
 '2. Amazon EC2 instances can be launched in multiple Availability Zones within a single AWS Region.\n',
 'A. True\n',
 'B. False\n',
 '**Correct: A. True**\n',
 '\n',
 '3. Amazon CloudFront can be used to deliver content from Amazon S3 to users around the world with low latency.\n',
 'A. True\n',
 'B. False\n',
 '**Correct: A. True**\n',
 '\n',
 '4. Amazon Elastic Beanstalk supports deploying applications built using .NET, PHP, Java, Python, Ruby, Node.js, and Go.\n',
 'A. True\n',
 'B. False\n',
 '**Correct: A. True**\n',
 '\n',
 '5. Amazon DynamoDB is a fully managed NoSQL database service that supports both document and key-value data models.\n',
 'A. True\n',
 'B. False\n',
 '**Correct: A. True**\n',
 '\n',
 '6. Amazon SQS is a message queuing service that can be used to decouple applications and ensure that m

In [65]:
def split_qa(lines_data):
    """
    Group raw quiz lines into parallel lists of questions and answer blocks.

    Accepts a list of lines.
    Returns a dictionary with keys `question` and `answer`; entry i of
    `answer` holds the choice lines and the 'Correct...' line that follow
    question i. Line breaks are kept and '**' markup is removed.

    A question starts on a line whose first word begins with a digit and ends
    with '.'; its answer block starts on the next line whose first word is
    'A.'. Blank or whitespace-only lines, lines containing '##' and any text
    before the first question are ignored.
    """
    dict_data = {'question': [],
                 'answer': []}

    # Switch determines if the last line is in question section or not
    q_prev = False

    for line in lines_data:

        # Strip '**' style around 'Correct answer(s)' or 'Correct'
        line = line.replace('**', '')

        # Skip headings and lines with no words at all; checking the split
        # result also covers '\r\n' and whitespace-only lines, which would
        # otherwise fail on `line.split()[0]`
        words = line.split()
        if not words or '##' in line:
            continue

        # Call the first word of the line `head`
        head = words[0]

        # Start a new question when the previous line was not question text
        # and `head` looks like '12.'
        if (not q_prev) and head[0].isnumeric() and head[-1] == ".":
            dict_data['question'].append(line)
            q_prev = True

        # Start the answer block when the question is followed by choice 'A.'
        elif q_prev and head == 'A.':
            dict_data['answer'].append(line)
            q_prev = False

        # Add line to unfinished question
        elif q_prev:
            dict_data['question'][-1] += line

        # Add line to unfinished answer; if no answer exists yet the line is
        # preamble before the first question and is dropped
        elif dict_data['answer']:
            dict_data['answer'][-1] += line

    return dict_data

In [66]:
# Split the raw lines into question and answer lists
dict0 = split_qa(data)

# Build a DataFrame from the `dict0` dictionary
df0 = pd.DataFrame(data=dict0)

print(f'Number of questions: {len(df0)}')
print(f'Columns: {df0.columns.values}')

Number of questions: 50
Columns: ['question' 'answer']


In [67]:
# View rows from position 15 onward; if the frame has fewer than 60 rows the slice simply stops at the last one
df0.iloc[15:60]

Unnamed: 0,question,answer
15,16. Which of the following AWS services can be...,A. Amazon CloudWatch\nB. Amazon Route 53\nC. A...
16,17. Which of the following AWS services can be...,A. Amazon EC2\nB. Amazon VPC\nC. Amazon Route ...
17,18. Which of the following AWS services can be...,A. AWS CloudFormation\nB. Amazon EC2\nC. Amazo...
18,19. Which of the following AWS services is a f...,A. Amazon RDS\nB. Amazon DynamoDB\nC. Amazon R...
19,20. Which of the following AWS services can be...,A. Amazon EC2\nB. Amazon Route 53\nC. Amazon E...
20,21. (select 2 apply) Which of the following AW...,A. Amazon S3\nB. Amazon EC2\nC. Amazon DynamoD...
21,22. (select 3 apply) Which of the following AW...,A. Amazon EC2\nB. Amazon Elastic Beanstalk\nC....
22,23. (select 2 apply) Which of the following AW...,A. Amazon CloudWatch\nB. Amazon Route 53\nC. A...
23,24. (select 3 apply) Which of the following AW...,A. Amazon VPC\nB. Amazon Route 53\nC. Amazon C...
24,25. (select 2 apply) Which of the following AW...,A. AWS IAM\nB. Amazon S3\nC. AWS Security Hub\...


In [68]:
# Work on a copy so df0 keeps the raw split output
df1 = df0.copy()

In [69]:
# Remove the leading 'N. ' numbering, then trim trailing whitespace (including '\n') from each question
df1['question'] = (
    df1['question']
    .str.replace(r'^\d{0,4}\.[ ]', '', regex=True)
    .str.rstrip()
)

In [70]:
# Preview the questions after the numbering has been stripped
df1

Unnamed: 0,question,answer
0,You can use Amazon S3 to store data for a rela...,A. True\nB. False\nCorrect: B. False\n
1,Amazon EC2 instances can be launched in multip...,A. True\nB. False\nCorrect: A. True\n
2,Amazon CloudFront can be used to deliver conte...,A. True\nB. False\nCorrect: A. True\n
3,Amazon Elastic Beanstalk supports deploying ap...,A. True\nB. False\nCorrect: A. True\n
4,Amazon DynamoDB is a fully managed NoSQL datab...,A. True\nB. False\nCorrect: A. True\n
5,Amazon SQS is a message queuing service that c...,A. True\nB. False\nCorrect: A. True\n
6,AWS Lambda is a serverless compute service tha...,A. True\nB. False\nCorrect: A. True\n
7,Amazon CloudWatch can be used to monitor AWS r...,A. True\nB. False\nCorrect: A. True\n
8,Amazon VPC allows you to create a private netw...,A. True\nB. False\nCorrect: A. True\n
9,Amazon Route 53 is a DNS service that can be u...,A. True\nB. False\nCorrect: A. True\n


In [88]:
def convert_index(capital):
    """Map a capital letter to its zero-based position: 'A' -> 0, 'B' -> 1, and so on."""
    return ord(capital) - ord('A')

def split_choice(answer):
    """
    Split one `answer` block into its incorrect and correct choices.

    `answer` is the text that follows a question: lettered choices, one per
    line ('A. ...', 'B. ...'), then a 'Correct: ' or 'Correct Answer(s): '
    marker followed by the right letter(s) separated by commas.

    Returns a dictionary with keys `incorrect` and `correct`, each a list of
    choice texts without their letter prefix.
    Raises ValueError if `answer` contains no correct-answer marker.
    """
    # Index 0 is all choices, index 1 is the correct-answer text.
    # 'answer' is matched with either capitalisation of its first letter.
    split_all = re.split(r'Correct(?: [Aa]nswers?)?: ', answer)
    if len(split_all) < 2:
        raise ValueError(f'No correct-answer marker found in: {answer!r}')
    choices, correct_stack = split_all[0], split_all[1]

    # Take the leading capital of each comma-separated item, but only when it
    # stands alone ('A', 'A.', 'A)'). This skips empty items left by a trailing
    # comma and ordinary words that follow a comma inside a choice text, which
    # would otherwise be mistaken for a letter.
    correct_index = set()
    for item in correct_stack.split(','):
        letter = re.match(r'[A-Z]\b', item.strip())
        if letter:
            correct_index.add(convert_index(letter.group()))

    # Split on 'X. ' at the start of a line only, so a sentence break such as
    # 'KMS. Then' inside a choice does not cut it in two. Index 0 is the text
    # before choice A (normally ''), so drop it.
    choices = re.split(r'^[ \t]*[A-Z]\.[ ]', choices, flags=re.MULTILINE)[1:]
    # Strip right side of choice text
    choices = [choice.rstrip() for choice in choices]

    # Separate choices into lists as `incorrect` and `correct`
    incorrect = [choice for index, choice in enumerate(choices) if index not in correct_index]
    correct = [choice for index, choice in enumerate(choices) if index in correct_index]

    return {'incorrect': incorrect,
            'correct': correct}

In [72]:
# Work on a copy of `df1` so the choice columns are added to a separate frame
df_2 = df1.copy()

In [91]:
# Try split_choice on two rows (positions 20 and 21) before running it on the full data.
# The result is a Series holding one {'incorrect': ..., 'correct': ...} dict per row.
df_test = df_2.iloc[20:22]['answer'].apply(split_choice)

In [92]:
# Preview the results: .str[key] pulls one key out of each row's dict
display(df_test.str['incorrect'])
df_test.str['correct']

20    [Amazon EC2, Amazon Redshift]
21                      [Amazon S3]
Name: answer, dtype: object

20                         [Amazon S3, Amazon DynamoDB]
21    [Amazon EC2, Amazon Elastic Beanstalk, AWS Clo...
Name: answer, dtype: object

In [93]:
# Apply the function to the full dataset once, then pull each key of the
# resulting dicts into its own column (instead of parsing every row twice)
choice_split = df_2['answer'].apply(split_choice)
df_2['incorrect'] = choice_split.str['incorrect']
df_2['correct'] = choice_split.str['correct']

In [94]:
# Preview the split choices for the row labelled 40
display(df_2.loc[40, 'incorrect'])
display(df_2.loc[40, 'correct'])

['Amazon S3', 'Amazon EC2', 'Amazon Redshift']

['Amazon RDS']

In [95]:
# Columns to keep, in their final order
new_order = ['question', 'incorrect', 'correct']

# Make a copy of `df_2` restricted to those columns, in that order
df_clean = df_2.loc[:, new_order].copy()

# Preview the data in the row labelled 45
display(df_clean.loc[45, 'incorrect'])
display(df_clean.loc[45, 'correct'])
df_clean.iloc[45]

['Amazon Route 53', 'Amazon SQS', 'Amazon EC2']

['Amazon CloudWatch']

question     Which of the following AWS services can be use...
incorrect            [Amazon Route 53, Amazon SQS, Amazon EC2]
correct                                    [Amazon CloudWatch]
Name: 45, dtype: object

In [96]:
# Preview the whole final table: question text plus its lists of incorrect and correct choices
df_clean

Unnamed: 0,question,incorrect,correct
0,You can use Amazon S3 to store data for a rela...,[True],[False]
1,Amazon EC2 instances can be launched in multip...,[False],[True]
2,Amazon CloudFront can be used to deliver conte...,[False],[True]
3,Amazon Elastic Beanstalk supports deploying ap...,[False],[True]
4,Amazon DynamoDB is a fully managed NoSQL datab...,[False],[True]
5,Amazon SQS is a message queuing service that c...,[False],[True]
6,AWS Lambda is a serverless compute service tha...,[False],[True]
7,Amazon CloudWatch can be used to monitor AWS r...,[False],[True]
8,Amazon VPC allows you to create a private netw...,[False],[True]
9,Amazon Route 53 is a DNS service that can be u...,[False],[True]


In [97]:
# Drop the last four characters ('.txt') of the read blob name and append '.json'
JSON_BLOB = f'{READ_BLOB[:-4]}.json'

# Serialise `df_clean` to a JSON string
export_json_string = df_clean.to_json()

In [98]:
# Write the JSON string to the bucket as the object named by JSON_BLOB
gcs_write(BUCKET, JSON_BLOB, export_json_string)

In [99]:
# Read the just-exported JSON file back for testing purposes.
# With j_load=True, gcs_read returns the parsed JSON, not a string (despite the variable name)
read_json_string = gcs_read(BUCKET, JSON_BLOB, j_load=True)

# Convert to DataFrame
df_json = pd.DataFrame(read_json_string)

In [100]:
# Preview the data read back from the JSON file as a DataFrame
df_json

Unnamed: 0,question,incorrect,correct
0,You can use Amazon S3 to store data for a rela...,[True],[False]
1,Amazon EC2 instances can be launched in multip...,[False],[True]
2,Amazon CloudFront can be used to deliver conte...,[False],[True]
3,Amazon Elastic Beanstalk supports deploying ap...,[False],[True]
4,Amazon DynamoDB is a fully managed NoSQL datab...,[False],[True]
5,Amazon SQS is a message queuing service that c...,[False],[True]
6,AWS Lambda is a serverless compute service tha...,[False],[True]
7,Amazon CloudWatch can be used to monitor AWS r...,[False],[True]
8,Amazon VPC allows you to create a private netw...,[False],[True]
9,Amazon Route 53 is a DNS service that can be u...,[False],[True]


In [101]:
# Look up one question by row label to confirm the structure; the labels are strings in the parsed JSON
read_json_string['question']['15']

'Which of the following AWS services can be used to monitor AWS resources and applications, and trigger actions based on predefined thresholds?'