In [1]:
# Import general packages
import io
import json
import os
import re

import pandas as pd

# Point the Google client libraries at the service-account key file.
# This must be set on the kernel process itself: a `!export ...` shell escape
# only changes a short-lived subshell, so the clients created later in this
# notebook would never see the variable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'quiz-generate.json'

# Vertex AI region and GCP project used by vertexai.init()
LOCATION = "us-central1"
PROJECT = 'exam-practice-404408'
# GCS bucket that stores the generated quiz text and the cleaned JSON
BUCKET = 'exam-banks'
# Generative model used to produce the quiz
MODEL = "gemini-1.5-flash-001"

In [2]:
# Imports the Google Cloud client library
from google.cloud import storage

def gcs_read(bucket_name, blob_name, j_load=False):
    """
    Fetch the contents of a GCS object through its file-like reader.

    With the default `j_load=False` the object is returned as a list of
    text lines (`readlines()`).
    Pass `j_load=True` to parse the object with `json.load()` and return
    the decoded JSON instead.
    """
    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    with blob.open("r") as handle:
        return json.load(handle) if j_load else handle.readlines()
    
def gcs_write(bucket_name, blob_name, content):
    """Write `content` to a GCS object through its file-like writer."""
    target = storage.Client().bucket(bucket_name).blob(blob_name)
    with target.open("w") as handle:
        handle.write(content)

In [3]:
# Import Vertex AI packages
import vertexai
from vertexai.generative_models import GenerativeModel
import vertexai.preview.generative_models as generative_models

def generate(p_text, g_config, s_settings):
    """
    Generator that streams the model's reply and yields the text of each chunk.

    `p_text` is the prompt text, `g_config` is passed as the generation config
    and `s_settings` as the safety settings.
    """
    vertexai.init(project=PROJECT, location=LOCATION)
    stream = GenerativeModel(MODEL).generate_content(
        [p_text],
        generation_config=g_config,
        safety_settings=s_settings,
        stream=True,
    )
    yield from (chunk.text for chunk in stream)

In [4]:
# Quiz requirements sent to the model; the "- Topic", "- Difficulty" and
# "- Total questions" lines are also parsed later to build the output file name
prompt = """Generate quiz with these requirements:
- Total questions: 50.
- Topic: AWS.
- Difficulty: Hard.
- Types: true/false, single correct answer, multiple correct answers.
- Indent questions with number.
- Indent choices and true/false with upper letter.
- If more than 2 correct answers questions, remind  \"(select [exact number] apply)\" before choice A.
- In each question, total correct answers is less than total choices.
- Choices consist multiple technically complicated steps.
- Show correct choices at the end of each question.
- No markdown, plain text.
- Group by type."""

# Sampling parameters handed to the model as its generation config
generation_config = dict(
    max_output_tokens=8192,
    temperature=1,
    top_p=0.95,
)

# Every configured harm category uses the same blocking threshold
safety_settings = {
    category: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    for category in (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
}

In [5]:
# Create the response generator; nothing is requested until it is iterated
generator = generate(
    p_text=prompt,
    g_config=generation_config,
    s_settings=safety_settings,
)

In [6]:
# Drain the generator and concatenate every yielded chunk into one string
generate_text = ''.join(generator)

In [7]:
# generate_text is one long string rather than a list of lines,
# so print() gives a more readable preview than the raw repr
print(generate_text)

## True/False

1. A single AWS account can only have one IAM user.
    A. True
    B. False
    **Correct: B**

2. Amazon S3 buckets can be directly accessed from the internet without any configuration.
    A. True
    B. False
    **Correct: B**

3. AWS Lambda functions can only be triggered by HTTP requests.
    A. True
    B. False
    **Correct: B**

4. AWS CloudTrail can only record API calls made to AWS services.
    A. True
    B. False
    **Correct: B**

5. AWS CloudFormation can only be used to create new resources, not update or delete existing ones.
    A. True
    B. False
    **Correct: B**

6. Amazon DynamoDB supports both document and relational data models.
    A. True
    B. False
    **Correct: B**

7. AWS Elastic Beanstalk automatically scales your application based on traffic patterns.
    A. True
    B. False
    **Correct: A**

8. AWS Direct Connect provides a dedicated private connection to the AWS cloud.
    A. True
    B. False
    **Correct: A**

9. AWS Cloud

In [8]:
# Read the prompt (not the generated quiz) as lines: the output file name is
# derived from the requirements stated in the prompt
buf = io.StringIO(prompt)
lines = buf.readlines()


def _prompt_field(prompt_lines, label):
    """
    Return the value of the prompt requirement line `- <label>: <value>.`.

    The value is everything after the `- <label>: ` prefix with the trailing
    newline and full stop(s) removed. The prefix length is computed from the
    label, so there are no hand-counted slice offsets to keep in sync.

    Raises ValueError if no line starts with the prefix.
    """
    prefix = f'- {label}: '
    for prompt_line in prompt_lines:
        if prompt_line.startswith(prefix):
            return prompt_line[len(prefix):].rstrip('\n.')
    raise ValueError(f'Prompt has no line starting with {prefix!r}')


# Extract topic, lower-cased with spaces replaced so it is safe in a file name
topic = _prompt_field(lines, 'Topic').lower().replace(' ', '_')
print('Topic:', topic)

# Extract difficulty
difficulty = _prompt_field(lines, 'Difficulty').lower()
print('Difficulty:', difficulty)

# Extract size (kept as a string; it is only used to build the file name)
size = _prompt_field(lines, 'Total questions')
print('Size:', size)

# Combine topic, difficulty and size into .txt file name
file_name = f'{topic}_{difficulty}_{size}.txt'
print('File name:', file_name)

Topic: aws
Difficulty: hard
Size: 50
File name: aws_hard_50.txt


In [9]:
# Use file_name as the name of the GCS object to write
WRITE_BLOB = file_name

# Store the generated quiz text in the bucket
gcs_write(bucket_name=BUCKET, blob_name=WRITE_BLOB, content=generate_text)

In [10]:
# Use file_name as the name of the GCS object to read back
READ_BLOB = file_name

# Fetch the object as a list of lines
data = gcs_read(bucket_name=BUCKET, blob_name=READ_BLOB)

# data is already a list of lines, so display() shows it in a readable form
display(data)

['## True/False\n',
 '\n',
 '1. A single AWS account can only have one IAM user.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '2. Amazon S3 buckets can be directly accessed from the internet without any configuration.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '3. AWS Lambda functions can only be triggered by HTTP requests.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '4. AWS CloudTrail can only record API calls made to AWS services.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '5. AWS CloudFormation can only be used to create new resources, not update or delete existing ones.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '6. Amazon DynamoDB supports both document and relational data models.\n',
 '    A. True\n',
 '    B. False\n',
 '    **Correct: B**\n',
 '\n',
 '7. AWS Elastic Beanstalk automatically scales your application based on traffi

In [11]:
def split_qa(lines_data):
    """
    Group raw quiz lines into question texts and their answer blocks.

    Accepts an iterable of text lines.
    Returns a dictionary with keys `question` and `answer`; each value is a
    list of strings built by concatenating the raw lines (with '**' removed,
    original indentation and newlines kept) that belong to that question or
    answer block.

    Blank or whitespace-only lines and lines containing '##' are skipped.
    Text that appears before the first question is ignored.
    """
    dict_data = {'question': [],
                 'answer': []}

    # Switch determines if the last line is in question section or not
    q_prev = False

    for line in lines_data:

        # Strip '**' style around 'Correct answer(s)' or 'Correct'
        line = line.replace('**', '')

        # Skip blank lines and headings ('##'). Checking strip() rather than
        # comparing with '\n' also covers whitespace-only and empty lines,
        # which would otherwise make `line.split()[0]` raise IndexError.
        if not line.strip() or '##' in line:
            continue

        # Call the first word of the line `head`
        head = line.split()[0]

        # Start a new question if all of these hold:
        # - Previous line is not in question text
        # - First character of `head` is numeric
        # - Last character of `head` is '.'
        if (not q_prev) and head[0].isnumeric() and head[-1] == ".":
            dict_data['question'].append(line)
            q_prev = True

        # Start a new answer block if all of these hold:
        # - Previous line is in question text
        # - `head` is 'A.'
        elif q_prev and head == 'A.':
            dict_data['answer'].append(line)
            q_prev = False

        # Add line to unfinished question
        elif q_prev:
            dict_data['question'][-1] += line

        # Add line to unfinished answer
        elif dict_data['answer']:
            dict_data['answer'][-1] += line

        # Otherwise no question has started yet (e.g. an introductory sentence
        # from the model): there is nothing to attach the line to, so drop it

    return dict_data

In [12]:
# Split the raw lines into parallel lists of questions and answers
dict0 = split_qa(data)

# Build a two-column DataFrame from those lists
df0 = pd.DataFrame(data=dict0)

print('Number of questions:', df0.shape[0])
print('Columns:', df0.columns.to_numpy())

Number of questions: 50
Columns: ['question' 'answer']


In [13]:
# View rows from position 15 onward (the slice end, 60, may exceed the number of rows)
df0.iloc[15:60]

Unnamed: 0,question,answer
15,16. Which AWS service is primarily used for se...,A. AWS CloudTrail\n B. AWS IAM\n C. ...
16,17. Which AWS service allows you to create and...,A. AWS VPC\n B. Amazon Route 53\n C....
17,18. Which AWS service is primarily used for au...,A. AWS CloudFormation\n B. AWS CloudTra...
18,19. Which AWS service provides a managed servi...,A. AWS Lambda\n B. Amazon EC2\n C. A...
19,20. Which AWS service is primarily used for co...,A. Amazon S3\n B. AWS CloudTrail\n C...
20,21. Which of the following are valid methods f...,A. Using the AWS Management Console\n B...
21,22. Which of the following are benefits of usi...,A. Serverless architecture\n B. Pay-per...
22,23. Which of the following are components of a...,A. Subnets\n B. Route tables\n C. Se...
23,24. Which of the following are valid ways to c...,A. Using IAM policies\n B. Using bucket...
24,25. Which of the following are ways to monitor...,A. Creating custom metrics\n B. Using p...


In [14]:
# Work on an independent copy so df0 keeps the raw split result
df1 = df0.copy(deep=True)

In [15]:
# Remove the leading "<number>. " prefix, then the trailing newline/whitespace, from each question
question_text = df1['question'].str.replace(r'^\d{0,4}\.[ ]', '', regex=True)
df1['question'] = question_text.str.rstrip()

In [16]:
# Preview the questions after the number prefix has been stripped
df1

Unnamed: 0,question,answer
0,A single AWS account can only have one IAM user.,A. True\n B. False\n Correct: B\n
1,Amazon S3 buckets can be directly accessed fro...,A. True\n B. False\n Correct: B\n
2,AWS Lambda functions can only be triggered by ...,A. True\n B. False\n Correct: B\n
3,AWS CloudTrail can only record API calls made ...,A. True\n B. False\n Correct: B\n
4,AWS CloudFormation can only be used to create ...,A. True\n B. False\n Correct: B\n
5,Amazon DynamoDB supports both document and rel...,A. True\n B. False\n Correct: B\n
6,AWS Elastic Beanstalk automatically scales you...,A. True\n B. False\n Correct: A\n
7,AWS Direct Connect provides a dedicated privat...,A. True\n B. False\n Correct: A\n
8,AWS CloudWatch only monitors resources within ...,A. True\n B. False\n Correct: B\n
9,AWS Organizations allows you to manage multipl...,A. True\n B. False\n Correct: A\n


In [17]:
def convert_index(capital):
    """Return the zero-based position of a capital choice letter ('A' -> 0, 'B' -> 1, ...)."""
    return ord(capital) - ord('A')

def split_choice(answer):
    """
    Split one `answer` block into its incorrect and correct choice texts.

    `answer` is the text of all lettered choices ('A. ...', 'B. ...') followed
    by a 'Correct: ', 'Correct Answer: ' or 'Correct Answers: ' marker and a
    comma-separated list of the correct letters.

    Returns a dictionary with keys `incorrect` and `correct`, each a list of
    choice texts (label removed, right-stripped) in their original order.

    Raises IndexError if the answer block contains no 'Correct' marker.
    """
    # Use the 'Correct ...: ' marker to split the text.
    # Index 0 is all choices, index 1 is all correct letters
    if 'Correct Answer' in answer:
        split_all = re.split(r'Correct Answer[s]*: ', answer)
    else:
        split_all = re.split('Correct: ', answer)
    choices, correct_stack = split_all[0], split_all[1]

    # Each comma-separated item starts with the capital letter of a correct
    # choice. Empty items (e.g. after a trailing comma) are skipped instead of
    # failing on `''[0]`.
    correct_index = {
        convert_index(item.strip()[0])
        for item in correct_stack.split(',')
        if item.strip()
    }

    # Split on the 'X. ' choice labels; index 0 is the text before 'A. ', so drop it.
    # The lookbehind only accepts a label at the start of the text or after
    # whitespace, so a sentence inside a choice that ends in a capital letter
    # ('... AWS KMS. Then ...') is not mistaken for a new choice.
    choices = re.split(r'(?<!\S)[A-Z]\.[ ]', choices)[1:]
    # Strip right side of choice text
    choices = [choice.rstrip() for choice in choices]

    # Separate choices into lists as `incorrect` and `correct`
    incorrect = [choice for index, choice in enumerate(choices) if index not in correct_index]
    correct = [choice for index, choice in enumerate(choices) if index in correct_index]

    return {'incorrect': incorrect,
            'correct': correct}

In [18]:
# Work on an independent copy so `df1` keeps the unsplit answers
df2 = df1.copy(deep=True)

In [19]:
# Try the function on a two-row subset of the answers first
df_test = df2['answer'].iloc[20:22].apply(split_choice)

In [20]:
# Preview both halves of the test result: display() renders the incorrect choices,
# and the cell's last expression renders the correct choices
display(df_test.str['incorrect'])
df_test.str['correct']

20    [Using the AWS CLI, Using Terraform]
21                     [High availability]
Name: answer, dtype: object

20    [Using the AWS Management Console, Using the A...
21    [Serverless architecture, Pay-per-use pricing,...
Name: answer, dtype: object

In [21]:
# Apply the function to the full dataset.
# Parse every answer block once and reuse the result for both columns,
# rather than running split_choice over the whole column twice.
choice_split = df2['answer'].apply(split_choice)
df2['incorrect'] = choice_split.str['incorrect']
df2['correct'] = choice_split.str['correct']

In [22]:
# Preview the incorrect and correct choices of row 40
display(
    df2.loc[40, 'incorrect'],
    df2.loc[40, 'correct'],
)

['Amazon EC2', 'AWS Elastic Beanstalk', 'Amazon ECS']

['AWS Lambda']

In [23]:
# Columns to keep, in their final order
new_order = ['question', 'incorrect', 'correct']

# Copy only those columns of `df2` into the cleaned table
df_clean = df2.loc[:, new_order].copy()

# Preview the data of row 45
display(df_clean.loc[45, 'incorrect'])
display(df_clean.loc[45, 'correct'])
df_clean.iloc[45]

['AWS CloudTrail', 'AWS CloudFormation', 'Amazon S3']

['AWS IAM']

question     Which AWS service is primarily used for securi...
incorrect      [AWS CloudTrail, AWS CloudFormation, Amazon S3]
correct                                              [AWS IAM]
Name: 45, dtype: object

In [24]:
# Preview the whole final table (question, incorrect and correct columns)
df_clean

Unnamed: 0,question,incorrect,correct
0,A single AWS account can only have one IAM user.,[True],[False]
1,Amazon S3 buckets can be directly accessed fro...,[True],[False]
2,AWS Lambda functions can only be triggered by ...,[True],[False]
3,AWS CloudTrail can only record API calls made ...,[True],[False]
4,AWS CloudFormation can only be used to create ...,[True],[False]
5,Amazon DynamoDB supports both document and rel...,[True],[False]
6,AWS Elastic Beanstalk automatically scales you...,[False],[True]
7,AWS Direct Connect provides a dedicated privat...,[False],[True]
8,AWS CloudWatch only monitors resources within ...,[True],[False]
9,AWS Organizations allows you to manage multipl...,[False],[True]


In [25]:
# Build the JSON object name: drop the 4-character '.txt' suffix of the read blob name and append '.json'
JSON_BLOB = f"{READ_BLOB[:-len('.txt')]}.json"

# Serialise `df_clean` to a JSON string
export_json_string = df_clean.to_json()

In [26]:
# Store the JSON string in the bucket as a JSON file
gcs_write(bucket_name=BUCKET, blob_name=JSON_BLOB, content=export_json_string)

In [27]:
# Read the JSON file that was just exported, parsed with json.load(), to check the round trip
read_json_string = gcs_read(bucket_name=BUCKET, blob_name=JSON_BLOB, j_load=True)

# Rebuild a DataFrame from the parsed JSON
df_json = pd.DataFrame(data=read_json_string)

In [28]:
# Preview the DataFrame rebuilt from the JSON file that was just read back
df_json

Unnamed: 0,question,incorrect,correct
0,A single AWS account can only have one IAM user.,[True],[False]
1,Amazon S3 buckets can be directly accessed fro...,[True],[False]
2,AWS Lambda functions can only be triggered by ...,[True],[False]
3,AWS CloudTrail can only record API calls made ...,[True],[False]
4,AWS CloudFormation can only be used to create ...,[True],[False]
5,Amazon DynamoDB supports both document and rel...,[True],[False]
6,AWS Elastic Beanstalk automatically scales you...,[False],[True]
7,AWS Direct Connect provides a dedicated privat...,[False],[True]
8,AWS CloudWatch only monitors resources within ...,[True],[False]
9,AWS Organizations allows you to manage multipl...,[False],[True]


In [29]:
# Look up one question directly in the loaded JSON to confirm its layout:
# the column name comes first, then the row label as a string key ('15')
read_json_string['question']['15']

'Which AWS service is primarily used for securing access to your AWS resources?'