In [205]:
# Import packages
import os
import re

import pandas as pd

In [206]:
# Imports the Google Cloud client library
from google.cloud import storage

# Point the Google client library at the service-account key file.
# A `!export ...` shell line only sets the variable inside a throwaway
# subshell, so the kernel process (and therefore `storage.Client()`) never
# sees it; setting it on `os.environ` affects the kernel itself.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'book-to-quiz-7558e7ee5aca.json'

def gcs_read(bucket_name, blob_name):
    """
    Read a blob from GCS using file-like IO and return its lines.

    bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
    blob_name: ID of the GCS object inside that bucket.

    Returns the list produced by `readlines()` on the opened blob.
    """
    client = storage.Client()
    target = client.bucket(bucket_name).blob(blob_name)

    # "r" opens the blob in text mode; "rb" would give bytes instead.
    # See: https://docs.python.org/3/library/io.html
    with target.open("r") as handle:
        lines = handle.readlines()
    return lines

In [207]:
# Location of the question bank in GCS
BUCKET, BLOB = 'book-to-quiz-question-bank', 'aws_hard.txt'

# Download the blob as a list of lines
data = gcs_read(bucket_name=BUCKET, blob_name=BLOB)

# Show the first and the last line to confirm the content
print(data[0], '[...]\n', data[-1])

## True/False (17 questions)
 [...]
 **Correct Answer: C. Redshift Query Optimization, D. Redshift Data Sharing**


In [208]:
def parse_question_bank(lines):
    """
    Group the raw lines of a question-bank file into questions and answers.

    A question starts on a line whose first token begins with a digit and
    ends with '.' (for example '12.'), provided no question is currently
    open. An answer block starts on the first following line whose first
    token is exactly 'A.'. Every other line is appended to whichever block
    (question or answer) is currently open. Blank or whitespace-only lines
    and lines containing '##' (section titles) are ignored.

    lines: iterable of text lines, newline characters included.

    Returns a dict with two parallel lists under the keys 'question' and
    'answer'.

    Raises ValueError if a question is left without an answer block.
    """
    parsed = {'question': [],
              'answer': []}

    # True while the lines being read still belong to a question
    in_question = False

    for line in lines:
        # Skip blank lines (including whitespace-only ones, which have no
        # first token to inspect) and section titles
        if not line.strip() or '##' in line:
            continue

        # The first whitespace-delimited token decides what the line is
        head = line.split()[0]

        if (not in_question) and head[0].isnumeric() and head[-1] == ".":
            # A numbered line outside a question opens a new question
            parsed['question'].append(line)
            in_question = True
        elif in_question and head == 'A.':
            # The first choice closes the question and opens its answer block
            parsed['answer'].append(line)
            in_question = False
        elif in_question:
            # Continuation of the open question
            parsed['question'][-1] += line
        elif parsed['answer']:
            # Continuation of the open answer block
            parsed['answer'][-1] += line
        # Otherwise the line precedes the first question (preamble text);
        # there is no block to attach it to, so it is dropped.

    if len(parsed['question']) != len(parsed['answer']):
        raise ValueError(
            'Found {} questions but {} answer blocks; the last question has '
            "no 'A.' line.".format(len(parsed['question']), len(parsed['answer']))
        )
    return parsed


dict_data = parse_question_bank(data)

# Convert `dict_data` to a DataFrame
df0 = pd.DataFrame(dict_data)

print('Number of questions: ', len(df0))
print('Columns: ', df0.columns.values)

Number of questions:  50
Columns:  ['question' 'answer']


In [209]:
# View some indices (15 to 38). The question numbering restarts at 1 inside
# this slice, right after the 17 True/False questions.
df0.iloc[15:39]

Unnamed: 0,question,answer
15,16. You are using Amazon Kinesis Data Streams ...,A. True\nB. False\n**Correct Answer: A. True**\n
16,17. You are using Amazon SNS to send notificat...,A. True\nB. False\n**Correct Answer: A. True**\n
17,1. You are running a web application on Amazon...,A. Amazon S3\nB. Amazon Route 53\nC. Amazon Cl...
18,2. You are designing a database for a new appl...,A. Amazon RDS for MySQL\nB. Amazon DynamoDB\nC...
19,3. You are using Amazon CloudWatch to monitor ...,A. CPUUtilization\nB. NetworkIn\nC. DiskReadBy...
20,4. You are building a new application that req...,A. Amazon S3\nB. Amazon EBS\nC. Amazon KMS\nD....
21,5. You are using Amazon API Gateway to expose ...,A. Custom authorizers\nB. Usage plans\nC. CORS...
22,6. You are using Amazon Route 53 to manage the...,A. HTTP health check\nB. HTTPS health check\nC...
23,7. You are using Amazon SQS to implement a mes...,A. Message retention\nB. Dead letter queues\nC...
24,8. You are building a serverless application u...,"A. Attaching the ""AmazonDynamoDBFullAccess"" ma..."


In [210]:
# Make a copy of `df0` so the parsed frame stays untouched while the copy is cleaned
df1 = df0.copy()

In [211]:
# Strip the leading question number (e.g. "12. ") from each question.
# `\d+` requires at least one digit and puts no upper limit on how many,
# so a bare ". " prefix is left alone and long numbers are still removed.
df1['question'] = df1['question'].str.replace(r'^\d+\.[ ]', '', regex=True)

In [212]:
# Check that the question numbers are gone
df1

Unnamed: 0,question,answer
0,You are running a mission-critical application...,A. True\nB. False\n**Correct Answer: B. False**\n
1,You are setting up a new Amazon S3 bucket for ...,A. True\nB. False\n**Correct Answer: B. False**\n
2,"You are designing a highly scalable, low-laten...",A. True\nB. False\n**Correct Answer: A. True**\n
3,You are using Amazon CloudFront to distribute ...,A. True\nB. False\n**Correct Answer: B. False**\n
4,You are developing a serverless application th...,A. True\nB. False\n**Correct Answer: B. False**\n
5,You are using Amazon ECS to deploy a container...,A. True\nB. False\n**Correct Answer: A. True**\n
6,You are using Amazon API Gateway to expose a R...,A. True\nB. False\n**Correct Answer: A. True**\n
7,You are using Amazon Route 53 for DNS manageme...,A. True\nB. False\n**Correct Answer: A. True**\n
8,You are using Amazon CloudWatch to monitor you...,A. True\nB. False\n**Correct Answer: B. False**\n
9,You are using Amazon VPC to create a private n...,A. True\nB. False\n**Correct Answer: A. True**\n


In [213]:
# Make a copy of `df1`; the `incorrect` and `correct` columns are added to this copy
df_2 = df1.copy()

In [214]:
# Display the copy before the answer column is split
df_2

Unnamed: 0,question,answer
0,You are running a mission-critical application...,A. True\nB. False\n**Correct Answer: B. False**\n
1,You are setting up a new Amazon S3 bucket for ...,A. True\nB. False\n**Correct Answer: B. False**\n
2,"You are designing a highly scalable, low-laten...",A. True\nB. False\n**Correct Answer: A. True**\n
3,You are using Amazon CloudFront to distribute ...,A. True\nB. False\n**Correct Answer: B. False**\n
4,You are developing a serverless application th...,A. True\nB. False\n**Correct Answer: B. False**\n
5,You are using Amazon ECS to deploy a container...,A. True\nB. False\n**Correct Answer: A. True**\n
6,You are using Amazon API Gateway to expose a R...,A. True\nB. False\n**Correct Answer: A. True**\n
7,You are using Amazon Route 53 for DNS manageme...,A. True\nB. False\n**Correct Answer: A. True**\n
8,You are using Amazon CloudWatch to monitor you...,A. True\nB. False\n**Correct Answer: B. False**\n
9,You are using Amazon VPC to create a private n...,A. True\nB. False\n**Correct Answer: A. True**\n


In [215]:
# Look at one raw answer block (row label 30) to see the format to split
df_2.loc[30, 'answer']

'A. Kinesis Data Streams shards\nB. Kinesis Data Streams retention policy\nC. Kinesis Data Streams scaling policy\nD. Kinesis Data Streams consumer groups\n**Correct Answer: A. Kinesis Data Streams shards**\n'

In [216]:
def choices_split(answer):
    """
    Split one raw answer block into its incorrect and correct choices.

    `answer` is the text of a question's answer section: one lettered choice
    per line ('A. ...', 'B. ...'), followed by a bold marker of the form
    '**Correct Answer: A. ..., C. ...**'.

    Returns a dict with two lists of choice texts (letter labels removed):
    'incorrect' and 'correct'. 'incorrect' keeps the order in which the
    choices appear in `answer`, so the result is the same on every run;
    'correct' keeps the order of the 'Correct Answer' text.

    Raises ValueError when `answer` contains no '**' marker.
    """
    # Splitting on '**' gives: index 0 all choices, index 1 the correct
    # answers, index 2 the trailing '\n'
    sections = answer.split('**')
    if len(sections) < 2:
        raise ValueError(
            "No '**Correct Answer: ...**' marker found in: {!r}".format(answer)
        )

    choices_text = sections[0]
    correct_text = sections[1].replace('Correct Answer: ', '')

    # A choice label only counts at the start of a line; otherwise a choice
    # such as 'Use IAM. Then ...' would be cut at 'M. '
    choices = re.split(r'^[ \t]*[A-Z]\.[ ]', choices_text, flags=re.MULTILINE)[1:]
    choices = [choice.rstrip() for choice in choices]

    # In the correct-answer text the labels sit at the start or after a comma
    correct = re.split(r'(?:^\s*|,\s*)[A-Z]\.[ ]', correct_text)[1:]
    correct = [item.rstrip(', \t\n') for item in correct]

    # Filter in order (and drop duplicates) rather than using a set
    # difference, whose ordering varies between interpreter runs
    correct_lookup = set(correct)
    incorrect = [choice for choice in dict.fromkeys(choices)
                 if choice not in correct_lookup]

    return {'incorrect': incorrect,
            'correct': correct}

In [217]:
# Try the splitter on two rows (positions 40 and 41) before applying it everywhere
df_test = df_2['answer'].iloc[40:42].apply(choices_split)

In [218]:
# Show both halves of the split for the sample rows: `display` renders the
# incorrect choices, the cell's last expression renders the correct ones
display(df_test.str['incorrect'])
df_test.str['correct']

40    [ElastiCache data sharding, ElastiCache cluste...
41    [SNS message deduplication, SNS message retry ...
Name: answer, dtype: object

40    [ElastiCache replication, ElastiCache multi-AZ...
41     [SNS topic subscriptions, SNS message filtering]
Name: answer, dtype: object

In [219]:
# Split every answer block once and reuse the result for both new columns,
# instead of running `choices_split` over the whole column twice
split_answers = df_2['answer'].apply(choices_split)
df_2['incorrect'] = split_answers.str['incorrect']
df_2['correct'] = split_answers.str['correct']

In [220]:
# Spot-check row 40: incorrect choices first, then the correct ones
display(df_2.loc[40, 'incorrect'], df_2.loc[40, 'correct'])

['ElastiCache data sharding', 'ElastiCache cluster auto scaling']

['ElastiCache replication', 'ElastiCache multi-AZ deployments']

In [221]:
# Columns to keep, in the order they should appear
new_order = ['question', 'incorrect', 'correct']

# Copy those columns of `df_2` into a new frame
df_clean = df_2.loc[:, new_order].copy()

# Spot-check row 45: both choice lists, then the whole row
display(df_clean.loc[45, 'incorrect'])
display(df_clean.loc[45, 'correct'])

df_clean.iloc[45]

['Kinesis Data Streams retention policy',
 'Kinesis Data Streams scaling policy']

['Kinesis Data Streams shards', 'Kinesis Data Streams consumer groups']

question     You are using Amazon Kinesis Data Streams for ...
incorrect    [Kinesis Data Streams retention policy, Kinesi...
correct      [Kinesis Data Streams shards, Kinesis Data Str...
Name: 45, dtype: object

In [222]:
# Display the cleaned frame that gets exported
df_clean

Unnamed: 0,question,incorrect,correct
0,You are running a mission-critical application...,[True],[False]
1,You are setting up a new Amazon S3 bucket for ...,[True],[False]
2,"You are designing a highly scalable, low-laten...",[False],[True]
3,You are using Amazon CloudFront to distribute ...,[True],[False]
4,You are developing a serverless application th...,[True],[False]
5,You are using Amazon ECS to deploy a container...,[False],[True]
6,You are using Amazon API Gateway to expose a R...,[False],[True]
7,You are using Amazon Route 53 for DNS manageme...,[False],[True]
8,You are using Amazon CloudWatch to monitor you...,[True],[False]
9,You are using Amazon VPC to create a private n...,[False],[True]


In [223]:
# Name the export after the source blob, replacing whatever extension it has
# with '.json' (slicing off a fixed 4 characters only works for 3-letter
# extensions such as '.txt')
export_file = os.path.splitext(BLOB)[0] + '.json'
# Export `df_clean` to a JSON file
df_clean.to_json(export_file)

In [224]:
# Read the exported file back into a DataFrame to check the round trip
df_json = pd.read_json(export_file)

In [233]:
# Display the frame loaded from the exported JSON file
df_json

Unnamed: 0,question,incorrect,correct
0,You are running a mission-critical application...,[True],[False]
1,You are setting up a new Amazon S3 bucket for ...,[True],[False]
2,"You are designing a highly scalable, low-laten...",[False],[True]
3,You are using Amazon CloudFront to distribute ...,[True],[False]
4,You are developing a serverless application th...,[True],[False]
5,You are using Amazon ECS to deploy a container...,[False],[True]
6,You are using Amazon API Gateway to expose a R...,[False],[True]
7,You are using Amazon Route 53 for DNS manageme...,[False],[True]
8,You are using Amazon CloudWatch to monitor you...,[True],[False]
9,You are using Amazon VPC to create a private n...,[False],[True]


In [231]:
import json

# Load the exported file with the standard-library parser as a second check.
# The `with` block closes the file handle, and the result gets its own name
# so it does not overwrite `data` (the raw lines the parsing cell reads).
with open(export_file) as f:
    exported = json.load(f)
exported['question']['15']

'You are using Amazon Kinesis Data Streams for real-time data processing. You can configure Kinesis Data Streams to automatically scale the number of shards based on the data throughput. This allows you to handle high data volumes without needing to manually adjust the number of shards.\n'