# Split the processed dataset into Train, Test, and Validation

### Import the Libraries

In [1]:
import awswrangler as wr
import boto3
import os
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

from sklearn.model_selection import train_test_split
from IPython.core.display import display, HTML

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


  from IPython.core.display import display, HTML


In [2]:
# check dependencies are stored
%store

Stored variables and their in-db values:
bucket_name                                       -> 'wizard-of-tasks-dataset-54321'
ingest_create_athena_db_passed                    -> True
ingest_create_athena_table_tsv_passed             -> True
s3_private_data_path_csv                          -> 's3://sagemaker-us-east-1-114106928417/aai-540-2-1
s3_private_path_tsv                               -> 's3://sagemaker-us-east-1-114106928417/amazon-revi
s3_public_data_path_csv                           -> '/home/sagemaker-user/aai-540-homework/homework-2-
s3_public_path_tsv                                -> 's3://dsoaws/amazon-reviews-pds/tsv'
setup_dependencies_passed                         -> True
setup_s3_bucket_passed                            -> True


In [3]:
# save Amazon information
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.Session().region_name
role = get_execution_role()
sagemaker_session = Session()
s3 = boto3.client('s3', region_name=sagemaker_session.boto_region_name)

In [4]:
# get s3 path to data from stored variable
%store -r bucket_name
s3_processed_dataset_path = 's3://{}/data/processed_data.csv'.format(bucket_name)
print(s3_processed_dataset_path)

s3://wizard-of-tasks-dataset-54321/data/processed_data.csv


### Import the data from S3 into Pandas Dataframe
1. using aws wrangler
2. review the dataset

In [5]:
df_processed_data = wr.s3.read_csv(path=s3_processed_dataset_path, sep='^')

In [6]:
df_processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1328 entries, 0 to 1327
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               1328 non-null   object
 1   intent_question        1328 non-null   object
 2   history                1187 non-null   object
 3   conversation_id        1328 non-null   object
 4   document_url_question  1328 non-null   object
 5   domain_question        1328 non-null   object
 6   text_answer            1328 non-null   object
 7   intent_answer          1328 non-null   object
 8   domain_answer          1328 non-null   object
 9   question_id            1328 non-null   object
 10  title                  1328 non-null   object
 11  description            1319 non-null   object
 12  ingredients            570 non-null    object
 13  steps                  1328 non-null   object
dtypes: object(14)
memory usage: 145.4+ KB


In [7]:
df_processed_data.head()

Unnamed: 0,question,intent_question,history,conversation_id,document_url_question,domain_question,text_answer,intent_answer,domain_answer,question_id,title,description,ingredients,steps
0,How do we prepare the tree?,ask_question_recipe_steps,student: I'm ready for the first step now plea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Have you selected a pot? This is a very import...,answer_question_recipe_steps,diy,diy-1-1,How to Start a Bonsai Tree,The ancient art of growing Bonsai trees is wel...,,['Select a suitable species of tree for your c...
1,"Ok, I have a nice dark green pot, perfect. Wha...",ask_question_ingredients_tools,student: I've got an idea of where I want it t...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Next we will prepare the tree for potting. The...,answer_question_recipe_steps,diy,diy-1-2,How to Start a Bonsai Tree,The ancient art of growing Bonsai trees is wel...,,['Select a suitable species of tree for your c...
2,Does that mean basil grows best in the spring ...,ask_question_recipe_steps,"student: Gotcha! Once I have all those tools,...",Wizard-of-Task-diy-2,https://www.wikihow.com/Grow-Basil,diy,"Yes, like most plants, basil likes a temperate...",answer_question_recipe_steps,diy,diy-2-3,How to Grow Basil,"Basil is easy to grow, and transforms ordinary...",,"['Choose the kind of basil you wish to grow.',..."
3,I don't really have access to those right now ...,ask_question_recipe_steps,"student: Okay, now what should I do after that...",Wizard-of-Task-diy-3,https://www.wikihow.com/Remove-Salt-Build-up-o...,diy,You can just rub it on the main zipper piece,answer_question_recipe_steps,diy,diy-3-1,How to Remove Salt Build up on a Zipper,Whether it’s from roads and sidewalks in the w...,,"['Open the zipper as much as possible.', 'Use ..."
4,If I could only choose one thing to decoupage ...,ask_question_recipe_steps,student: What is the easiest type of material ...,Wizard-of-Task-diy-5,https://www.wikihow.com/Decoupage,diy,I would highly recommend either decoupaging wo...,answer_question_recipe_steps,diy,diy-5-5,How to Decoupage,If you'd like to give new life to a piece of f...,,['Cover your workspace with paper to protect i...


### Perform Train, Test Split on the dataset
1. Determine the count of conversations per conversationid
2. Determine the count of conversationids and the count of conversations 'count'
3. Use the count of the conversationids and count of conversations as the numbers to split the merged processed dataset, split it into Train, Validation, and Test

In [8]:
df_conversations = df_processed_data.groupby('conversation_id').count().reset_index()[['conversation_id','question']].rename(columns={'question':'count'})
df_conversations.loc[32, 'count'] = 6 # for splitting to work properly
df_conversations

Unnamed: 0,conversation_id,count
0,Wizard-of-Task-diy-1,2
1,Wizard-of-Task-diy-10,3
2,Wizard-of-Task-diy-100,5
3,Wizard-of-Task-diy-101,1
4,Wizard-of-Task-diy-102,3
...,...,...
479,Wizard-of-Task-food-93,3
480,Wizard-of-Task-food-94,3
481,Wizard-of-Task-food-95,3
482,Wizard-of-Task-food-98,4


In [9]:
conversations = list(df_conversations['conversation_id'])
print(len(conversations))
count = list(df_conversations['count'])
print(len(count))

484
484


In [13]:
conversation_train, conversation_test, count_train, _ = train_test_split(conversations, count,
                                                               test_size=0.5, random_state = 30, stratify=None)

conversation_train, conversations_val, _, _ = train_test_split(conversation_train, count_train, 
                                                               test_size=0.4, random_state= 42, stratify=None)

In [17]:
split_data = df_processed_data
split_data['data_split'] = split_data['conversation_id'].apply(lambda x: 'train' if x in conversation_train else 'test' if x in conversation_test else 'validation')
split_data.groupby('data_split').count().reset_index()

Unnamed: 0,data_split,question,intent_question,history,conversation_id,document_url_question,domain_question,text_answer,intent_answer,domain_answer,question_id,title,description,ingredients,steps
0,test,653,653,583,653,653,653,653,653,653,653,653,644,262,653
1,train,404,404,364,404,404,404,404,404,404,404,404,404,191,404
2,validation,271,271,240,271,271,271,271,271,271,271,271,271,117,271


### Write the Split datasets to the S3 Bucket
1. Define the S3 path
2. Define the final split dataframe, train dataframe, validation dataframe, and test dataframe
3. Convert the final split data to CSV and upload to S3
4. Convert the train dataset to CSV and upload to S3
5. Convert to validation dataset to CSV and upload to S3
6. Convert the test dataset to CSV and upload to S3

In [18]:
# Define the S3 path
s3_path = 's3://{}/data/'.format(bucket_name)
print(s3_path)

s3://wizard-of-tasks-dataset-54321/data/


In [19]:
train_df = split_data[split_data['data_split'] == 'train']
print(len(train_df))

val_df = split_data[split_data['data_split'] == 'validation']
print(len(val_df))
test_df = split_data[split_data['data_split'] == 'test']
print(len(test_df))

404
271
653


In [20]:
# Convert final_split_data to CSV and upload to S3
wr.s3.to_csv(split_data, f"{s3_path}final_split_data.csv", sep='^', index=False)

# Convert train_df to CSV and upload to S3
wr.s3.to_csv(train_df, f"{s3_path}train_df.csv", sep='^', index=False)

# Convert val_df to CSV and upload to S3
wr.s3.to_csv(val_df, f"{s3_path}val_df.csv", sep='^', index=False)

# Convert test_df to CSV and upload to S3
wr.s3.to_csv(test_df, f"{s3_path}test_df.csv", sep='^', index=False)

{'paths': ['s3://wizard-of-tasks-dataset-54321/data/test_df.csv'],
 'partitions_values': {}}

### Check that it uploaded using aws wrangler

In [25]:
# List all buckets
buckets = wr.s3.list_buckets()
buckets

['my-athena-bucket-test123',
 'sagemaker-us-east-1-114106928417',
 'wizard-of-tasks-dataset',
 'wizard-of-tasks-dataset-54321']

In [34]:
# Check if datasets exist
dataset_paths = ['s3://{}/data/final_split_data.csv'.format(bucket_name), 's3://{}/data/train_df.csv'.format(bucket_name), 's3://{}/data/val_df.csv'.format(bucket_name), 's3://{}/data/test_df'.format(bucket_name)]
for path in dataset_paths:
    bucket, key = path.split('/', 1)
    if wr.s3.does_object_exist(path):
        print(f"Dataset {key} exists in bucket {bucket}.")
    else:
        print(f"Dataset {key} does not exist in bucket {bucket}.")

Dataset /wizard-of-tasks-dataset-54321/data/final_split_data.csv exists in bucket s3:.
Dataset /wizard-of-tasks-dataset-54321/data/train_df.csv exists in bucket s3:.
Dataset /wizard-of-tasks-dataset-54321/data/val_df.csv exists in bucket s3:.
Dataset /wizard-of-tasks-dataset-54321/data/test_df does not exist in bucket s3:.


### Shutdown Notebook

In [35]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [1]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>