# Preprocess Data

## Import libraries

In [None]:
!pip install beautifulsoup4 requests

In [71]:
import awswrangler as wr
import numpy as np
import json
import os
import pandas as pd
import re
import requests
import subprocess
import warnings

from bs4 import BeautifulSoup

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Data from S3

In [74]:
df_diy = wr.s3.read_json(path='s3://wizard-of-tasks-dataset-5432/wizard_of_tasks_cooking_v1.0.json', orient='index')
df_cooking = wr.s3.read_json(path='s3://wizard-of-tasks-dataset-5432/wizard_of_tasks_diy_v1.0.json', orient='index')

In [75]:
# Overview of diy dataset
print(len(df_diy))
df_diy.head(5)

272


Unnamed: 0,document_url,data_split,turns
Wizard-of-Task-food-1,https://www.wholefoodsmarket.com/recipes/labne...,test,[{'text': 'Hi! I love labneh but I've never mi...
Wizard-of-Task-food-2,https://www.wholefoodsmarket.com/recipes/cream...,train,[{'text': 'How much cream cheese and other ing...
Wizard-of-Task-food-3,https://www.wholefoodsmarket.com/recipes/citru...,train,[{'text': 'Will I be using premade pasta from ...
Wizard-of-Task-food-4,https://www.wholefoodsmarket.com/recipes/grill...,train,[{'text': 'Will I be making the tortellini by ...
Wizard-of-Task-food-5,https://www.wholefoodsmarket.com/recipes/pasta...,train,[{'text': 'Will I be making my own pasta for t...


In [76]:
# Overview of cooking dataset
print(len(df_cooking))
df_cooking.head(5)

277


Unnamed: 0,document_url,data_split,turns
Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,train,[{'text': 'What are the tools and other things...
Wizard-of-Task-diy-2,https://www.wikihow.com/Grow-Basil,train,"[{'text': 'Basil is so fragrant. I love it. ',..."
Wizard-of-Task-diy-3,https://www.wikihow.com/Remove-Salt-Build-up-o...,validation,[{'text': 'Now what step do I need to take to ...
Wizard-of-Task-diy-4,https://www.wikihow.com/Stop-an-Engine-from-Ov...,test,[{'text': 'Why do engines overheat in the firs...
Wizard-of-Task-diy-5,https://www.wikihow.com/Decoupage,train,"[{'text': 'What exactly is decoupage?', 'turn_..."


## Combine datasets

In [80]:
# Concatenate dataframes
df = pd.concat([df_cooking, df_diy])
df.reset_index(inplace=True)
df.rename(columns={'index': 'conversation_id'}, inplace=True)

df.head(5)

Unnamed: 0,conversation_id,document_url,data_split,turns
0,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,train,[{'text': 'What are the tools and other things...
1,Wizard-of-Task-diy-2,https://www.wikihow.com/Grow-Basil,train,"[{'text': 'Basil is so fragrant. I love it. ',..."
2,Wizard-of-Task-diy-3,https://www.wikihow.com/Remove-Salt-Build-up-o...,validation,[{'text': 'Now what step do I need to take to ...
3,Wizard-of-Task-diy-4,https://www.wikihow.com/Stop-an-Engine-from-Ov...,test,[{'text': 'Why do engines overheat in the firs...
4,Wizard-of-Task-diy-5,https://www.wikihow.com/Decoupage,train,"[{'text': 'What exactly is decoupage?', 'turn_..."


## Pre-process data

### Step 1: Pull out each message (aka turn) for each conversation.

In [82]:
# Iterate through each conversation in the combined dataset and go through the turns in the conversation to pull
# each turn to a list and create a new dataframe for turns

# Initialize an empty list to store each modified turn from the conversations
turns_list = []

# Iterate over each row in the DataFrame `df`, where each row represents a conversation
for index, row in df.iterrows():
    # Create a list `history` that combines the role and text of each turn for historical context
    history = [f"{turn['role']}: {turn['text']}" for turn in row['turns']]
    
    # Replace all None values in `history` with an empty string for consistency
    history = ["" if pd.isna(turn) else turn for turn in history]

    # Loop through each turn in the current row's turns
    for i, turn in enumerate(row['turns']):
        # Ensure there are entries in history and the index is within the list bounds
        if len(history) != 0 and len(history) >= i:
            # Assign the last four turns as history to the current turn or as many as available if less than four
            if i > 4:
                turn['history'] = " | ".join(history[i-4:i])
            else:
                turn['history'] = " | ".join(history[:i])
        else:
            # If no history is available, set history to an empty string
            turn['history'] = ""
        
        # Add additional metadata to each turn for later reference
        turn['conversation_id'] = row['conversation_id']  # Conversation identifier
        turn['document_url'] = row['document_url']        # URL to the document of the conversation
        # Determine the domain based on a keyword in the conversation ID
        turn['domain'] = "food" if "food" in turn['conversation_id'] else "diy"
        
        # Append the modified turn to the list of all turns
        turns_list.append(turn)

# Create a new DataFrame from the list of turns
df_turns = pd.DataFrame(turns_list)

# Display the first five rows of the new DataFrame to verify the results
df_turns.head(5)

Unnamed: 0,text,turn_counter,dangerous_tools,shared_data,intent,real_life_action,relevant,useful,worker_id,previous_worker_id,role,history,conversation_id,document_url,domain,external_urls
0,What are the tools and other things required f...,1,[],[],ask_question_ingredients_tools,I will decide which bonsai plant will I start.,yes,yes,30,,student,,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
1,You will need quite a few tools to start a bon...,2,[],"[brush, drill, gauge, gravel, hose, refrigerat...",return_list_ingredients_tools,,yes,yes,54,30.0,teacher,student: What are the tools and other things r...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
2,I'm ready for the first step now please.,3,[],[],request_next_step,I would make sure I have everything ready,yes,yes,83,54.0,student,student: What are the tools and other things r...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
3,"That is great to hear, first you need to selec...",4,[],[Select a suitable species of tree for your cl...,return_next_step,,yes,yes,24,83.0,teacher,student: What are the tools and other things r...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
4,"I've got an idea of where I want it to grow, w...",5,[],[],request_next_step,I would find a good place to grow the tree,yes,yes,83,24.0,student,student: What are the tools and other things r...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,


### Step 2: Remove non question utterances. 
We keep all questions where the intent contains "ask" or the corresponding answer contains "answer_question_recipe_steps"

In [96]:
# Create a filtered DataFrame 'df_qa' that includes only the rows where 
# the 'intent' column = ['ask_question_ingredients_tools' 'ask_question_recipe_steps']
df_qa = df_turns[(df_turns['intent'].str.contains('ask')) & (df_turns['intent'] != 'ask_student_question')]
print(df_qa['intent'].unique())

# Get the indices of rows from 'df_qa' as an array to keep track of question indices
qa_indices = df_qa.index.values

# Find indices for rows where 'intent' is specifically 'answer_question_recipe_steps'
answer_indices = df_turns[(df_turns['intent'] == 'answer_question_recipe_steps')].index.values

# Identify the indices of potential questions missing in 'qa_indices' by checking indices just before 'answer_indices'
# This accounts for cases where the preceding turn might be the corresponding question for the answer
missing_qa = [a-1 for a in answer_indices if a-1 not in qa_indices]

# Combine the original 'qa_indices' with the newly identified 'missing_qa' indices
qa_indices = np.concatenate((qa_indices, missing_qa))

# Update 'df_qa' to include both initial and newly found question indices using iloc to reference the DataFrame by index position
df_qa = df_turns.iloc[qa_indices].copy()

# Reset the index of 'df_qa' to ensure it starts from 0 and the old index becomes a column
df_qa.reset_index(inplace=True)

# Rename the column formerly known as 'index' to 'turn_id' for clarity, as it represents the original position in 'df_turns'
df_qa.rename(columns={'index': 'turn_id'}, inplace=True)

# Display the first five rows of the updated 'df_qa' DataFrame to verify the changes
df_qa.head(5)

['ask_question_ingredients_tools' 'ask_question_recipe_steps']


Unnamed: 0,turn_id,text,turn_counter,dangerous_tools,shared_data,intent,real_life_action,relevant,useful,worker_id,previous_worker_id,role,history,conversation_id,document_url,domain,external_urls
0,0,What are the tools and other things required f...,1,[],[],ask_question_ingredients_tools,I will decide which bonsai plant will I start.,yes,yes,30,,student,,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
1,6,How do we prepare the tree?,7,[],[],ask_question_recipe_steps,I would think about preparing the tree,yes,yes,83,44.0,student,student: I'm ready for the first step now plea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
2,8,"Ok, I have a nice dark green pot, perfect. Wha...",9,[],[],ask_question_ingredients_tools,Get the pot.,yes,yes,51,16.0,student,student: I've got an idea of where I want it t...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
3,16,How do I prepare the pot?,17,[],[],ask_question_recipe_steps,I will ask the teacher how to prepare the pot,yes,yes,11,24.0,student,"student: Ok, I've prepared the tree, wired the...",Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,
4,30,Any specific growing medium I need for my seeds?,31,[],[],ask_question_ingredients_tools,find a suitable growing medium,yes,yes,9,112.0,student,student: What should I do after the seeds germ...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,


In [97]:
# Create a list of indices for potential answer rows. This assumes each answer follows a question directly.
# It increments each index in qa_indices by 1 to target the following row as the potential answer.
answer_indices = [i+1 for i in qa_indices]

# Filter df_turns to get only those rows that are identified as potential answers using the indices from 
# answer_indices.
df_answers = df_turns[df_turns.index.isin(answer_indices)].copy()

# Reset the index of df_answers. This operation also moves the old index to a new column 
# which helps to keep track of the original row indices from df_turns.
df_answers.reset_index(inplace=True)

# Rename the newly created 'index' column to 'turn_id'. 
# This clarifies that this column now represents the original turn's index from df_turns.
df_answers.rename(columns={'index': 'turn_id'}, inplace=True)

# Display the first five rows of df_answers
df_answers.head(5)

Unnamed: 0,turn_id,text,turn_counter,dangerous_tools,shared_data,intent,real_life_action,relevant,useful,worker_id,previous_worker_id,role,history,conversation_id,document_url,domain,external_urls
0,1,You will need quite a few tools to start a bon...,2,[],"[brush, drill, gauge, gravel, hose, refrigerat...",return_list_ingredients_tools,,yes,yes,54,30.0,teacher,student: What are the tools and other things r...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
1,7,Have you selected a pot? This is a very import...,8,[],[Select a pot. The hallmark feature of Bonsai ...,answer_question_recipe_steps,,yes,yes,16,83.0,teacher,"teacher: That is great to hear, first you need...",Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
2,9,Next we will prepare the tree for potting. The...,10,[],[Prepare the tree. If you've just bought a Bon...,answer_question_recipe_steps,,yes,yes,70,51.0,teacher,teacher: Now we prepare the tree. | student: H...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
3,17,First make sure it is clean and dry.,18,[],[],answer_question_external_fact,,yes,yes,44,11.0,teacher,teacher: Next we will remove the tree and clea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]
4,31,You do need to pick a nice pot. Presentation i...,32,[],[],answer_question_external_fact,,yes,yes,16,9.0,teacher,"teacher: After germination, put your seedlings...",Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,[]


In [100]:
# Combine each same index of df_qa and df_answers into one row
df_qa_pairs = pd.concat([df_qa.add_suffix('_question'), df_answers.add_suffix('_answer')], axis=1)

# List columns for the new dataframe
df_qa_pairs.columns

Index(['turn_id_question', 'text_question', 'turn_counter_question',
       'dangerous_tools_question', 'shared_data_question', 'intent_question',
       'real_life_action_question', 'relevant_question', 'useful_question',
       'worker_id_question', 'previous_worker_id_question', 'role_question',
       'history_question', 'conversation_id_question', 'document_url_question',
       'domain_question', 'external_urls_question', 'turn_id_answer',
       'text_answer', 'turn_counter_answer', 'dangerous_tools_answer',
       'shared_data_answer', 'intent_answer', 'real_life_action_answer',
       'relevant_answer', 'useful_answer', 'worker_id_answer',
       'previous_worker_id_answer', 'role_answer', 'history_answer',
       'conversation_id_answer', 'document_url_answer', 'domain_answer',
       'external_urls_answer'],
      dtype='object')

In [101]:
# Drop and rename columns
df_qa_pairs.drop(columns=[
    'turn_id_answer', 'conversation_id_answer', 'document_url_answer',  
    'worker_id_answer', 'worker_id_question', 'turn_id_question', 'turn_counter_question', 'dangerous_tools_question', 
    'shared_data_question', 'previous_worker_id_question', 'role_question', 'external_urls_question', 'turn_counter_answer',
    'previous_worker_id_answer', 'role_answer', 'external_urls_question', 'shared_data_answer', 'dangerous_tools_answer', 'history_answer'
    ], inplace=True)
df_qa_pairs = df_qa_pairs.rename(columns={'data_split_question': 'data_split', 'conversation_id_question': 'conversation_id'})

In [102]:
# Add a new column 'question_count' to the dataframe 'df_qa_pairs'.
# This column is populated by counting the cumulative number of occurrences of each 'conversation_id' 
# up to the current row, effectively numbering the rows (questions) within each unique conversation.
df_qa_pairs['question_count'] = df_qa_pairs.groupby('conversation_id')['conversation_id'].cumcount()

# Create a new column 'question_id' that extracts a unique identifier from each 'conversation_id'.
df_qa_pairs['question_id'] = df_qa_pairs['conversation_id'].str.split('Wizard-of-Task-').str[1]

# Update the 'question_id' column by appending a hyphen and the 'question_count' to the unique identifier.
df_qa_pairs['question_id'] = df_qa_pairs['question_id'] + '-' + df_qa_pairs['question_count'].astype(str)

# Display the first five rows of the modified dataframe 'df_qa_pairs' to verify the changes and see the structure of the data.
df_qa_pairs.head(5)

Unnamed: 0,text_question,intent_question,real_life_action_question,relevant_question,useful_question,history_question,conversation_id,document_url_question,domain_question,text_answer,intent_answer,real_life_action_answer,relevant_answer,useful_answer,domain_answer,external_urls_answer,question_count,question_id
0,What are the tools and other things required f...,ask_question_ingredients_tools,I will decide which bonsai plant will I start.,yes,yes,,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,You will need quite a few tools to start a bon...,return_list_ingredients_tools,,yes,yes,diy,[],0,diy-1-0
1,How do we prepare the tree?,ask_question_recipe_steps,I would think about preparing the tree,yes,yes,student: I'm ready for the first step now plea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Have you selected a pot? This is a very import...,answer_question_recipe_steps,,yes,yes,diy,[],1,diy-1-1
2,"Ok, I have a nice dark green pot, perfect. Wha...",ask_question_ingredients_tools,Get the pot.,yes,yes,student: I've got an idea of where I want it t...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Next we will prepare the tree for potting. The...,answer_question_recipe_steps,,yes,yes,diy,[],2,diy-1-2
3,How do I prepare the pot?,ask_question_recipe_steps,I will ask the teacher how to prepare the pot,yes,yes,"student: Ok, I've prepared the tree, wired the...",Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,First make sure it is clean and dry.,answer_question_external_fact,,yes,yes,diy,[],3,diy-1-3
4,Any specific growing medium I need for my seeds?,ask_question_ingredients_tools,find a suitable growing medium,yes,yes,student: What should I do after the seeds germ...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,You do need to pick a nice pot. Presentation i...,answer_question_external_fact,,yes,yes,diy,[],4,diy-1-4


Remove all questions that are:
* not questions
* not useful/ relevant
* not ask_question_recipe_steps
* external questions

In [104]:
# Display the unique values from the 'intent_answer' column in the 'df_qa_pairs' DataFrame.
df_qa_pairs['intent_answer'].unique()

array(['return_list_ingredients_tools', 'answer_question_recipe_steps',
       'answer_question_external_fact', 'ask_question_ingredients_tools',
       'ask_question_recipe_steps', 'request_next_step',
       'return_next_step', 'chitchat', 'stop', 'ask_student_question',
       'misc'], dtype=object)

In [105]:
print(len(df_qa_pairs))

# Drop all rows where neither intent_question contains ask or intent_answer contains answer
df_qa_pairs_filtered = df_qa_pairs[~(~(df_qa_pairs['intent_question'].str.contains('ask')) & ~(df_qa_pairs['intent_answer'].str.contains('answer')))]
print(len(df_qa_pairs_filtered))

# Drop all rows where intent_answer = chitchat
df_qa_pairs_filtered = df_qa_pairs_filtered[~df_qa_pairs_filtered['intent_answer'].str.contains('chitchat')]
print(len(df_qa_pairs_filtered))

4479
4341
4312


In [113]:
# Filter df_qa_pairs_filtered to select rows where 'intent_answer' is 'answer_question_recipe_steps'.
df_internal_questions = df_qa_pairs_filtered[df_qa_pairs_filtered['intent_answer'] == 'answer_question_recipe_steps']

# Filter df_qa_pairs_filtered to select rows where 'intent_answer' is 'answer_question_external_fact'.
df_external_answer = df_qa_pairs_filtered[df_qa_pairs_filtered['intent_answer'] == 'answer_question_external_fact']

# Filter df_qa_pairs_filtered to select rows where 'external_urls_answer' column contains at least one URL.
df_external_link = df_qa_pairs_filtered[df_qa_pairs_filtered['external_urls_answer'].str.len() != 0]

# From the internal_questions, filter out any questions that appear in external_link
df_internal_questions = df_internal_questions[~df_internal_questions['question_id'].isin(df_external_link['question_id'])]

# From the external_answer, filter out any answers that appear in external_link.
df_individuals = df_external_answer[~df_external_answer['question_id'].isin(df_external_link['question_id'])]

print(f'External links: {len(df_external_link)}')
print(f'External answers: {len(df_external_answer)}')
print('-----')
print(f'External questions: {len(df_external_link) + len(df_individuals)}')
print(f'Internal questions: {len(df_internal_questions)}')

External links: 584
External answers: 1543
-----
External questions: 1794
Internal questions: 1589


In [114]:
# Drop all not useful and not relevant questions from the df that contains 'answer_question_recipe_steps'
df_internal_questions = df_internal_questions[(df_internal_questions['relevant_question']== "yes") & (df_internal_questions['useful_question']== "yes")]
print(f'Internal questions: {len(df_internal_questions)}')

Internal questions: 1331


In [115]:
# Drop and rename columns
df_internal_questions = df_internal_questions.drop(columns=['relevant_answer', 'useful_answer', 'external_urls_answer', 
    'relevant_question', 'useful_question', 'real_life_action_question', 'real_life_action_answer', 'question_count'])
df_internal_questions = df_internal_questions.rename(columns={'text_question': 'question', 'history_question': 'history'})
df_internal_questions.head(5)

Unnamed: 0,question,intent_question,history,conversation_id,document_url_question,domain_question,text_answer,intent_answer,domain_answer,question_id
1,How do we prepare the tree?,ask_question_recipe_steps,student: I'm ready for the first step now plea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Have you selected a pot? This is a very import...,answer_question_recipe_steps,diy,diy-1-1
2,"Ok, I have a nice dark green pot, perfect. Wha...",ask_question_ingredients_tools,student: I've got an idea of where I want it t...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Next we will prepare the tree for potting. The...,answer_question_recipe_steps,diy,diy-1-2
8,Does that mean basil grows best in the spring ...,ask_question_recipe_steps,"student: Gotcha! Once I have all those tools,...",Wizard-of-Task-diy-2,https://www.wikihow.com/Grow-Basil,diy,"Yes, like most plants, basil likes a temperate...",answer_question_recipe_steps,diy,diy-2-3
14,I don't really have access to those right now ...,ask_question_recipe_steps,"student: Okay, now what should I do after that...",Wizard-of-Task-diy-3,https://www.wikihow.com/Remove-Salt-Build-up-o...,diy,You can just rub it on the main zipper piece,answer_question_recipe_steps,diy,diy-3-1
23,If I could only choose one thing to decoupage ...,ask_question_recipe_steps,student: What is the easiest type of material ...,Wizard-of-Task-diy-5,https://www.wikihow.com/Decoupage,diy,I would highly recommend either decoupaging wo...,answer_question_recipe_steps,diy,diy-5-5


## Step 3: Add Recipe Data

In [116]:
# Pull URLs from 
df_urls = internal_questions['document_url_question'].unique()
print(f'URLs: {len(df_urls)}')

URLs: 485


In [111]:
def parse_time(time_str):
    class CustomParserInfo(parserinfo):
        HMS = [('h', 'hr', 'hrs', 'hour', 'hours'), ('m', 'min', 'mins', 'minute', 'minutes'),
                ('s', 'second', 'seconds')]

    try:
        parsed_time = dparser.parse(time_str, fuzzy=True, parserinfo=CustomParserInfo())
        parsed_time_min = parsed_time.minute + parsed_time.hour * 60 + parsed_time.second / 60
        return parsed_time_min
    except:
        minutes = ('m', 'min', 'mins', 'minute', 'minutes')
        hours = ('h', 'hr', 'hrs', 'hour', 'hours')
        seconds = ('s', 'second', 'seconds')

        time_num = re.findall(r"\d+", time_str.strip())
        if time_num != []:
            final_time = 0
            time_int = int(time_num[0])
            for word in time_str.split(" "):
                if word in minutes:
                    final_time = time_int
                elif word in hours:
                    final_time = time_int * 60
                elif word in seconds:
                    final_time = time_int / 60

            return final_time
        else:
            return 0


def get_method_number(soup):
        """ Scrape how many methods the DIY article contains and save this number in DIYDocument.number_of_parallel_
        methods
        """
        steps_lists = soup.find_all(class_='steps')
        if steps_lists != []:
            header = steps_lists[0].find('h3')
            if header:
                if "Part" in header.text:
                    # DIY article has parts
                    return 1
                else:
                    # DIY article has methods
                    number = len(steps_lists)
                    return number
            else:
                # DIY article has just one method, no parts
                return 1

def get_steps(soup):
    """ Parse steps. """
    # article just has one method
    list_of_steps_list = soup.find_all('ol', class_='steps_list_2')
    method_number = get_method_number(soup)
    if list_of_steps_list != []:
        # loop through all parts (which include an array of steps) of the article
        if method_number > 1:
            list_of_steps_list = list_of_steps_list[:1]
        for part in list_of_steps_list:
            list_of_steps = get_steps_list(soup, part)
    return list_of_steps

def get_steps_list(soup, steps_list):
    """ Helper function to scrape all steps from list object """
    # Loop over steps object to access associated text, images, or video.
    steps = []
    for step_tag in steps_list.find_all('li'):
        if step_tag.get('id') is not None:
            # Build Step object.
            step_object = step_tag.find(class_='step')
            if step_object:
                # scrape bold step header
                header = step_object.find('b')
                if header:
                    header = header.text

                # scrape all the remaining text after the bold step header
                text = step_object.find_all(text=True)
                if len(text) >= 3:
                    text = text[2]
                    if text[0] == '[':
                        text = ''
                    else:
                        text += ';'
                elif text != []:
                    text = text[0]

                # scrape all step text that is stored in a bulleted list
                step_list_text = step_object.find('ul')
                if step_list_text:
                    for bullet_point in step_list_text.find_all('li'):
                        text += bullet_point.find_all(text=True)[0].strip() + ';'
                steps.append(f'{header}: {text}')
    return steps

In [122]:
# Initialize an empty DataFrame
columns = ['document_url_question', 'title', 'description', 'ingredients', 'steps']
df_recipes = pd.DataFrame(columns=columns)

# Define a function to extract steps for wikiHow (assuming it's already defined)
def get_steps(soup):
    steps_list = [step.text.strip() for step in soup.select('.step .whb')]
    return steps_list

# Loop through each URL
for url in urls:
    recipe = {}

    request = requests.get(url)
    html = request.text
    status = request.status_code

    if 'wholefoodsmarket' in url and status == 200:
        soup = BeautifulSoup(html, 'html.parser')
        schema = soup.find_all('script', attrs={"type": "application/ld+json"})
        schema = json.loads(schema[0].string)
        recipe['title'] = soup.find(class_='w-header-title').text
        recipe['description'] = schema['description']

        ingredients = [ingredient for ingredient in schema.get('recipeIngredient', [])]
        recipe['ingredients'] = ingredients

        steps = [step.get('text') for step in schema.get('recipeInstructions', [])]
        recipe['steps'] = steps

    elif 'wikihow' in url and status == 200:
        soup = BeautifulSoup(html, 'html.parser')
        recipe['title'] = soup.find(id='section_0').text
        recipe['description'] = soup.find(id='mf-section-0').text
        recipe['steps'] = get_steps(soup)

    else:
        print(f'Error for: {url}')
        continue

    # Create a DataFrame from the recipe dictionary
    df_recipe = pd.DataFrame([recipe], columns=columns)
    df_recipe['document_url_question'] = url

    # Append the recipe data to the DataFrame using concat
    df_recipes = pd.concat([df_recipes, df_recipe], ignore_index=True)

Error for: https://www.wikihow.com/Use-Chia-Seeds


In [123]:
df_recipes.head(5)

Unnamed: 0,document_url_question,title,description,ingredients,steps
0,https://www.wikihow.com/Start-a-Bonsai-Tree,How to Start a Bonsai Tree,\nThe ancient art of growing Bonsai trees is w...,,[Select a suitable species of tree for your cl...
1,https://www.wikihow.com/Grow-Basil,How to Grow Basil,"\nBasil is easy to grow, and transforms ordina...",,"[Choose the kind of basil you wish to grow., S..."
2,https://www.wikihow.com/Remove-Salt-Build-up-o...,How to Remove Salt Build up on a Zipper,\nWhether it’s from roads and sidewalks in the...,,"[Open the zipper as much as possible., Use bee..."
3,https://www.wikihow.com/Decoupage,How to Decoupage,\nIf you'd like to give new life to a piece of...,,[Cover your workspace with paper to protect it...
4,https://www.wikihow.com/Clean-a-Vacuum-Thermos...,How to Clean a Vacuum Thermosflask That Has St...,\nInsulated flasks are all the rage these days...,,[Put some baking soda and vinegar into the bot...


In [125]:
# Merge the DataFrames
df_merged = df_internal_questions.merge(df_recipes, on="document_url_question")
print(f'Managed to extract context for {len(df_merged)} questions')
df_merged.head(5)

Managed to extract context for 1328 questions


Unnamed: 0,question,intent_question,history,conversation_id,document_url_question,domain_question,text_answer,intent_answer,domain_answer,question_id,title,description,ingredients,steps
0,How do we prepare the tree?,ask_question_recipe_steps,student: I'm ready for the first step now plea...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Have you selected a pot? This is a very import...,answer_question_recipe_steps,diy,diy-1-1,How to Start a Bonsai Tree,\nThe ancient art of growing Bonsai trees is w...,,[Select a suitable species of tree for your cl...
1,"Ok, I have a nice dark green pot, perfect. Wha...",ask_question_ingredients_tools,student: I've got an idea of where I want it t...,Wizard-of-Task-diy-1,https://www.wikihow.com/Start-a-Bonsai-Tree,diy,Next we will prepare the tree for potting. The...,answer_question_recipe_steps,diy,diy-1-2,How to Start a Bonsai Tree,\nThe ancient art of growing Bonsai trees is w...,,[Select a suitable species of tree for your cl...
2,Does that mean basil grows best in the spring ...,ask_question_recipe_steps,"student: Gotcha! Once I have all those tools,...",Wizard-of-Task-diy-2,https://www.wikihow.com/Grow-Basil,diy,"Yes, like most plants, basil likes a temperate...",answer_question_recipe_steps,diy,diy-2-3,How to Grow Basil,"\nBasil is easy to grow, and transforms ordina...",,"[Choose the kind of basil you wish to grow., S..."
3,I don't really have access to those right now ...,ask_question_recipe_steps,"student: Okay, now what should I do after that...",Wizard-of-Task-diy-3,https://www.wikihow.com/Remove-Salt-Build-up-o...,diy,You can just rub it on the main zipper piece,answer_question_recipe_steps,diy,diy-3-1,How to Remove Salt Build up on a Zipper,\nWhether it’s from roads and sidewalks in the...,,"[Open the zipper as much as possible., Use bee..."
4,If I could only choose one thing to decoupage ...,ask_question_recipe_steps,student: What is the easiest type of material ...,Wizard-of-Task-diy-5,https://www.wikihow.com/Decoupage,diy,I would highly recommend either decoupaging wo...,answer_question_recipe_steps,diy,diy-5-5,How to Decoupage,\nIf you'd like to give new life to a piece of...,,[Cover your workspace with paper to protect it...


In [133]:
# Export 'merged_df' to a CSV file located in the 'data' folder
df_merged.to_csv('data/processed_data.csv')

In [131]:
pip install pandas --upgrade


Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awswrangler 2.14.0 requires pandas<1.4.0,>=1.2.0; python_full_version >= "3.7.1" and python_full_version < "4.0.0", but you have pandas 2.2.2 which is incompatible.
awswrangler 2.14.0 requires pyarrow<6.1.0,>=2.0.0, but you have pyarrow 15.0.2 which is incompatible.
sagem