## Webscraping DHL Webpage for FAQs to train chatbot.

In [2]:
#imports
from bs4 import BeautifulSoup
import requests

In [3]:
#target URL -> DHL FAQ Page
url = "https://www.dhl.com/sg-en/home/our-divisions/ecommerce/customer-service/consumer-faq.html"
#response = requests.get(url)
#print(response.status_code)
#request blocked
#save html page manually and scrape instead

In [4]:
soup=BeautifulSoup(open("dhl-faq.html"),"html")

In [5]:
import pandas as pd

In [6]:
#faq has 3 sections
faq = soup.find('body').find('div',{'class':'l-view has-no-stage'}).find('main',{"id":'wcag-main-content'}).find_all('div',{'class':'c-component-accordion component-margin component-small l-grid--center-m'})
len(faq)

3

In [7]:
#get questions
qns_list= []
for section in faq:
    question = section.find_all('a')
    qns_list.append(question)


In [8]:
#since it is a list in a list, flatten the list
questions = [qn.text for sublist in qns_list for qn in sublist]

#remove '/' from list
questions = [q.replace('/',' or ') for q in questions]
print(questions)

['How can I track my shipment?', 'What is a tracking number or ID?', 'Where can I find my tracking number or ID?', 'When will my tracking information appear?', 'Why is my shipment status unchanged?', 'Why is my tracking number or ID not working?', 'Can I track multiple tracking numbers with a single request?', 'Who will deliver my shipment?', 'When will my shipment be delivered?', 'Why has my shipment not been delivered yet?', 'Who can I contact if I have not received my shipment?', 'Can you change my delivery address?', 'Can I change the delivery day or delivery time?', 'My shipment is damaged or the content is missing. What can I do?', 'My shipment is not delivered. What can I do?', 'Why is customs holding my shipment?', 'What are the duties and taxes?']


In [9]:
#get answers
ans_list = []
for section in faq:
    answer = section.find_all('div',{'class':"component-margin c-text-generic has-rte component-small"})
    ans_list.append(answer)

In [10]:
#flatten list
answers = [ans.text for sublist in ans_list for ans in sublist]
answers[:5]

#clean answers
answers = [a.replace('\n','').replace('\xa0','') for a in answers]
answers = [a.replace('/',' or ') for a in answers]

In [11]:
answers[:3]

['You can track your shipment using our website or one of our online portals. Usually your merchant or online shop sends you a link to track your shipment.',
 'A tracking number or ID is a combination of numbers and or or letters that uniquely identifies your shipment. The ID length may vary from 10 to 39 characters.',
 'In general, the merchant or online shop is able to provide the tracking number or ID. If you have ordered a product from an online shop, the confirmation email or shipment notification often contains the tracking number or ID. If not, please contact your merchant or online shop.Please note that not all shipments have tracking numbers or IDs.']

In [12]:
combined = []
for i in range(len(questions)):
    pairs = []
    pairs.append(questions[i])
    pairs.append(answers[i])
    combined.append(pairs)

In [13]:
combined

[['How can I track my shipment?',
  'You can track your shipment using our website or one of our online portals. Usually your merchant or online shop sends you a link to track your shipment.'],
 ['What is a tracking number or ID?',
  'A tracking number or ID is a combination of numbers and or or letters that uniquely identifies your shipment. The ID length may vary from 10 to 39 characters.'],
 ['Where can I find my tracking number or ID?',
  'In general, the merchant or online shop is able to provide the tracking number or ID. If you have ordered a product from an online shop, the confirmation email or shipment notification often contains the tracking number or ID. If not, please contact your merchant or online shop.Please note that not all shipments have tracking numbers or IDs.'],
 ['When will my tracking information appear?',
  'You should see tracking events within 24-48 hours after you have received the confirmation by your merchant or online shop. The reason it takes time to see

In [18]:
#create yaml file to train chatbot with dhl q^a
import yaml

with open('../data/faq.yaml', 'w') as f:
    
    data = yaml.dump(combined, f)