# Airline policy scraping - HTML strings (txt) to be split by HTML header
https://python.langchain.com/docs/modules/data_connection/document_transformers/HTML_header_metadata/

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By

## Airlines scraped here
- Japan Airlines
- Korean Air
- Emirates
- All Nippon Airways
- EVA Air
- Air France
- Cathay Pacific Airways
- Turkish Airlines
- Singapore Airlines
- Qatar Airways

## We scraped for each airline
- general baggage (not including special items such as animals and instruments)
- ticket change and refund (without lounge)
- mostly main page, does not include FAQ (MOSTLY)
- with image/href/table
- From Air France onward, the scraping method changes slightly; otherwise those pages cannot be scraped

## TBD:
- add pet policy

### Function organize

In [3]:
# scrape 1 page with all content
def web_to_html_str(dict):
    '''
    Scrape each XPATH of one page (url) and join the results
    Input: a dictionary: key: url, values: list of XPATH
           (only the first key/value pair is used)
    Output: the innerHTML string of every XPATH match, in the given order,
            joined with a tab followed by '|'
    Raises: ValueError when the dictionary is empty; any error raised by the
            browser calls propagates after the browser has been closed

    NOTE: the parameter name `dict` shadows the builtin; it is kept so that
    existing callers are not affected.
    '''
    if not dict:
        raise ValueError('expected a {url: [XPATH, ...]} dictionary, got an empty one')

    # extract url and xpaths (first pair only)
    url, xpaths = next(iter(dict.items()))

    # Launch a web browser
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # innerHTML of each matched section/div, one string per XPATH
        elements = [
            driver.find_element(By.XPATH, xpath).get_attribute('innerHTML')
            for xpath in xpaths
        ]
    finally:
        # always close the browser; without this every scraped url leaves
        # a Chrome process running
        driver.quit()

    return '\t|'.join(elements)


In [2]:
def scrape_content(item): # a list of items to scrape for an airline
    '''
    Scrape every page listed for one airline
    Input: a list of page entries (baggage, refund, delay, ...); each entry is
           a flat list alternating url, [XPATH, ...], url, [XPATH, ...], ...
    Output: a list of html strings, one string per url, in the order given

    '''
    # pair every url (even position) with the XPATH list that follows it;
    # all pairs are built before the first page is scraped
    jobs = [
        {entry[pos]: entry[pos + 1]}
        for entry in item
        for pos in range(0, len(entry), 2)
    ]

    # web_to_html_str takes one {url: XPATHS} dictionary per call
    return [web_to_html_str(job) for job in jobs]

In [18]:
def write_txt(
    list,
    filename,
    out_dir='/Users/kay/Desktop/nlp/nlp_airline_project/policy_data/scraped_txt',
):
    '''
    Concat the strings in a list and write them to a txt file
    Input: list: the output list from scrape_content (one html string per url)
           filename: airline tag used in the file name, e.g. 'japan'
           out_dir: existing folder that receives the file; defaults to the
                    project folder that used to be hard-coded here
    Output: None; creates or overwrites <out_dir>/scraped_<filename>.txt and
            prints a confirmation message

    NOTE: the parameter name `list` shadows the builtin; it is kept so that
    existing callers are not affected.
    '''
    # Define the file path
    file_path = f'{out_dir}/scraped_{filename}.txt'

    # utf-8 is forced because scraped html regularly contains non-ASCII
    # characters that the platform default encoding may not be able to write
    with open(file_path, 'w', encoding='utf-8') as file:
        # every html string is followed by the record separator
        file.writelines(element + '|,|,|\n|' for element in list)
    print(f"Data written to {file_path} successfully!")

### Japan Airlines

In [119]:
# --- baggage pages ---
checked_bagg = [
    'https://www.jal.co.jp/jp/en/inter/baggage/checked/?inbound=ar#Rule',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div'],  # whole page
]
carried_on = [
    'https://www.jal.co.jp/jp/en/inter/baggage/inflight/?inbound=ar',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div/div'],
]
delayed_lost_bagg = [
    'https://www.jal.co.jp/jp/en/inter/baggage/accident/',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div'],
]
prohibited = [
    'https://www.jal.co.jp/jp/en/inter/baggage/limit/',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div'],
]

# --- ticket refund / change pages ---
refund = [
    'https://www.jal.co.jp/jp/en/inter/change/refund/#refund_amount',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div'],
]
change = [
    'https://www.jal.co.jp/jp/en/dom/change/normal.html#RefundApp',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div/div'],  # whole page (still to be specified)
]
ticket_validity = [
    'https://www.jal.co.jp/jp/en/dom/change/valid.html',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div'],
]
ticket_cancel_charge = [
    'https://www.jal.co.jp/jp/en/dom/charge/',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div[1]'],  # whole page with links
]
flight_delay = [
    'https://www.jal.co.jp/jp/en/dom/serviceinfo/#point1',
    ['//*[@id="wrapper"]/div/div/div[3]/div[2]/div[1]/div/div[4]/div/div[3]/div'],  # whole page
]

# every entry is a [url, [XPATH, ...]] pair, in scraping order
items_japan = [
    checked_bagg,
    carried_on,
    delayed_lost_bagg,
    prohibited,
    refund,
    change,
    ticket_validity,
    ticket_cancel_charge,
    flight_delay,
]


In [120]:
# performs the scraping
scraped_japan = scrape_content(items_japan)
scraped_japan

['\n\n\n<div class="aem-Grid aem-Grid--12 aem-Grid--default--12 ">\n    \n    <div class="cont_ttl title aem-GridColumn aem-GridColumn--default--12">\n<!-- cont_ttl start -->\n\n    <div class="pageTtl">\n        <div class="inr">\n            <h1 class="heading">Checked baggage</h1>\n        </div>\n    </div><!-- /pageTtl -->\n\n<!-- /cont_ttl end -->\n\n\n    \n\n</div>\n<div class="text aem-GridColumn aem-GridColumn--default--12">\n<!-- text start -->\n\n\t<p>Learn about size and weight restrictions for baggage checked at airport counters, and applicable notes.</p>\n\n\n<!-- text end -->\n\n\n    \n\n</div>\n<div class="responsivegrid mgtS aem-GridColumn aem-GridColumn--default--12">\n<!-- responsivegrid start -->\n<div class="aem-Grid aem-Grid--12 aem-Grid--default--12 aem-Grid--phone--12  ">\n    \n    <div class="textlink textlinkbox aem-GridColumn--default--none aem-GridColumn--phone--none aem-GridColumn--phone--12 aem-GridColumn aem-GridColumn--offset--phone--0 aem-GridColumn-

In [121]:
# write out to txt file to 'scrape' folder
write_txt(scraped_japan, 'japan')

Data written to scraped/scraped_japan.txt successfully!


### Korean Air
partially scraped: table in baggage allowance unscrapable

In [185]:
# all Korean Air pages below are scraped with this one XPATH
_KOREAN_XPATH = '/html/body/div[9]/div[2]'

# --- general baggage ---
checked_bagg = [
    'https://www.koreanair.com/us/en/airport/baggage/checked/free-allowance',
    [_KOREAN_XPATH],
]
carried_on = [
    'https://www.koreanair.com/us/en/airport/baggage/carry-on',
    [_KOREAN_XPATH],
]
delayed_bagg = [
    'https://www.koreanair.com/us/en/airport/baggage/damaged-or-lost/baggage-delay',
    [_KOREAN_XPATH],
]
damaged_bagg = [
    'https://www.koreanair.com/us/en/airport/baggage/damaged-or-lost/damaged-bags',
    [_KOREAN_XPATH],
]
lost_bagg = [
    'https://www.koreanair.com/us/en/airport/baggage/damaged-or-lost/lost-items',
    [_KOREAN_XPATH],
]
restricted_check = [
    'https://www.koreanair.com/us/en/airport/baggage/restricted-items/checked',
    [_KOREAN_XPATH],
]
conditon_restrict = [
    'https://www.koreanair.com/us/en/airport/baggage/restricted-items/conditionally-allowed',
    [_KOREAN_XPATH],
]
prohibited = [
    'https://www.koreanair.com/us/en/airport/baggage/restricted-items/carry-on',
    [_KOREAN_XPATH],
]

# --- ticket refund / change ---
change = [
    'https://www.koreanair.com/us/en/booking/overview/change-and-refund/change',
    [_KOREAN_XPATH],
]
refund = [
    'https://www.koreanair.com/us/en/booking/overview/change-and-refund/refund',
    [_KOREAN_XPATH],
]
no_show_penalty = [
    'https://www.koreanair.com/us/en/booking/overview/change-and-refund/penalty',
    [_KOREAN_XPATH],
]

items_korean = [
    # bags
    checked_bagg,
    carried_on,
    delayed_bagg,
    damaged_bagg,
    lost_bagg,
    restricted_check,
    conditon_restrict,
    prohibited,
    # tickets
    change,
    refund,
    no_show_penalty,
]



In [186]:
# performs the scraping
scraped_korean = scrape_content(items_korean)
scraped_korean

['\n    \n\n\n\n\n    \n    \n    <div class="page-title-description-b parbase">\n    \n    <div>\n        \n\t\n\t\n    \n<link rel="stylesheet" href="/etc.clientlibs/koreanair/components/content/page/page-title-description-b/clientlib.min.css" type="text/css">\n\n\n\n\t\n    \n<script type="text/javascript" src="/etc.clientlibs/koreanair/components/content/page/page-title-description-b/clientlib.min.js"></script>\n\n\n\n\n\t<div class="ptdb">\n\t\t\n    \n    \n    \n    \n        <input type="hidden" class="i18n-data" data-key="webacc.selected" value="Selected">\n    \n\n\n\t\t<div>\n\t\t\t<h1 class="h1" id="skip">Checked Baggage</h1>\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\t\n\t\t</div>\n\t\t<p class="p _left">Everything you need to know about checked baggage, including size, fees, packing instructions and more.</p>\n\n\t</div>\n\n\n    \n\n    </div>\n    \n</div>\n\n\n    \n    \n    <div class="tab-navigation-bar-c parbase">\n    \n    <div class="mb4020">\n 

In [187]:
# write out to txt file to 'scrape' folder
write_txt(scraped_korean, 'korean')

Data written to scraped/scraped_korean.txt successfully!


### Emirates

In [188]:
# every Emirates page below is scraped as a whole through this XPATH
_EMIRATES_XPATH = '//*[@id="maincontent"]'

# --- general baggage ---
checked_bagg = [
    'https://www.emirates.com/us/english/before-you-fly/baggage/checked-baggage/',
    [_EMIRATES_XPATH],
]
carried_on = [
    'https://www.emirates.com/us/english/before-you-fly/baggage/cabin-baggage-rules/',
    [_EMIRATES_XPATH],
]
delayed_bagg = [
    'https://www.emirates.com/us/english/before-you-fly/baggage/delayed-damaged-baggage/',
    [_EMIRATES_XPATH],
]
prohibited = [
    'https://www.emirates.com/us/english/before-you-fly/travel/dangerous-goods-policy/',
    [_EMIRATES_XPATH],
]
# lost property at airport or on board
lost = [
    'https://www.emirates.com/us/english/help/faq-topics/baggage-and-lost-property/',
    [_EMIRATES_XPATH],
]

# --- ticket refund / change ---
change_refund = [
    'https://www.emirates.com/us/english/help/faq-topics/cancelling-or-changing-a-booking/faq/what-is-a-travel-voucher/',
    [_EMIRATES_XPATH],
]
flight_delay = [
    'https://www.emirates.com/us/english/before-you-fly/travel/rules-and-notices/delay-notice/',
    [_EMIRATES_XPATH],
]

items_emirates = [
    # bags
    checked_bagg,
    carried_on,
    delayed_bagg,
    prohibited,
    lost,
    # tickets
    change_refund,
    flight_delay,
]



In [189]:
# performs the scraping
scraped_emirates = scrape_content(items_emirates)
scraped_emirates

['<div id="58538" data-organism="g-o-c-008-4-page-header"><div><div class="e-container--separator"><div data-auto="page-header" class="page-header-util page-header-util__blank-container-size"><div class="e-container--separator page-header-util__content-header page-header-util__dark"><div class="content-header page-header-util__header" id="content-header261396"><p class="content-header__section" aria-hidden="false">Before You Fly</p><h1 class="content-header__text content-header__text--large page-header-util__text reset-h">Checked baggage</h1></div></div></div></div></div></div><div id="58546" data-organism="g-o-c-001-a-text-block-full-width"><div class="e-container e-container--separator"><div class="g-o-c-009-1-a-text-block-full-width"><div class="enhanced-text-block-list g-o-c-009-1-a-text-block-full-width__block-list"><div class="enhanced-text-section enhanced-text-block-list__block g-o-c-009-1-a-text-block-full-width__block"><div class="enhanced-rich-text enhanced-text-section__tex

In [190]:
# write out to txt file to 'scrape' folder
write_txt(scraped_emirates, 'emirates')

Data written to scraped/scraped_emirates.txt successfully!


### All Nippon Airways

In [161]:
# XPATH used by every ANA page below except the refund page
_ANA_XPATH = '/html/body/div/div/div'

# --- baggage (TBD: pets, children, disability) ---
checked_bagg = [
    'https://www.ana.co.jp/en/us/travel-information/baggage-information/checked-baggage/',
    [_ANA_XPATH],
]
carried_on = [
    'https://www.ana.co.jp/en/us/travel-information/baggage-information/carry-on/',
    [_ANA_XPATH],
]
damaged = [
    'https://www.ana.co.jp/en/us/travel-information/baggage-information/damage-lost/',
    [_ANA_XPATH],
]
restricted_bagg = [
    'https://www.ana.co.jp/en/us/travel-information/baggage-information/restricted-prohibited/',
    [_ANA_XPATH],
]

# --- refund & change ---
change_refund = [
    'https://www.ana.co.jp/en/jp/guide/reservation/refund/international/',
    ['/html/body/div[2]/div/div/div[2]'],
]

# --- flight cancellation or delay ---
flight_delay_cancel = [
    'https://www.ana.co.jp/en/us/travel-information/cancellation-and-delays/',
    [_ANA_XPATH],
]

items_ana = [
    # bags
    checked_bagg,
    carried_on,
    damaged,
    restricted_bagg,
    # tickets / flights
    change_refund,
    flight_delay_cancel,
]


In [162]:
# performs the scraping
scraped_ana = scrape_content(items_ana)
scraped_ana

['\n    \n    <div class="header responsivegrid aem-GridColumn aem-GridColumn--default--12">\n\n\n<div class="aem-Grid aem-Grid--12 aem-Grid--default--12 ">\n    \n    <div class="ANA-S001 aem-GridColumn aem-GridColumn--default--12"><div class="fullnewdesignwidth">\n\n\n\t<div class="template_xf_web_variation xfpage page basicpage">\n\n\n<div class="aem-Grid aem-Grid--12 aem-Grid--default--12 ">\n    \n    <div class="ANA-xf-container aem-GridColumn aem-GridColumn--default--12">\n\n\t\t\n        \t<!-- cn-0001 -->\n<header class="header">\n    <div class="header__inner">\n        <div class="header__logo-container">\n            <a href="/en/us/" id="anchor_header_logo">\n                <div class="header__logo">\n                    <img src="/common-layout2/images/ana-logo-stacked.svg" alt="ANA Inspiration of JAPAN">\n                </div>\n                <div class="header__logo_scroll">\n                    <img src="/common-layout2/images/ana_logo_when_scrolled.png" alt="ANA In

In [163]:
# write out to txt file to 'scrape' folder
write_txt(scraped_ana, 'ana')

Data written to scraped/scraped_ana.txt successfully!


### EVA Air

In [154]:
# every EVA Air page below is scraped as a whole through this XPATH
_EVA_XPATH = '//*[@id="mainContent"]/main/div'

# --- baggage (TBD: pets, children, disability) ---
general_bagg = [
    'https://www.evaair.com/en-us/fly-prepare/baggage/free-baggage/general-information/',
    [_EVA_XPATH],
]
checked_bagg = [
    'https://www.evaair.com/en-us/fly-prepare/baggage/free-baggage/checked-baggage/',
    [_EVA_XPATH],
]
carried_on = [
    'https://www.evaair.com/en-us/fly-prepare/baggage/free-baggage/carry-on-baggage/',
    [_EVA_XPATH],
]
damaged = [
    'https://www.evaair.com/en-us/fly-prepare/baggage/delayed-missing-damaged-baggage-and-restrictions/delayed-missing-damaged-baggage/',
    [_EVA_XPATH],
]
restricted_bagg = [
    'https://www.evaair.com/en-us/fly-prepare/baggage/delayed-missing-damaged-baggage-and-restrictions/restrictions/',
    [_EVA_XPATH],
]

# --- refund & change (FAQ page) ---
ticket_refund = [
    'https://www.evaair.com/en-global/customer-services/faq/detail.html?FaqCategories=ticketing',
    [_EVA_XPATH],
]

# --- flight cancellation or delay ---
flight_delay_cancel = [
    'https://www.evaair.com/en-us/customer-services/ticket-changes-refunds-due-to-schedule-changes/',
    [_EVA_XPATH],
]

items_eva = [
    # bags
    general_bagg,
    checked_bagg,
    carried_on,
    damaged,
    restricted_bagg,
    # tickets / flights
    ticket_refund,
    flight_delay_cancel,
]


In [155]:
# performs the scraping
scraped_eva = scrape_content(items_eva)
scraped_eva

['\n    \n\n\n\n<div class="editorA">\n    <h1 class="heading-h1 editorA-heading">Free Baggage</h1>\n    <div class="editor editorA-editor">\n    </div>\n</div>\n\n\n\n\n\n\n    <div class="tabA tab tab--page">\n            <a class="tab-item tab-item--page tab-item--page--active" href="/en-us/fly-prepare/baggage/free-baggage/general-information/"><span class="tab-text">General Information</span></a>\n            <a class="tab-item tab-item--page" href="/en-us/fly-prepare/baggage/free-baggage/checked-baggage/"><span class="tab-text">Checked Baggage</span></a>\n            <a class="tab-item tab-item--page" href="/en-us/fly-prepare/baggage/free-baggage/carry-on-baggage/"><span class="tab-text">Carry-on Baggage</span></a>\n\n        <div class="tab-select">\n            <select class="tab-inputSelect js-selectLink" name="tablink" id="tablink" aria-label="Tab">\n                    <option class="tab-selectItem" value="/en-us/fly-prepare/baggage/free-baggage/general-information/">General 

In [156]:
# write out to txt file to 'scrape' folder
write_txt(scraped_eva, 'eva')

Data written to scraped/scraped_eva.txt successfully!


### Air France
partially unscrapable: only contains general bag and damaged_missing bag

In [58]:
# --- baggage (TBD: pets, children, disability) ---
# both pages are taken as the complete <body>
general_bagg = [
    'https://wwws.airfrance.us/information/bagages/bagage-cabine-soute',
    ['/html/body'],
]
damaged_missing = [
    'https://wwws.airfrance.us/information/bagages/bagage-manquant-airfrance',
    ['/html/body'],
]

items_france = [general_bagg, damaged_missing]



In [55]:
# # performs the scraping
scraped_france = scrape_content(items_france)
scraped_france

 '\n    <bw-app ng-version="17.3.0" class="bw-ubc-information" ng-server-context="ssr"><bwc-page-template cdkmonitorsubtreefocus=""><mat-sidenav-container class="mat-drawer-container mat-sidenav-container bwc-page-template__container"><!----><!----><mat-sidenav-content class="mat-drawer-content mat-sidenav-content"><div class="bwc-page-template__content"><bwc-print-header><mat-toolbar class="mat-toolbar mat-toolbar-single-row"><bwc-logo class="bwc-print-brand-logo" _nghost-server-app-c1494823503=""><img _ngcontent-server-app-c1494823503="" class="bwc-logo bwc-logo--brand-logo" style="width: 100%;" src="https://www.static-af.com/assets/components/34.0.0/af/logo/brand-logo.svg" alt=""></bwc-logo></mat-toolbar><!----><!----></bwc-print-header><bwc-sticky-header class="bwc-page-template__content__sticky-header"><div style="position: absolute; left: -10px; top: -10px; right: 0; bottom: 0; overflow: hidden; z-index: -1; visibility: hidden;"><div style="position: absolute; left: 0; top: 0; tr

In [56]:
# write out to txt file to 'scrape' folder
write_txt(scraped_france, 'france')

Data written to scraped/scraped_france.txt successfully!


### Cathay Pacific: problematic because element unreachable

In [182]:
# --- baggage (TBD: pets, children, disability) ---
# original author's note on this list: "not working for some reason";
# div[2] is not part of the list
general_bagg = [
    'https://www.cathaypacific.com/cx/en_US/baggage.html',
    [f'/html/body/main/div/div[{n}]' for n in (1, 3, 4, 5, 6, 7, 8, 9, 10)],
]
checked_bagg = [
    'https://www.cathaypacific.com/cx/en_US/faqs/baggage/check-in-baggage-allowance/what-is-my-free-checked-baggage-allowance.html',
    ['/html/body/main'],
]
carried_on = [
    'https://www.cathaypacific.com/cx/en_US/faqs/baggage/cabin-baggage-allowance/what-is-my-carry-on-baggage-allowance.html',
    ['/html/body'],
]
delayed_damaged = [
    'https://www.cathaypacific.com/cx/en_US/baggage/lost-and-damaged-baggage.html',
    [
        '/html/body/main/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div/div/div',
        '/html/body/main/div/div[2]/div/div/div/div[1]/div/div/div/div[3]/div/div',
        '/html/body/main/div/div[2]/div/div/div/div[2]/div/div/div',
        '//*[@id="aid_2f6a56f5-8daa-4bcc-9e36-c4715634c97c_contentpar_contentblock_responsivegrid_accordion"]/div',
    ],
]
restricted_bagg = [
    'https://www.cathaypacific.com/cx/en_US/baggage/controlled-and-banned-items/controlled-items.html',
    ['/html/body/main'],
]
banned = [
    'https://www.cathaypacific.com/cx/en_US/baggage/controlled-and-banned-items/banned-items.html',
    ['/html/body/main'],
]
liquid_stuff = [
    'https://www.cathaypacific.com/cx/en_US/baggage/controlled-and-banned-items/liquids-aerosols-and-gels.html',
    ['/html/body/main'],
]
battery = [
    'https://www.cathaypacific.com/cx/en_US/baggage/controlled-and-banned-items/lithium-batteries.html',
    ['/html/body/main'],
]

# --- refund & change ---
change_refund_cancel_refund = [
    'https://www.cathaypacific.com/cx/en_US/book-a-trip/change-cancel-refund.html',
    ['/html/body/div[1]/div/div/div[1]/div[3]'],
]

# --- flight cancellation or delay ---
flight_delay_cancel = [
    'https://www.cathaypacific.com/cx/en_US/prepare-trip/flight-delays-and-cancellations.html',
    ['/html/body/main'],
]

items_cathay = [
    # bags
    general_bagg,
    checked_bagg,
    carried_on,
    delayed_damaged,
    restricted_bagg,
    banned,
    liquid_stuff,
    battery,
    # tickets / flights
    change_refund_cancel_refund,
    flight_delay_cancel,
]


In [183]:
# performs the scraping
scraped_cathay = scrape_content(items_cathay)
scraped_cathay

['\n<div class="container   margin-bottom-40 ">\n        \n        \n            \n                \n                \n\t\n    \n        <div class="section-wrapper-bt green-title-grey-description">\n            <div class="container   ">\n                <div class="abstract">\n                    <div class="wrapper">\n                        \n                        \n                        <div class="wrapper">\n                            <h1 class="title">Baggage information</h1>\n                            \n                                <div class="description">\n                                    \n                                        <p>Your baggage allowance is listed on your e-ticket receipt and in\u202f<a href="/content/cx/en_US/manage-booking.html">Manage Booking</a>. Alternatively, you may use the calculator below to check your baggage allowance.</p>\n<p>Note:</p>\n<ul>\n<li>If you have upgraded to a higher cabin using Upgrade Bid, your baggage allowance is stil

In [184]:
# write out to txt file to 'scrape' folder
write_txt(scraped_cathay, 'cathay')

Data written to scraped/scraped_cathay.txt successfully!


### Turkish Airlines
partially scraped: last 2 policies unscrapable

In [191]:
# --- baggage (TBD: pets, children, disability) ---
checked_bagg = [
    'https://www.turkishairlines.com/en-us/any-questions/free-baggage/',
    ['//*[@id="page_wrapper"]'],  # whole page
]
carried_on = [
    'https://www.turkishairlines.com/en-us/any-questions/carry-on-baggage/',
    ['//*[@id="page_wrapper"]'],
]
damaged = [
    'https://www.turkishairlines.com/en-int/any-questions/lost-and-delayed-baggage/',
    ['//*[@id="tcm508-17624"]/div'],  # whole page
]

items_turkish = [checked_bagg, carried_on, damaged]


In [192]:
# performs the scraping
scraped_turkish = scrape_content(items_turkish)
scraped_turkish

 ' <!-- Component information [\'Component ID\': \'tcm:40-284616\', \'Component last modified date-time\': \'3/13/2024 2:00:28 PM\', \'Component template ID\': \'tcm:40-16213-32\', \'Component template last modified date-time\': \'11/28/2022 7:52:08 AM\', \'Publish time\': \'3/14/2024 12:10:42 AM\'] --> <nav class="subnav navbar navbar-default navber-lower bg-light position-sticky top-90 navbar-expand-lg affix-mbl" role="navigation" data-spy="affix" data-offset-top="200"> <div class="container-fluid"> <div class="navbar-header"> <button type="button" class="navbar-toggler subnav-toggle" data-bs-toggle="collapse" data-bs-target=".navbar-collapse-subnav"> <i class="fa fa-chevron-down red"></i> </button> <h4 class="subnav-brand d-block d-sm-none"></h4> </div> <div class="navbar-collapse navbar-collapse-subnav collapse xs-noppading"> <ul class="nav navbar-nav navbar-center xs-full-width"> <li> <a href="/en-us/any-questions/free-baggage/">Checked baggage</a> </li> <li> <a href="/en-us/any-q

In [193]:
# write out to txt file to 'scrape' folder
write_txt(scraped_turkish, 'turkish')

Data written to scraped/scraped_turkish.txt successfully!


### Singapore Airlines

In [72]:
# XPATH used by every Singapore Airlines page below except delay_damaged
_SIA_XPATH = '//*[@id="container"]/div[5]/div/div[1]'

# --- baggage (TBD: pets, children, disability) ---
checked_bagg = [
    'https://www.singaporeair.com/en_UK/us/travel-info/baggage/checked-baggage/',
    [_SIA_XPATH],
]
carried_on = [
    'https://www.singaporeair.com/en_UK/us/travel-info/baggage/cabin-baggage/',
    [_SIA_XPATH],
]
# a narrower alternative for delay and damaged that is currently not used:
#   '//*[@id="container"]/div[5]/div/div[1]/div[2]/div[2]/div[2]/div'
delay_damaged = [
    'https://www.singaporeair.com/en_UK/us/travel-info/baggage/delayed-or-damaged-baggage/',
    ['//*[@id="container"]/div[5]/div'],
]
restricted_bagg = [
    'https://www.singaporeair.com/en_UK/us/travel-info/baggage/baggage-restrictions/',
    [_SIA_XPATH],  # whole page
]

# --- refund & change (FAQ) ---
cancel_refund = [
    'https://www.singaporeair.com/en_UK/us/travel-info/charges-changes/cancellations-refunds/',
    [_SIA_XPATH],
]

# --- flight cancellation or delay ---
flight_delay = [
    'https://www.singaporeair.com/en_UK/us/travel-info/customer-commitment/tarmac-delay-contingency-plan/',
    [_SIA_XPATH],
]

items_singapore = [
    checked_bagg,
    carried_on,
    delay_damaged,
    restricted_bagg,
    cancel_refund,
    flight_delay,
]

In [73]:
# performs the scraping
scraped_singapore = scrape_content(items_singapore)
scraped_singapore

['<div class="blk-heading">\n<h1 class="main-heading">Checked baggage</h1>\n</div>\n<div class="static-details__content editor"><p>When you book a flight with us, your free baggage allowance will be indicated on your ticket. Your checked baggage allowance is based on the weight or piece concept, depending on your final destination.&nbsp;</p><hr style="clear: both"><h2 id="baggagetips">Baggage tips</h2><p>Here are some tips on how you can ensure that your checked baggage arrives safely at your destination:</p>\n<p>&nbsp;</p>\n<p><strong>While Packing</strong></p>\n<ul>\n<li>Consider purchasing a&nbsp;<a href="https://www.singaporeair.com/en_UK/sg/plan-travel/privileges/travel-insurance/" target="_blank" rel="noopener">travel insurance</a> that covers the loss, delay or damage of baggage and personal belongings before you embark on your trip.</li>\n<li>Consider including a tracking device in your baggage.</li>\n<li>Pack within your baggage allowance. Charges will be levied for checked ba

In [75]:
# write out to txt file to 'scrape' folder
write_txt(scraped_singapore, 'singapore')

Data written to scraped/scraped_singapore.txt successfully!


### Qatar Airways

In [76]:
# --- baggage ---
general_bagg = [
    'https://www.qatarairways.com/en/baggage/allowance.html',
    ['//*[@id="main"]/div[2]/div'],  # table
]
mishandled = [
    'https://www.qatarairways.com/en/baggage/mishandled.html',
    ['//*[@id="main"]/div[2]/div'],
]
restricted_bagg = [
    'https://www.qatarairways.com/en/baggage/restricted.html',
    ['//*[@id="main"]/div[2]'],
]

# --- refund & change ---
penalties_charge = [
    'https://www.qatarairways.com/tradeportal/en/bookingnticketing/Penalties-and-Charges.html',
    ['//*[@id="main"]/div[2]/div'],
]

# --- flight cancellation or delay ---
flight_delay = [
    'https://www.qatarairways.com/en/legal/eu-air-passenger-rights.html',
    [
        '//*[@id="accordion-modified"]/div[1]',  # delay
        '//*[@id="accordion-modified"]/div[2]',  # cancel
    ],
]

items_qatar = [
    general_bagg,
    mishandled,
    restricted_bagg,
    penalties_charge,
    flight_delay,
]

In [77]:
# performs the scraping
scraped_qatar = scrape_content(items_qatar)
scraped_qatar

['\n        <div class="heroimage-teardrop-smallerversion section">\n\n\n\n\n<!-- Ribbon Style Left/Right -->\n\n\n\n\n\t<input type="hidden" value="undefined.html" id="boxeverdefaultpostlink">\n\n<input type="hidden" class="boxeverheroimageCTAText">\n<input type="hidden" class="boxeverHeroimagelinkType">\n<input type="hidden" class="boxeverHeroImagebuttonInternalUrl">\n<input type="hidden" class="boexverheroimagebuttonExternalUri">\n\n\n\n\n<input type="hidden" class="isBoxEverPostContent" value="false">\n<input type="hidden" id="isBoxeverDesPath" value="/content/global/en/destinations/repository">\n<input type="hidden" class="boxeverCurrencyCode" value="QAR">\n    \n\n\n   \n   \n   <div>\n      <script>\n!function(a,b){var c=b(a,a.document);a.lazySizes=c,"object"==typeof module&&module.exports&&(module.exports=c)}(window,function(a,b){"use strict";if(b.getElementsByClassName){var c,d=b.documentElement,e=a.Date,f=a.HTMLPictureElement,g="addEventListener",h="getAttribute",i=a[g],j=a.s

In [78]:
# write out to txt file to 'scrape' folder
write_txt(scraped_qatar, 'qatar')

Data written to scraped/scraped_qatar.txt successfully!


## Added top 5

In [20]:
# Folder that receives the scraped_<airline>.txt files unless the caller overrides it.
_DEFAULT_OUTPUT_DIR = '/Users/kay/Desktop/nlp/nlp_airline_project/policy_data/new_scraped_txt'


def write_txt(list, filename, output_dir=_DEFAULT_OUTPUT_DIR):
    '''
    Concatenate the scraped strings of one airline and save them as a txt file.

    Input:
        list: the list of html strings returned by scrape_content
              (the name shadows the builtin; it is kept so existing calls keep working)
        filename: airline tag, the file is named scraped_<filename>.txt
        output_dir: folder to write into, defaults to the project folder above;
                    the folder must already exist
    Output: nothing is returned; a txt file is written in which every string is
            followed by the separator "|,|,|" + newline + "|", and the path is printed

    '''
    import os  # local import so the cell does not depend on earlier notebook cells

    # Define the file path
    file_path = os.path.join(output_dir, f'scraped_{filename}.txt')

    # Scraped pages contain non-ASCII characters (curly quotes, dashes), so the
    # encoding is fixed to utf-8 instead of relying on the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        # One separator after every element, including the last one
        file.writelines(element + '|,|,|\n|' for element in list)
    print(f"Data written to {file_path} successfully!")

### American Airlines

In [89]:
# Baggage pages. Still to be done: pets, children, disability.
# Each entry pairs a page URL with the list of XPaths to pull from that page;
# all six entries use the same XPath.
checked_bagg = [
    'https://www.aa.com/i18n/travel-info/baggage/checked-baggage-policy.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]
carried_on = [
    'https://www.aa.com/i18n/travel-info/baggage/carry-on-baggage.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]
delay_damaged = [
    'https://www.aa.com/i18n/travel-info/baggage/delayed-or-damaged-baggage.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]
restricted_bagg = [
    'https://www.aa.com/i18n/travel-info/baggage/restricted-items.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]

# Refund & change
refund = [
    'https://www.aa.com/i18n/customer-service/contact-american/receipts-and-refunds.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]

# Flight cancellation or delay
flight_delay = [
    'https://www.aa.com/i18n/customer-service/contact-american/delayed-or-canceled-flights.jsp',
    ['//*[@id="aa-content-frame"]/section[2]'],
]

# Everything to scrape for American Airlines, in output order
items_aa = [
    checked_bagg,
    carried_on,
    delay_damaged,
    restricted_bagg,
    refund,
    flight_delay,
]

In [90]:
# Run the scraper over every American Airlines entry in items_aa.
# The bare name on the last line makes the notebook display the result.
scraped_aa = scrape_content(items_aa)
scraped_aa

['\n\t\t\t<h1 itemprop="name">Checked bag policy</h1>\n\t\t\t\n\n\t\t\t<!-- Special Notices -->\n\t\t\t\n\t\t\t\t\t<section class="message-info">\n\t\t\t\t\t\t<h2 class="header">Checked bag allowances</h2>\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p>Changes to bag allowances and fees have been updated as of February 20, 2024.</p>\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p>Travel within / between the U.S., Puerto Rico, and U.S. Virgin Islands – 1st checked bag fee is $40 ($35 if you pay online) and the 2nd checked bag fee is $45.</p>\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p>Travel to / from Canada, Caribbean, Mexico, Central America, and Guyana – 1st checked bag fee is $35 and the 2nd checked bag fee is $45.</p>\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p>All bag fees are non-refundable and apply per person, at each check-in location, each way, even if you purchase or get an upgrade that includes free checked bags. If you believe you\'ve been incorrectly charged for bag fees, contact an American representative for help or file a 

In [92]:
# Save the scraped American Airlines strings as scraped_aa.txt via write_txt
write_txt(scraped_aa, 'aa')

Data written to /Users/kay/Desktop/nlp/nlp_airline_project/policy_data/new_scraped_txt/scraped_aa.txt successfully!


### Spirit Airlines: unable to scrape

In [42]:
# # baggage: to be done: pets, children, disability
                
# bag_weight_size = ['https://customersupport.spirit.com/en-us/category/article/KA-01143',
#                 ['/html/body'
#                 ]]

# mishandle = ['https://customersupport.spirit.com/en-us/category/article/KA-01490',
#            ['/html/body'
#             ]
#            ]

# restricted_bagg = ['https://customersupport.spirit.com/en-us/category/article/KA-01142',
#                    ['/html/body'
#                    ]]

# # refund & change
# cancel = ['https://customersupport.spirit.com/en-us/category/article/KA-01518',
#           [
#               '/html/body'
#           ]]
# refund_cancel = ['https://customersupport.spirit.com/en-us/category/article/KA-01195',
#                 ['/html/body']] 

# # flight cancel of delay: no info?
                       

# items_spirit = [bag_weight_size, mishandle, restricted_bagg, cancel, refund_cancel]

In [43]:
# performs the scraping
# scraped_spirit = scrape_content(items_spirit)
# scraped_spirit



In [44]:
# # write out to txt file to 'scrape' folder
# write_txt(scraped_spirit, 'spirit')

Data written to /Users/kay/Desktop/nlp/nlp_airline_project/policy_data/new_scraped_txt/scraped_spirit.txt successfully!


### United Airlines

In [84]:
# Baggage pages. Still to be done: pets, children, disability.
# Each entry pairs a page URL with the list of XPaths to pull from that page.
checked_bagg = [
    'https://www.united.com/en/us/fly/baggage/checked-bags.html',
    ['/html/body/div[1]/div/div/div/div[3]/div[2]/div/div/main'],
]
carried_on = [
    'https://www.united.com/en/us/fly/baggage/carry-on-bags.html',
    ['/html/body/div[1]/div/div/div/div[3]/div[2]/div/div/main/div'],
]
basic_economy = [
    'https://www.united.com/en/us/fly/travel/inflight/basic-economy.html#your-bags',
    ['/html/body'],
]
delay_damaged = [
    'https://www.united.com/en/us/baggage/issues-with-your-checked-bags',
    ['/html/body'],
]

# Restricted items and refund & change are intentionally absent: the only URLs
# on hand for them are aa.com (American Airlines) pages, which must not be
# mixed into United's data.
# TODO: add the united.com restricted-items and refund pages.

# Flight cancellation or delay
flight_delay = [
    'https://www.united.com/en/us/fly/travel/missed-delayed-or-canceled-flights.html',
    ['/html/body/div[1]/div/div/div/div[3]/div[2]/div/div/main/div'],
]

# Everything to scrape for United Airlines; united.com pages only
items_ua = [
    checked_bagg,
    carried_on,
    basic_economy,
    delay_damaged,
    flight_delay,
]

In [85]:
# Run the scraper over every United Airlines entry in items_ua.
# The bare name on the last line makes the notebook display the result.
scraped_ua = scrape_content(items_ua)
scraped_ua

['<div class="atm-l-container"><h1 class="atm-c-heading" id="main-heading">Checked bags</h1></div><div class="region-fly-components"><div id="fly-component-0" class="fly-component"><div><div class="atm-c-text-passage flyAtmosTextPassage"><div class="atm-c-text-passage__inner"></div></div></div></div><div id="fly-component-1" class="fly-component"><div class="atm-l-container"><div class="atm-c-text-passage flyAtmosTextPassage"><div class="atm-l-linelength-container"><div class="sdl-el-body"><p xmlns="http://www.w3.org/1999/xhtml">Check your bags in just a few easy steps.</p></div></div></div></div></div><div id="fly-component-2" class="fly-component"><div class="atm-l-container"><div class="fly-grid fly-grid--colgap-small atm-u-margin-bottom-medium atm-u-margin-top-large"><div class="fly-grid__item grid-colspan-3 grid-colspan-12--mobile"><div class="fly-sub-components"><div class="fly-sub-component"><a class="fly-link-block__link" href="#size-weight"><div class="fly-link-block fly-link-

In [86]:
# Save the scraped United Airlines strings as scraped_ua.txt via write_txt
write_txt(scraped_ua, 'ua')

Data written to /Users/kay/Desktop/nlp/nlp_airline_project/policy_data/new_scraped_txt/scraped_ua.txt successfully!


### Frontier Airlines: unable to scrape

In [58]:
# # baggage: to be done: pets, children, disability
                
# checked_bagg = ['https://www.united.com/en/us/fly/baggage/carry-on-bags.html',
#                 ['/html/body'
#                 ]]
# carried_on = ['https://www.united.com/en/us/fly/baggage/checked-bags.html',
#               ['/html/body']
#               ]
# basic_economy = ['https://www.united.com/en/us/fly/travel/inflight/basic-economy.html#your-bags',
#                  ['/html/body'
#                  ]]
# delay_damaged = ['https://www.united.com/en/us/baggage/issues-with-your-checked-bags',
#            ['/html/body'
#             ]
#            ]

# restricted_bagg = ['https://www.aa.com/i18n/travel-info/baggage/restricted-items.jsp',
#                    ['/html/body'
#                    ]]

# # refund & change
# refund = ['https://www.aa.com/i18n/customer-service/contact-american/receipts-and-refunds.jsp',
#                 ['/html/body']] 

# # flight cancel of delay
# flight_delay = ['https://www.united.com/en/us/fly/travel/missed-delayed-or-canceled-flights.html',
#                        [
#                          '/html/body'  
#                        ]]

# items_frontier = [
#     checked_bagg,
#             #  carried_on,
#             #  basic_economy, delay_damaged, restricted_bagg, 
#             #        refund,
#             #        flight_delay
#                    ]

In [60]:
# # performs the scraping
# scraped_frontier = scrape_content(items_frontier)
# scraped_frontier

In [61]:
# # write out to txt file to 'scrape' folder
# write_txt(scraped_frontier, 'frontier')

### Delta Air Lines: delayed-baggage page incomplete

In [77]:
# Baggage: the first three pages of the checked-baggage section, then carry-on.
# Each entry pairs a page URL with the list of XPaths to pull from that page.
checked_embargo = [
    'https://www.delta.com/us/en/baggage/checked-baggage/embargoes-restrictions',
    ['/html/body/div[3]/div[2]/div[2]/div/div/div/main'],
]
checked_first = [
    'https://www.delta.com/us/en/baggage/checked-baggage/first-checked-bag-free',
    ['//*[@id="maincontent"]'],
]
checked_overweight = [
    'https://www.delta.com/us/en/baggage/checked-baggage/excess-overweight-baggage',
    ['//*[@id="maincontent"]'],
]
carried_on = [
    'https://www.delta.com/us/en/baggage/carry-on-baggage',
    ['//*[@id="maincontent"]'],
]

# Delayed / lost / damaged baggage.
# The page is too long to be processed, so this entry is incomplete.
delay_damaged = [
    'https://www.delta.com/us/en/baggage/delayed-lost-damaged-baggage',
    ['//*[@id="maincontent"]/div[1]/div/div/div/div'],
]

# Prohibited or restricted items
restricted_bagg = [
    'https://www.delta.com/us/en/baggage/prohibited-or-restricted-items/overview',
    ['//*[@id="maincontent"]/div[5]/div/div/div/div'],
]

# Refund & change
cancel_refund = [
    'https://www.delta.com/us/en/change-cancel/cancel-flight',
    ['//*[@id="maincontent"]'],
]

# Flight cancellation or delay
flight_delay = [
    'https://www.delta.com/us/en/change-cancel/delayed-or-canceled-flight',
    ['//*[@id="maincontent"]'],
]

# Everything to scrape for Delta, in output order
items_delta = [
    checked_embargo,
    checked_first,
    checked_overweight,
    carried_on,
    delay_damaged,
    restricted_bagg,
    cancel_refund,
    flight_delay,
]

In [78]:
# Run the scraper over every Delta entry in items_delta.
# The bare name on the last line makes the notebook display the result.
scraped_delta = scrape_content(items_delta)
scraped_delta

['\n                  \n                  \n\n\n\n\n    \n    \n    <div class="intro">\n\n\n\n\n<!-- Below is for publish/author preview -->\n<div class="container">\n   <div class="introcomponent">\n      <div class="col-12 pl-0">\n         <div class="content-block">\n            <div>\n               <span class="h4">Checked Baggage</span>\n            </div>\n            \n            <div>\n               <span class="h1"><h1>Embargoes &amp; Limitations</h1>\n</span>\n               \n            </div>\n            \n            \n               <hr class="redlinerule">\n            \n            \n            <div class="introdescription intro-body">\n               <div class=" introDescText">\n                  <p>Whether it’s due to weather, weight restrictions or limited baggage space, some airports around the world impose baggage limitations or embargoes. Review current embargoes for your next destination. Unless otherwise noted, Delta will accept up to 2 standard <a></a><

In [79]:
# Save the scraped Delta strings as scraped_delta.txt via write_txt
write_txt(scraped_delta, 'delta')

Data written to /Users/kay/Desktop/nlp/nlp_airline_project/policy_data/new_scraped_txt/scraped_delta.txt successfully!
