# Split/Chunk text
- markdown splitter
- smaller chunks
- semantic chunking

In [231]:
# !pip install -qU langchain-text-splitters

In [232]:
from langchain_text_splitters import HTMLHeaderTextSplitter
import pandas as pd
from bs4 import BeautifulSoup
import os
import re

#### Load the files

In [233]:
def load_files(path):
    """
    Read every ``scraped_<name>.txt`` file found in ``path``.

    Each file is split into pages on the ``'|,|,|\\n|'`` separator, and each
    page is split again on ``'\\t|'``; all resulting fragments of one file are
    collected in a single flat list.

    Input: path of the directory that holds the scraped text files
    Output: a tuple ``(new_list, file_dict)`` where ``new_list`` holds one list
            of string fragments per file and ``file_dict`` maps the position of
            a file in ``new_list`` to the ``<name>`` part of its filename

    NOTE(review): files are visited in ``os.listdir`` order, which is not
    guaranteed to be stable across machines. Anything that indexes the result
    by a fixed position depends on that order.
    """
    pattern = r'scraped_(.+)\.txt'
    new_list = []  # one list of fragments per airline file
    filenames = []
    for entry in os.listdir(path):
        match = re.fullmatch(pattern, entry)
        if match is None:
            # Skip unrelated directory entries (e.g. .DS_Store) instead of
            # crashing on ``None.group``.
            continue
        filenames.append(match.group(1))

        # Open the file inside ``path`` rather than a hard-coded directory.
        with open(os.path.join(path, entry)) as f:
            pages = f.read().split('|,|,|\n|')  # split to each page

        # Drop one empty page (typically left behind by a trailing separator),
        # but only when there is one to drop.
        if '' in pages:
            pages.remove('')

        fragments = []  # all fragments of this airline, in page order
        for page in pages:
            fragments.extend(page.split('\t|'))
        new_list.append(fragments)

    file_dict = dict(enumerate(filenames))

    return new_list, file_dict


In [234]:
# file_list: one list of string fragments per scraped file;
# file_dict: position in file_list -> name captured from the filename.
file_list, file_dict = load_files('scraped')

#### Clean, Split each file by headers, and transform to table

In [235]:
def parse_html(list):
    '''
    Flatten scraped HTML fragments and turn their tables into plain text.

    Every ``<table>`` in a fragment is replaced, at its original position, by
    a ``<p>`` whose text has one line per table row and the cell texts of that
    row separated by spaces. The fragment is then rebuilt by concatenating the
    string form of every tag it contains.

    Input: a list of strings, each string is an HTML fragment
    Output: a list of strings of the same length, one rebuilt fragment each
    '''
    clean_text = []
    for string in list:
        # Parse the HTML string
        soup = BeautifulSoup(string, 'html.parser')

        # Replace each table in place. Re-querying after every replacement
        # means only tables still attached to the tree are processed.
        table = soup.find('table')
        while table is not None:
            table_text = ""
            # Only rows of *this* table, not every <tr> in the document.
            for row in table.find_all('tr'):
                for cell in row.find_all(['th', 'td']):
                    table_text += cell.get_text() + " "
                table_text += "\n"

            paragraph = soup.new_tag('p')
            paragraph.string = table_text
            table.replace_with(paragraph)

            table = soup.find('table')

        # NOTE(review): find_all(True) yields nested tags as well, so the
        # markup of a nested tag appears once inside its parent and once on
        # its own. Kept as-is because the later per-airline cleanup was
        # written against this output.
        cleaned_content = ""
        for content in soup.find_all(True):
            cleaned_content += str(content)
        clean_text.append(cleaned_content)

    return clean_text
   

In [236]:
# testing: it works for all but components[1]
# parse_html(file_list[0])

In [237]:
def split_header(list):
    """
    Split HTML strings into header-scoped documents.

    Input: a list of HTML strings
    Output: one flat list with the documents that HTMLHeaderTextSplitter
            produces for each string, in input order
    """
    # (tag, metadata key) pairs for the heading levels h1 to h5
    header_levels = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
        ("h5", "Header 5")
    ]

    documents = []
    for html in list:
        splitter = HTMLHeaderTextSplitter(headers_to_split_on=header_levels)
        documents += splitter.split_text(html)

    return documents

In [238]:
# split_header(parse_html(file_list[0]))

In [239]:
def to_table(list):
    """
    Turn split documents into a DataFrame with one row per document.

    Input: a list of objects exposing ``metadata`` (a dict) and
           ``page_content`` (the chunk text)
    Output: a DataFrame with one column per metadata key plus a ``content``
            column; keys missing from a document are NaN in its row. An empty
            input gives an empty DataFrame.
    """
    # Copy each metadata dict so the caller's documents are not modified, and
    # build the frame once instead of concatenating row by row.
    records = [
        {**document.metadata, 'content': document.page_content}
        for document in list
    ]
    return pd.DataFrame(records)

In [240]:
# to_table(split_header(parse_html(file_list[0])))

## Apply to all files and store the tables in a list for further processing

In [241]:
# Run parse -> split -> tabulate for every airline and collect the frames.
df_list = []
# enumerate gives the position directly; looking it up with
# file_list.index(file) returns the first *equal* list, which would attach the
# wrong airline name if two airlines had identical page lists.
for index, file in enumerate(file_list):
    # processing
    airline = parse_html(file) # parse all urls (XPATHs) for an airline
    airline_split = split_header(airline) # to documents
    df_airline = to_table(airline_split)

    # adjust column order: move 'content' behind the header columns
    # (an airline without any documents has no 'content' column)
    if 'content' in df_airline.columns:
        df_airline['content'] = df_airline.pop('content')

    # append airline name
    df_airline['Airline'] = file_dict[index]

    # append dataframe
    df_list.append(df_airline)

len(df_list)

10

## File based adjustment
1. Air France

In [242]:
# df_list[0]

In [243]:
# drop rows that have values in header 4 since it's actually footer
footer_rows = df_list[0]['Header 4'].notna()
df_list[0] = df_list[0].drop(index=df_list[0].index[footer_rows])
# drop rows that have no header 1
untitled_rows = df_list[0]['Header 1'].isna()
df_list[0] = df_list[0].drop(index=df_list[0].index[untitled_rows])

In [244]:
# df_list[0]['Header 1'].unique()

2. Korean Air

In [245]:
# df_list[1]

In [246]:
# drop rows without a header 1
# (comparing a Series with `== None` is always False in pandas, so the
# missing-value test has to be isna())
untitled_rows = df_list[1]['Header 1'].isna()
df_list[1] = df_list[1].drop(index=df_list[1].index[untitled_rows])
# clean checked bag: drop the 'Go to SNS' / 'Go to App' sections
link_rows = df_list[1]['Header 2'].isin(['Go to SNS', 'Go to App'])
df_list[1] = df_list[1].drop(index=df_list[1].index[link_rows])

In [247]:
# df_list[1]

3. Singapore Airlines

In [248]:
# df_list[2]

4. Qatar Airways

In [249]:
# df_list[3]

5. Turkish Airlines

In [250]:
# df_list[4]

In [251]:
# clean empty lines
df_list[4] = df_list[4].drop(df_list[4][(pd.isna(df_list[4]['Header 1']) & pd.isna(df_list[4]['Header 2']) | (df_list[4]['Header 3'] == 'Checked baggage calculation tool'))].index) 
# refill header 1
df_list[4].loc[(pd.isna(df_list[4]['Header 1'])) | (df_list[4]['Header 1'] == ''), 'Header 1'] = df_list[4].loc[(pd.isna(df_list[4]['Header 1'])) | (df_list[4]['Header 1'] == ''), 'Header 2']

In [252]:
# df_list[4]['Header 1'].unique()

6. Japan Airlines

In [253]:
# df_list[5]

In [254]:
# drop rows without a header 1
untitled_rows = df_list[5]['Header 1'].isna()
df_list[5] = df_list[5].drop(index=df_list[5].index[untitled_rows])

In [255]:
# df_list[5]['Header 1'].unique()

7. Emirates

In [256]:
# df_list[6]

In [257]:
# drop rows without a header 1
# (the former extra clause "header 1, 2 and 3 all missing" was already
# implied by "header 1 missing", so it selected no additional rows)
untitled_rows = df_list[6]['Header 1'].isna()
df_list[6] = df_list[6].drop(index=df_list[6].index[untitled_rows])

In [258]:
# df_list[6]['Header 1'].unique()

8. EVA

In [259]:
# df_list[7]

In [260]:
# drop rows where header 1, header 2 and header 3 are all missing
no_headers = (
    df_list[7]['Header 1'].isna()
    & df_list[7]['Header 2'].isna()
    & df_list[7]['Header 3'].isna()
)
df_list[7] = df_list[7].drop(index=df_list[7].index[no_headers])
# drop rows where both header 2 and header 3 are missing
# (this condition also covers every row matched by the step above)
no_sub_headers = df_list[7]['Header 2'].isna() & df_list[7]['Header 3'].isna()
df_list[7] = df_list[7].drop(index=df_list[7].index[no_sub_headers])

In [261]:
# df_list[7]['Header 1'].unique()

9. ANA 

In [262]:
# df_list[8]

In [263]:
# drop rows without a header 1 and the 'Search ANA' section
untitled_rows = df_list[8]['Header 1'].isna()
search_rows = df_list[8]['Header 1'] == 'Search ANA'
df_list[8] = df_list[8].drop(index=df_list[8].index[untitled_rows | search_rows])

In [264]:
# df_list[8]['Header 1'].unique()

10. Cathay Pacific

In [265]:
# df_list[9]

In [266]:
# drop rows without a header 1
# (the former extra clause "headers 1 to 4 all missing" was already implied
# by "header 1 missing", so it selected no additional rows)
untitled_rows = df_list[9]['Header 1'].isna()
df_list[9] = df_list[9].drop(index=df_list[9].index[untitled_rows])
# drop the 'Helpful links' section, whether it shows up as header 2 or as content
helpful_links = (df_list[9]['Header 2'] == 'Helpful links') | (df_list[9]['content'] == 'Helpful links')
df_list[9] = df_list[9].drop(index=df_list[9].index[helpful_links])

In [267]:
# df_list[9]['Header 1'].unique()

## Concat all dataframes

In [268]:
# Stack the per-airline frames into one table with a fresh 0..n-1 index, then
# replace missing values with '' so the columns can be joined as strings.
df_final = pd.concat(df_list, ignore_index=True).fillna('')

df_final.head()

Unnamed: 0,Header 1,Header 3,Header 4,content,Airline,Header 2,Header 5
0,"Weight, size, and baggage allowance",,,Hand baggage \nChecked baggage \nHow many ba...,france,,
1,"Weight, size, and baggage allowance",What is my baggage allowance?,,Your hand baggage allowance and weight restric...,france,,
2,"Weight, size, and baggage allowance",Hand baggage dimensions,,The maximum dimensions of your hand baggage ar...,france,,
3,"Weight, size, and baggage allowance",Your hand baggage weight,,"In the Economy or Premium Economy cabins, your...",france,,
4,"Weight, size, and baggage allowance",Authorized personal items,,Soft plastic bags and packaging made with rudi...,france,,


In [269]:
# concat by columns for each row and drop duplicate columns
df_final['Concat'] = df_final['Header 3'] + '\n' + df_final['Header 4'] + '\n' + df_final['Header 5'] + '\n' + df_final['content']
df_final.drop(columns=['Header 3', 'Header 4', 'Header 5', 'content'], inplace=True)

df_final.head()


Unnamed: 0,Header 1,Airline,Header 2,Concat
0,"Weight, size, and baggage allowance",france,,\n\n\nHand baggage \nChecked baggage \nHow m...
1,"Weight, size, and baggage allowance",france,,What is my baggage allowance?\n\n\nYour hand b...
2,"Weight, size, and baggage allowance",france,,Hand baggage dimensions\n\n\nThe maximum dimen...
3,"Weight, size, and baggage allowance",france,,Your hand baggage weight\n\n\nIn the Economy o...
4,"Weight, size, and baggage allowance",france,,Authorized personal items\n\n\nSoft plastic ba...


In [271]:
# concat by rows for each Header 1 & Header 2
df_final = df_final.groupby(['Airline', 'Header 1', 'Header 2'], as_index=False)['Concat'].apply(lambda x:'\n\n'.join(x))
df_final = df_final.drop_duplicates()  
df_final.head()

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,ana,Carry-On Items,,\n\n\nHome Travel Information Baggage Informat...
1,ana,Carry-On Items,Information for Carry-On Items and Onboard Rules,"\n\n\nHere, you'll find guidelines for carry-o..."
2,ana,Changes and Refunds (International Flights),,\n\n\nVoluntary Changes and Refunds Reservatio...
3,ana,Changes and Refunds (International Flights),Involuntary Changes and Refunds due to ANA's r...,\n\n\nIf flight delays or cancellations occur ...
4,ana,Changes and Refunds (International Flights),Involuntary Changes and Refunds due to Irregul...,\n\n\nCustomers who experience delays and canc...


In [272]:
# temporarily fill header 1 with header 2 if header 1 is empty
df_final.loc[(df_final['Header 1'] == ''), 'Header 1'] = df_final.loc[(df_final['Header 1'] == ''), 'Header 2']

In [273]:
# Display the frame to check the result before it is written out.
df_final

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,ana,Carry-On Items,,\n\n\nHome Travel Information Baggage Informat...
1,ana,Carry-On Items,Information for Carry-On Items and Onboard Rules,"\n\n\nHere, you'll find guidelines for carry-o..."
2,ana,Changes and Refunds (International Flights),,\n\n\nVoluntary Changes and Refunds Reservatio...
3,ana,Changes and Refunds (International Flights),Involuntary Changes and Refunds due to ANA's r...,\n\n\nIf flight delays or cancellations occur ...
4,ana,Changes and Refunds (International Flights),Involuntary Changes and Refunds due to Irregul...,\n\n\nCustomers who experience delays and canc...
...,...,...,...,...
332,turkish,Cabin baggage terms and materials that are all...,Cabin baggage terms and materials that are all...,\n\n\nThe baggage you are carrying must be of ...
333,turkish,Checked baggage,Checked baggage,"\n\n\nTo avoid any problems on your journey, y..."
334,turkish,"LOST, DAMAGED, AND DELAYED BAGGAGE",,\n\n\nLost Baggage Means of application for yo...
335,turkish,Materials prohibited in the cabin and in cabin...,Materials prohibited in the cabin and in cabin...,\n\n\nThere are some restrictions on items tha...


## Write to csv

In [274]:
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
output_csv = '/Users/kay/Desktop/nlp/reviews_scrape/policy/chunking/policy_chunks.csv'
# index=False keeps the row index out of the file.
df_final.to_csv(output_csv, index=False)
print("Scraped word written to policy_chunks.csv")

Scraped word written to policy_chunks.csv
