# Simplify headers so that the nodes are cut down

In [23]:
import pandas as pd
import numpy as np

### Read in data

In [4]:
df = pd.read_csv('/Users/kay/Desktop/nlp/nlp_airline_project/data/clean/policy_chunks.csv')
df.head()

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,aa,Carry-on bags,1 personal item and 1 carry-on,Personal item\n\n\nYour personal item like a p...
1,aa,Carry-on bags,Know what you can carry on,\n\n\nThere are some items that can only trave...
2,aa,Carry-on bags,Special notice,\n\n\nCustomers flying Basic Economy are now a...
3,aa,Carry-on bags,You may also like...,Liquids\n\n\nOversize and overweight bags Trav...
4,aa,Checked bag policy,Checked bag allowances,\n\n\nChanges to bag allowances and fees have ...


### Recode airline

In [5]:
df['Airline'].unique()

array(['aa', 'ana', 'cathay', 'delta', 'emirates', 'eva', 'france',
       'japan', 'korean', 'qatar', 'singapore', 'turkish', 'ua'],
      dtype=object)

In [6]:
mapping = {'aa': 'American Airlines', 
           'ana': 'All Nippon Airways', 
           'cathay': 'Cathay Pacific', 
           'delta': 'Delta Air Lines', 
           'emirates': 'Emirates', 
           'eva': 'EVA Air', 
           'france': 'Air France', 
           'japan': 'Japan Airlines', 
           'korean': 'Korean Air', 
           'qatar': 'Qatar Airways', 
           'singapore': 'Singapore Airlines', 
           'turkish': 'Turkish Airlines', 
           'ua': 'United Airlines'}

# Recode 'Airline' column using the dictionary mapping
df['Airline'] = df['Airline'].map(mapping)
df.head()

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,American Airlines,Carry-on bags,1 personal item and 1 carry-on,Personal item\n\n\nYour personal item like a p...
1,American Airlines,Carry-on bags,Know what you can carry on,\n\n\nThere are some items that can only trave...
2,American Airlines,Carry-on bags,Special notice,\n\n\nCustomers flying Basic Economy are now a...
3,American Airlines,Carry-on bags,You may also like...,Liquids\n\n\nOversize and overweight bags Trav...
4,American Airlines,Checked bag policy,Checked bag allowances,\n\n\nChanges to bag allowances and fees have ...


### Recode header 1

In [33]:
# docs = [x for x in df['Header 1'].unique().tolist() if not isinstance(x, float) or not np.isnan(x)]
docs = df['Header 1']
cleans = []
for name in docs:
    if pd.isna(name):
        clean = ''
        cleans.append(clean)
    else:
        header = name.lower()
        if 'check' in header:
            clean = 'Checked Bag Policy'
        elif 'cabin' in header or 'carry on' in header:
            clean = 'Carry on Bag Policy'
        elif 'delay bag' in header or 'lost' in header or 'damage' in header:
            clean = 'Mishandled Bag Policy'
        elif 'restrict' in header or 'dangerous' in header or 'limitation' in header or 'not allowed' or 'battries' in header or 'prohibited' in header:
            clean = 'Restricted Items Policy'
        elif 'missed flight' in header or 'delayed flight' in header or 'canceled flight' in header or 'irregular' in header:
            clean = 'Unnormal Flight Policy'
        elif 'change flight' in header or 'change reservation' in header:
            clean = 'Change Reservation Policy'
        elif 'refund' in header or 'ticket' in header or 'compensate' in header:
            clean = 'Refund Policy'
        else:
            clean = header
        cleans.append(clean)
len(cleans)

440

In [34]:
df['Header 1'] = cleans
df.head()

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,American Airlines,Restricted Items Policy,1 personal item and 1 carry-on,Personal item\n\n\nYour personal item like a p...
1,American Airlines,Restricted Items Policy,Know what you can carry on,\n\n\nThere are some items that can only trave...
2,American Airlines,Restricted Items Policy,Special notice,\n\n\nCustomers flying Basic Economy are now a...
3,American Airlines,Restricted Items Policy,You may also like...,Liquids\n\n\nOversize and overweight bags Trav...
4,American Airlines,Checked Bag Policy,Checked bag allowances,\n\n\nChanges to bag allowances and fees have ...


### Write to csv

In [46]:
df.to_csv('/Users/kay/Desktop/nlp/nlp_airline_project/data/clean/encoded_policy.csv', index=False)

In [47]:
# pd.read_csv('/Users/kay/Desktop/nlp/nlp_airline_project/data/clean/encoded_policy.csv')

Unnamed: 0,Airline,Header 1,Header 2,Concat
0,American Airlines,Restricted Items Policy,1 personal item and 1 carry-on,Personal item\n\n\nYour personal item like a p...
1,American Airlines,Restricted Items Policy,Know what you can carry on,\n\n\nThere are some items that can only trave...
2,American Airlines,Restricted Items Policy,Special notice,\n\n\nCustomers flying Basic Economy are now a...
3,American Airlines,Restricted Items Policy,You may also like...,Liquids\n\n\nOversize and overweight bags Trav...
4,American Airlines,Checked Bag Policy,Checked bag allowances,\n\n\nChanges to bag allowances and fees have ...
...,...,...,...,...
435,United Airlines,Restricted Items Policy,,\n\n\nBags \nChecked bag policy \nCarry-on b...
436,United Airlines,Restricted Items Policy,Enable JavaScript,\n\n\nBags \nChecked bag policy \nCarry-on b...
437,United Airlines,Restricted Items Policy,Flying on a partner airline?,\n\n\nFind helpful information if your trip in...
438,United Airlines,Restricted Items Policy,What can you fly with?,"\n\n\nTo prevent inflight danger, many common ..."
