## Cleaning and Preprocessing for Klook Data

**Import Libraries**

In [1]:
import spacy
import pandas as pd
import re
from decimal import Decimal
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
import inspect
from spacy import displacy

**Import Dataset**

In [2]:
df = pd.read_csv('Klook_PH.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,region,location,activity,title,description,price,rating,review_count,duration
0,Visayas,Palawan,Day trips,Coron Super Ultimate Day Tour,Visit the highlights around Coron's breathtaki...,"? 1,590",4.7,684 reviews,9hr Duration
1,Visayas,Cebu,Canyoning,Oslob Whale Shark Snorkeling and Badian Canyon...,Learn more about this activity's Enhanced Heal...,"? 3,635",4.7,798 reviews,16hr Duration
2,Visayas,Boracay,Boat tours,Boracay Tour Package (Island Hopping),Book this island tour package and discover Bor...,? 900,4.5,"3,947 reviews",
3,Visayas,Boracay,Sightseeing cruises,"Boracay Sunset Cruise with Kayak, Paddle Board...",Catch the spectacular Boracay sunset while cru...,? 675,4.7,"1,517 reviews",2hr 30min Duration
4,Visayas,Palawan,Boat tours,Puerto Princesa Underground River Tour in Palawan,Explore the Puerto Princesa Underground River ...,"? 1,850",4.6,"2,478 reviews",9hr 30min Duration


**Clean Features**

*Remove ? from price*

In [3]:
df['price'] = df['price'].str.replace('[?, ]', '', regex=True)
df['price'].head()

0    1590
1    3635
2     900
3     675
4    1850
Name: price, dtype: object

*Fill ratings of non-reviewed posts with 3.0 (average rating) and convert to float*

In [4]:
df['rating'] = df['rating'].apply(lambda x: round(x, 1))
df['rating'] = df['rating'].fillna(3.0)
df['rating']

0      4.7
1      4.7
2      4.5
3      4.7
4      4.6
      ... 
167    3.0
168    3.0
169    3.0
170    3.0
171    3.0
Name: rating, Length: 172, dtype: float64

*Convert review_counts to integer and fill Nan with 0. Used 1000 for 1k+ value, and so on*

In [5]:
df['review_count'] = df['review_count'].str.replace('+', '')
df['review_count'] = df['review_count'].str.replace(' reviews', '').replace('K', '000', regex=True)
df['review_count'] = df['review_count'].fillna(0)
df['review_count']

0        684
1        798
2      3,947
3      1,517
4      2,478
       ...  
167        0
168        0
169        0
170        0
171        0
Name: review_count, Length: 172, dtype: object

In [6]:
df

Unnamed: 0,region,location,activity,title,description,price,rating,review_count,duration
0,Visayas,Palawan,Day trips,Coron Super Ultimate Day Tour,Visit the highlights around Coron's breathtaki...,1590,4.7,684,9hr Duration
1,Visayas,Cebu,Canyoning,Oslob Whale Shark Snorkeling and Badian Canyon...,Learn more about this activity's Enhanced Heal...,3635,4.7,798,16hr Duration
2,Visayas,Boracay,Boat tours,Boracay Tour Package (Island Hopping),Book this island tour package and discover Bor...,900,4.5,3947,
3,Visayas,Boracay,Sightseeing cruises,"Boracay Sunset Cruise with Kayak, Paddle Board...",Catch the spectacular Boracay sunset while cru...,675,4.7,1517,2hr 30min Duration
4,Visayas,Palawan,Boat tours,Puerto Princesa Underground River Tour in Palawan,Explore the Puerto Princesa Underground River ...,1850,4.6,2478,9hr 30min Duration
...,...,...,...,...,...,...,...,...,...
167,Luzon,Bicol,Boat tours,Whaleshark Interaction with Firefly Expedition...,Donsol is best known for its whale watching wh...,2640,3.0,0,6hr - 10hr Duration
168,Luzon,Tanay,Wildlife,Moolk Farm Tour in Rizal,Experience the New Zealand of Tanay Rizal and ...,499,3.0,0,
169,Luzon,Tanay,Hiking,Mount Batolusong Join In Day Hike from Manila,Start your morning by hiking to Mt. Batolusong...,1600,3.0,0,1day(s) Duration
170,Visayas,Carles,Multi-day tours,Gigantes Islands All In Package from Iloilo,Enjoy this Ultimate Gigantes Islands All In Pa...,2799,3.0,0,2day(s) - 3day(s) Duration


*Standardize duration value into hours*

In [7]:
#remove (s) from day(S)
df['duration'] = df['duration'].str.replace(r'(s)', 's')

# Function to extract the number of hours
def extract_hours(duration):
    if isinstance(duration, str):
        hours = 0
        match = re.search(r'(\d+)(hr|hour|hours)', duration)
        if match:
            hours += int(match.group(1))

        match = re.search(r'(\d+)(min|minute|minutes)', duration)
        if match:
            hours += int(match.group(1)) / 60

        match = re.search(r'(\d+)(day|days)', duration)
        if match:
            hours += int(match.group(1)) * 24

        return hours
    else:
        return None

# Apply the function to the 'Duration' column
df['duration'] = df['duration'].apply(extract_hours)

df['duration']

0       9.0
1      16.0
2       NaN
3       2.5
4       9.5
       ... 
167     6.0
168     NaN
169    24.0
170    48.0
171     NaN
Name: duration, Length: 172, dtype: float64

In [8]:
df

Unnamed: 0,region,location,activity,title,description,price,rating,review_count,duration
0,Visayas,Palawan,Day trips,Coron Super Ultimate Day Tour,Visit the highlights around Coron's breathtaki...,1590,4.7,684,9.0
1,Visayas,Cebu,Canyoning,Oslob Whale Shark Snorkeling and Badian Canyon...,Learn more about this activity's Enhanced Heal...,3635,4.7,798,16.0
2,Visayas,Boracay,Boat tours,Boracay Tour Package (Island Hopping),Book this island tour package and discover Bor...,900,4.5,3947,
3,Visayas,Boracay,Sightseeing cruises,"Boracay Sunset Cruise with Kayak, Paddle Board...",Catch the spectacular Boracay sunset while cru...,675,4.7,1517,2.5
4,Visayas,Palawan,Boat tours,Puerto Princesa Underground River Tour in Palawan,Explore the Puerto Princesa Underground River ...,1850,4.6,2478,9.5
...,...,...,...,...,...,...,...,...,...
167,Luzon,Bicol,Boat tours,Whaleshark Interaction with Firefly Expedition...,Donsol is best known for its whale watching wh...,2640,3.0,0,6.0
168,Luzon,Tanay,Wildlife,Moolk Farm Tour in Rizal,Experience the New Zealand of Tanay Rizal and ...,499,3.0,0,
169,Luzon,Tanay,Hiking,Mount Batolusong Join In Day Hike from Manila,Start your morning by hiking to Mt. Batolusong...,1600,3.0,0,24.0
170,Visayas,Carles,Multi-day tours,Gigantes Islands All In Package from Iloilo,Enjoy this Ultimate Gigantes Islands All In Pa...,2799,3.0,0,48.0


In [9]:
min_df = df[df['region'] == 'Mindanao']
min_df

Unnamed: 0,region,location,activity,title,description,price,rating,review_count,duration
25,Mindanao,Siargao,Sightseeing cruises,Tri Island Tour in Siargao,Go on a tri-island day trip in Siargao and exp...,1450,4.3,396,24.0
30,Mindanao,Siargao,Day trips,Siargao Land Tour,Dedicate your vacation pleasures with thrillin...,1900,4.6,229,24.0
45,Mindanao,Siargao,Day trips,Sohoton-Bucas Grande Tour in Siargao,Have the ultimate beach day in the Philippines...,3000,3.0,84,24.0
48,Mindanao,Davao,Day trips,Nature Tour in Davao,Go on an adventure in Davao with this Day Dava...,2550,5.0,183,24.0
66,Mindanao,Davao,Day trips,Samal Island Tour in Davao,Explore unique beaches and attractions of Sama...,2450,5.0,137,24.0
83,Mindanao,Davao,Day trips,Davao City Tour,Explore the wonderful sights of Davao with thi...,1300,5.0,2,24.0
120,Mindanao,Siargao,Surfing,Siargao Surfing Lessons,SIARGAO is the top surfing spot in the Philipp...,700,4.5,4,1.0
121,Mindanao,Siargao,Surfing,Siargao Surfing Lessons,SIARGAO is the top surfing spot in the Philipp...,700,4.5,4,1.0
144,Mindanao,Davao,Day trips,Highlands Tour in Davao,Enjoy breathtaking views in Davao with this 1 ...,2050,3.0,0,24.0


*Remove see more in the descriptions*

In [21]:
df['description'] = df['description'].str.replace('See more', '')
df['description'].head()

0    Visit highlights around Corons breathtaking la...
1    Learn activitys Enhanced Health Hygiene Measur...
2    Book island tour package discover Boracays whi...
3    Catch spectacular Boracay sunset cruising alon...
4    Explore Puerto Princesa Underground River Nati...
Name: description, dtype: object

*Remove punctutations and stop words*

In [17]:
import string

# Download the NLTK stopwords corpus (only required once)
nltk.download('stopwords')

# Get the set of English stopwords
stopwords_set = set(stopwords.words('english'))

# Function to remove stopwords and punctuation from a text
def remove_stopwords_punctuation(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords_set]
    
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

# Apply the function to the 'text_column' column
df['description'] = df['description'].apply(remove_stopwords_punctuation)

df['description'].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    Visit highlights around Corons breathtaking la...
1    Learn activitys Enhanced Health Hygiene Measur...
2    Book island tour package discover Boracays whi...
3    Catch spectacular Boracay sunset cruising alon...
4    Explore Puerto Princesa Underground River Nati...
Name: description, dtype: object

*Separate sentence with spaces*

In [24]:
# Function to insert spaces before capital letters inside a word
def insert_spaces(text):
    # Use regular expression to find capital letters inside a word
    modified_text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)
    return modified_text

# Apply the function to the 'text_column' column
df['description'] = df['description'].apply(insert_spaces)

df['description'].head()

0    Visit highlights around Corons breathtaking la...
1    Learn activitys Enhanced Health Hygiene Measur...
2    Book island tour package discover Boracays whi...
3    Catch spectacular Boracay sunset cruising alon...
4    Explore Puerto Princesa Underground River Nati...
Name: description, dtype: object

In [25]:
df

Unnamed: 0,region,location,activity,title,description,price,rating,review_count,duration
0,Visayas,Palawan,Day trips,Coron Super Ultimate Day Tour,Visit highlights around Corons breathtaking la...,1590,4.7,684,9.0
1,Visayas,Cebu,Canyoning,Oslob Whale Shark Snorkeling and Badian Canyon...,Learn activitys Enhanced Health Hygiene Measur...,3635,4.7,798,16.0
2,Visayas,Boracay,Boat tours,Boracay Tour Package (Island Hopping),Book island tour package discover Boracays whi...,900,4.5,3947,
3,Visayas,Boracay,Sightseeing cruises,"Boracay Sunset Cruise with Kayak, Paddle Board...",Catch spectacular Boracay sunset cruising alon...,675,4.7,1517,2.5
4,Visayas,Palawan,Boat tours,Puerto Princesa Underground River Tour in Palawan,Explore Puerto Princesa Underground River Nati...,1850,4.6,2478,9.5
...,...,...,...,...,...,...,...,...,...
167,Luzon,Bicol,Boat tours,Whaleshark Interaction with Firefly Expedition...,Donsol best known whale watching makes activit...,2640,3.0,0,6.0
168,Luzon,Tanay,Wildlife,Moolk Farm Tour in Rizal,Experience New Zealand Tanay Rizal see majesti...,499,3.0,0,
169,Luzon,Tanay,Hiking,Mount Batolusong Join In Day Hike from Manila,Start morning hiking Mt Batolusong one popular...,1600,3.0,0,24.0
170,Visayas,Carles,Multi-day tours,Gigantes Islands All In Package from Iloilo,Enjoy Ultimate Gigantes Islands Package island...,2799,3.0,0,48.0


In [23]:
df.to_csv('pre-processed_klook.csv', index=False)