In [6]:
import polars as pl
import pandas as pd

## .1 Import

In [7]:
# step: define path to Excel file
pathToData = "Freely_quote_data.xlsx"
dataRaw = pd.read_excel(pathToData, sheet_name="Quotes")
dataDefinitions = pd.read_excel(pathToData, sheet_name="Data Dictionary")

## .2 Viewing Definitions and Start of DataFrame

In [8]:
# step: check definitions
dataDefinitions

Unnamed: 0,Column,Notes
0,destinations,"destinations of travel, can be city, country, ..."
1,trip_start_date,trip departure date
2,trip_end_date,trip return date
3,traveller_ages,number of travellers and individual traveller(...
4,quote_create_time,date and time this quote is generated (between...
5,quote_price,total price of the quote inclusive of boosts c...
6,platform,web: quote from web get a quote path; qw: quot...
7,discount,"discount % applied, noting for a quote with 2 ..."
8,boost_x_name,"extra coverage selected, 9 different extra cov..."
9,boost_x_start_date,extra coverage start date


In [9]:
# step: display data preview
dataRaw.head()

Unnamed: 0,destinations,trip_start_date,trip_end_date,traveller_ages,quote_create_time,quote_price,platform,discount,boost_1_name,boost_1_start_date,...,boost_6_start_date,boost_6_end_date,boost_7_name,boost_7_start_date,boost_7_end_date,boost_8_name,boost_8_start_date,boost_8_end_date,extra_cancellation,convert
0,Vietnam; Sri Lanka; Portugal; Netherlands; Swi...,30/1/2025,16/10/2025,41;40;11;8;5,2024-24-12 13:20:09,1417,web,0.15,Specified Items,30/1/2025,...,,,,,,,,,0.0,NO
1,New Zealand,2024-07-10 00:00:00,14/10/2024,27;25,2024-10-06 11:47:00,79,app,0.15,Adventure Activities,2024-11-10 00:00:00,...,,,,,,,,,0.0,YES
2,All of Europe; Turkey,14/5/2025,2025-08-06 00:00:00,73;73,2024-11-11 12:13:00,516,web,0.15,,,...,,,,,,,,,,NO
3,USA,2025-02-01 00:00:00,21/1/2025,45;45;14;13;8,2024-12-12 14:32:00,391,web,0.15,Snow Sports,2025-05-01 00:00:00,...,,,,,,,,,40000.0,NO
4,United Kingdom,30/11/2024,2024-06-12 00:00:00,60,2024-30-11 11:57:29,60,web,0.1,Extra Cancellation,30/11/2024,...,,,,,,,,,5000.0,NO


# 2 Data Manipulation

## .1 Creating New Variables

### .1 Parsing Traveller Ages into a List

In [10]:
def _parseAges(ages_str):
    if pd.isna(ages_str):
        return []
    ages_str = str(ages_str)
    if ages_str == '' or ages_str == 'nan':
        return []
    return [int(age) for age in ages_str.split(';') if age.strip()]

dataRaw['travellerAges'] = dataRaw['traveller_ages'].apply(_parseAges)

In [11]:
dataRaw['travellerAges'].head()

0     [41, 40, 11, 8, 5]
1               [27, 25]
2               [73, 73]
3    [45, 45, 14, 13, 8]
4                   [60]
Name: travellerAges, dtype: object

### .2 Excess Discount 
this calculates the discount over the pre-applied discount that is dependent on the number of travellers
- 15% for 2 adult travellers
- 20% for 3+ adult travellers

NOTE: TURNS OUT THIS IS NOT REQUIRED. WE GET NEGATIVE DISCOUNTS

In [12]:
# dataRaw['excessDiscount'] = dataRaw['discount'] - dataRaw['travellerAges'].apply(
#     lambda ages: 0.20 
#     if 
#         sum(age >=18 for age in ages) >= 3 
#     else 
#         0.15 if sum(age >= 18 for age in ages) == 2 
#     else 
#         0
#     )

### .3 Quote Creation Hour and Day of Week

In [13]:
# step: hour
dataRaw['quoteCreateHour'] = pd.to_datetime(dataRaw['quote_create_time'], format='%Y-%d-%m %H:%M:%S').dt.hour
# step: day of the week
dataRaw['quoteCreateDay'] = pd.to_datetime(dataRaw['quote_create_time'], format='%Y-%d-%m %H:%M:%S').dt.day_name()

In [14]:
dataRaw[['quoteCreateHour', 'quoteCreateDay']]

Unnamed: 0,quoteCreateHour,quoteCreateDay
0,13,Tuesday
1,11,Sunday
2,12,Monday
3,14,Thursday
4,11,Saturday
...,...,...
69995,22,Thursday
69996,13,Wednesday
69997,9,Monday
69998,13,Friday


### .4 Destination Encoding

In [15]:
# step: get a count of all destinations
allDestinations = dataRaw['destinations'].str.split(';').explode().str.strip().value_counts()
allDestinations

destinations
Japan                 7721
Bali                  5078
Thailand              4653
Indonesia             4535
Domestic Cruise       4208
                      ... 
Benin                    1
Burkina Faso             1
Chad                     1
Cardiff                  1
Great Barrier Reef       1
Name: count, Length: 407, dtype: int64

In [16]:
# step: get a list of all unique destinations
uniqueDestinations = dataRaw['destinations'].str.split(';').explode().str.strip().unique()
sorted(uniqueDestinations)

['Abu Dhabi',
 'Adelaide',
 'Afghanistan',
 'Airlie Beach',
 'Alabama',
 'Alaska',
 'Albania',
 'Alberta',
 'Algeria',
 'Alice Springs',
 'All of Africa',
 'All of Asia (exclude Nepal)',
 'All of Europe',
 'All of Europe (Scandinavia)',
 'All of North America',
 'All of South America',
 'All of South America (Patagonia)',
 'All of UK',
 'All of UK (GBR)',
 'All of UK (Great Britain)',
 'All of UK (Isle of Man)',
 'All of UK (United Kingdom)',
 'All of the Americas',
 'All of the Americas (Central America)',
 'All of the Middle East',
 'All of the Pacific',
 'All of the Pacific (Pacific Islands)',
 'America',
 'American Samoa',
 'Amsterdam',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica (Cruising)',
 'Antarctica-Sightseeing Flight',
 'Antigua and Barbuda',
 'Argentina',
 'Arizona',
 'Armenia',
 'Aruba',
 'Athens',
 'Auckland',
 'Australia',
 'Australia (Domestic Cruise)',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bali',
 'Bangkok',
 'Bangladesh',
 'Barbados',
 'Barcelona',

In [17]:
print(len(uniqueDestinations))

407


# Create continent variable

In [18]:
### .5 Destination → Continent mapping
from __future__ import annotations
from typing import Dict, Set, Iterable
import re
import pandas as pd


# --- 1) Normalization & aliases ------------------------------------------------
def _norm(s: str) -> str:
    """Lowercase, collapse spaces, strip punctuation in parentheses, normalize hyphens."""
    s = str(s).strip().lower()
    s = re.sub(r"\s+", " ", s)
    # unify common punctuation variants
    s = s.replace("’", "'").replace("–", "-").replace("—", "-")
    # drop parenthetical hints like "(HK)", "(Great Britain)", "(PNG)", etc.
    s = re.sub(r"\s*\([^)]*\)", "", s).strip()
    return s


ALIASES: Dict[str, str] = {
    # language / synonyms
    "espana": "spain",
    "estados unidos": "united states of america",
    "holland": "netherlands",
    "u.s.a.": "united states of america",
    "usa": "united states of america",
    "america": "united states of america",
    "great britain": "united kingdom",
    "gbr": "united kingdom",
    # UK collections (map to umbrella to then map to Europe)
    "all of uk": "united kingdom",
    "all of uk isle of man": "united kingdom",
    # UAE / HK variants
    "united arab emirates uae": "united arab emirates",
    "united arab emirates u.a.e.": "united arab emirates",
    "hong kong hk": "hong kong",
    "hong kong hongkong": "hong kong",
    # country name variants
    "netherlands the netherlands": "netherlands",
    "italy italia": "italy",
    "sri lanka srilanka": "sri lanka",
    "papua new guinea png": "papua new guinea",
    "new zealand nz": "new zealand",
    "korea south": "south korea",
    "korea south republic of korea": "south korea",
    "korea south south korea": "south korea",
    "macau": "macao",
    "malvinas": "falkland islands",
    "herzegovina": "bosnia",  # treat as Bosnia & Herzegovina (Europe)
    "kl": "kuala lumpur",
    "port villa": "port vila",
}


def _alias_or_same(s: str) -> str:
    key = _norm(s)
    return ALIASES.get(key, key)


# --- 2) Canonical continent buckets -------------------------------------------
# We’ll use standard 7-continent labels + two helpers for aggregates.
AFRICA = {
    # countries
    "algeria", "angola", "benin", "botswana", "burkina faso", "cameroon", "cape verde",
    "central african republic", "chad", "egypt", "eritrea", "ethiopia", "gambia", "ghana",
    "guinea", "ivory coast", "cote d'ivoire", "kenya", "lesotho", "liberia", "madagascar",
    "malawi", "mali", "mauritius", "morocco", "mozambique", "namibia", "nigeria", "rwanda",
    "senegal", "seychelles", "sierra leone", "south africa", "south sudan", "swaziland",
    "eswatini", "tanzania", "togo", "tunisia", "uganda", "zambia", "zimbabwe", "reunion",
    # cities/places
    "cairo", "cape town", "johannesburg", "marrakech", "nairobi",
}

ASIA = {
    # countries
    "afghanistan", "armenia", "azerbaijan", "bahrain", "bangladesh", "bhutan", "brunei",
    "cambodia", "china", "cyprus", "east timor", "timor-leste", "georgia", "hong kong",
    "india", "indonesia", "iran", "iraq", "israel", "japan", "jordan", "kazakhstan", "kuwait",
    "kyrgyzstan", "laos", "lebanon", "macao", "malaysia", "maldives", "mongolia", "myanmar",
    "burma", "nepal", "oman", "pakistan", "philippines", "qatar", "saudi arabia", "singapore",
    "south korea", "sri lanka", "syria", "taiwan", "tajikistan", "thailand", "turkey",
    "turkmenistan", "united arab emirates", "uzbekistan", "vietnam", "yemen",
    # cities/places
    "abu dhabi", "bangkok", "beijing", "chiang mai", "delhi", "dhaka", "doha", "dubai",
    "hanoi", "ho chi minh city", "istanbul", "jakarta", "kuala lumpur", "kathmandu",
    "osaka", "sapporo", "seoul", "shanghai", "shenzhen", "siem reap", "taipei", "tokyo",
    "penang", "phuket", "koh samui", "denpasar", "kuta", "ubud", "manila", "cebu", "mumbai",
    "bali", "jordan",  # country present in some datasets as city-like token
}

EUROPE = {
    # countries/territories
    "albania", "andorra", "austria", "belgium", "bosnia", "bulgaria", "croatia",
    "czech republic", "denmark", "estonia", "finland", "france", "germany", "gibraltar",
    "greece", "guernsey", "hungary", "iceland", "republic of ireland", "ireland", "italy",
    "jersey", "kosovo", "latvia", "liechtenstein", "lithuania", "luxembourg", "macedonia",
    "malta", "moldova", "monaco", "montenegro", "netherlands", "norway", "poland",
    "portugal", "romania", "san marino", "serbia", "slovakia", "slovenia", "spain",
    "sweden", "switzerland", "united kingdom", "england", "scotland", "wales",
    "northern ireland", "vatican city", "faroe islands", "canary islands", "madeira",
    "lapland", "iceland",
    # cities
    "amsterdam", "athens", "barcelona", "belgrade", "berlin", "brussels", "budapest",
    "cologne", "copenhagen", "dublin", "dubrovnik", "edinburgh", "florence", "frankfurt",
    "geneva", "glasgow", "glasglow", "helsinki", "ibiza", "lisbon", "liverpool", "london",
    "lyon", "madrid", "milan", "munich", "nice", "paris", "prague", "rome", "salzburg",
    "stockholm", "strasbourg", "venice", "vienna", "zurich", "nuremburg", "nuremberg",
    "brussels", "bratislava",
}

NORTH_AMERICA = {
    # countries/regions
    "canada", "united states of america", "united states", "mexico", "greenland",
    # us states
    "alabama", "alaska", "arizona", "california", "colorado", "connecticut", "florida",
    "indiana", "massachusetts", "michigan", "minnesota", "nebraska", "new jersey",
    "new mexico", "new york", "north carolina", "oregon", "tennessee", "texas", "utah",
    "virginia", "hawaii",
    # canada provinces
    "alberta", "british columbia", "nova scotia", "quebec",
    # cities
    "chicago", "honolulu", "las vegas", "los angeles", "miami", "san francisco",
    "new york", "ottawa", "toronto", "vancouver", "calgary", "quebec", "mexico city",
    "cancun",
    # central america & caribbean (geographically North America)
    "belize", "costa rica", "el salvador", "guatemala", "honduras", "nicaragua", "panama",
    "panama city", "bahamas", "barbados", "bermuda", "cayman islands", "cuba", "dominica",
    "dominican rep.", "guadeloupe", "haiti", "jamaica", "martinique", "puerto rico",
    "st. lucia", "st. kitts-nevis", "trinidad and tobago", "antigua and barbuda",
    "aruba", "anguilla", "bermuda", "virgin islands",
}

SOUTH_AMERICA = {
    # countries
    "argentina", "bolivia", "brazil", "chile", "colombia", "ecuador", "guyana", "paraguay",
    "peru", "uruguay", "venezuela", "falkland islands",
    # cities
    "buenos aires", "rio de janeiro", "santiago", "lima",
}

OCEANIA = {
    # countries/territories
    "australia", "new zealand", "fiji", "vanuatu", "new caledonia", "papua new guinea",
    "nauru", "palau", "samoa", "western samoa", "solomon islands", "tonga",
    "french polynesia", "tahiti", "cook islands", "rarotonga", "american samoa",
    "norfolk island", "lord howe island",
    # au states/regions & notable places
    "queensland", "western australia", "south australia", "northern territory", "tasmania",
    "great barrier reef", "whitsundays", "hamilton island", "fraser island",
    "isle of pines", "lifou", "loyalty islands", "port vila", "port vila", "suva", "nadi",
    "noumea", "mystery island",
    # cities (AU/NZ)
    "sydney", "melbourne", "brisbane", "perth", "adelaide", "hobart", "canberra",
    "gold coast", "sunshine coast", "noosa", "port douglas", "cairns", "broome",
    "alice springs", "uluru", "airlie beach", "auckland", "christchurch", "queenstown",
    # misc
    "domestic cruise", "australia domestic cruise", "south west pacific cruise",
    "new zealand",
}

ANTARCTICA = {"antarctica", "antarctica sightseeing flight", "antarctica cruising"}

# --- 3) Aggregated selections (map to continent sets) -------------------------
AGGREGATES: Dict[str, Set[str]] = {
    "all of africa": {"Africa"},
    "all of asia": {"Asia"},
    "all of europe": {"Europe"},
    "all of europe scandinavia": {"Europe"},
    "all of uk": {"Europe"},
    "all of north america": {"North America"},
    "all of south america": {"South America"},
    "all of south america patagonia": {"South America"},
    "all of the americas": {"North America", "South America"},
    "all of the americas central america": {"North America"},
    "all of the pacific": {"Oceania"},
    "all of the pacific pacific islands": {"Oceania"},
    "all of the middle east": {"Asia"},
    "worldwide": {"Worldwide"},
    # Common cruise buckets
    "south west pacific cruise": {"Oceania"},
    "domestic cruise": {"Oceania"},
    "australia domestic cruise": {"Oceania"},
}


# --- 4) Core resolver ---------------------------------------------------------
def _lookup_continents(token: str) -> Set[str]:
    """Return set of continents for a single normalized/aliased token."""
    t = _alias_or_same(token)

    # aggregates first
    if t in AGGREGATES:
        return AGGREGATES[t]

    # exact membership checks
    if t in AFRICA:         return {"Africa"}
    if t in ASIA:           return {"Asia"}
    if t in EUROPE:         return {"Europe"}
    if t in NORTH_AMERICA:  return {"North America"}
    if t in SOUTH_AMERICA:  return {"South America"}
    if t in OCEANIA:        return {"Oceania"}
    if t in ANTARCTICA:     return {"Antarctica"}

    # small heuristics (very conservative)
    if " middle east" in t:
        return {"Asia"}
    if t.endswith(" islands"):
        # many “Islands” in this dataset are Pacific or Caribbean; try best-effort
        if t in {"canary islands", "madeira", "faroe islands"}:
            return {"Europe"}
        # fall through unknown; too ambiguous otherwise
    return set()


def map_row_to_continents(destinations: str) -> str:
    """
    For a semicolon-separated destinations string, return a ';'-joined sorted set of continents.
    If nothing can be resolved, returns 'Unknown'.
    """
    if pd.isna(destinations) or not str(destinations).strip():
        return ""
    tokens = [d.strip() for d in str(destinations).split(";") if d.strip()]
    out: Set[str] = set()
    for tok in tokens:
        out |= _lookup_continents(tok)
    if not out:
        return "Unknown"
    # If both NA and SA present due to 'All of the Americas', keep both
    return ";".join(sorted(out))


# --- 5) Create the column ------------------------------------------------------
dataRaw["destination_continent"] = dataRaw["destinations"].apply(map_row_to_continents)

# --- 6) Coverage report for your uniqueDestinations ---------------------------
_normed_unique = sorted({_alias_or_same(x) for x in uniqueDestinations})
_unmapped = [u for u in _normed_unique if not _lookup_continents(u)]
print(f"Unmapped tokens ({len(_unmapped)}). Add them if you see any you care about:\n", _unmapped[:50])
dataRaw[["destinations", "destination_continent"]].head(10)


Unmapped tokens (14). Add them if you see any you care about:
 ['antarctica-sightseeing flight', 'birmingham', 'brasilia', 'cardiff', 'darwin', 'kangaroo island', 'korea', 'lombok', 'manchester', 'netherlands antilles', 'pattaya', 'salvador', 'tenerife', 'tibet']


Unnamed: 0,destinations,destination_continent
0,Vietnam; Sri Lanka; Portugal; Netherlands; Swi...,Asia;Europe
1,New Zealand,Oceania
2,All of Europe; Turkey,Asia;Europe
3,USA,North America
4,United Kingdom,Europe
5,Croatia,Europe
6,Indonesia,Asia
7,New Zealand,Oceania
8,Greece; Turkey; Malta,Asia;Europe
9,Cook Islands,Oceania
