In [1]:
import polars as pl
import pandas as pd

# 1 Importing and Previewing Data

## .1 Import

In [2]:
# step: define path to Excel file
pathToData = "Freely_quote_data.xlsx"
# step: open the workbook once and read both sheets through the same handle,
# so the file is not opened and parsed a second time for the data dictionary
with pd.ExcelFile(pathToData) as _workbook:
    dataRaw = pd.read_excel(_workbook, sheet_name="Quotes")
    dataDefinitions = pd.read_excel(_workbook, sheet_name="Data Dictionary")

## .2 Viewing Definitions and Start of DataFrame

In [3]:
# step: check definitions
# Bare last expression so the notebook renders the frame read from the
# "Data Dictionary" sheet.
dataDefinitions

Unnamed: 0,Column,Notes
0,destinations,"destinations of travel, can be city, country, ..."
1,trip_start_date,trip departure date
2,trip_end_date,trip return date
3,traveller_ages,number of travellers and individual traveller(...
4,quote_create_time,date and time this quote is generated (between...
5,quote_price,total price of the quote inclusive of boosts c...
6,platform,web: quote from web get a quote path; qw: quot...
7,discount,"discount % applied, noting for a quote with 2 ..."
8,boost_x_name,"extra coverage selected, 9 different extra cov..."
9,boost_x_start_date,extra coverage start date


In [4]:
# step: display data preview
# head() with no argument shows the first five rows.
# NOTE(review): the preview shows date-like columns mixing 'd/m/Y' text with
# timestamp-style values — confirm how these should be normalised before use.
dataRaw.head()

Unnamed: 0,destinations,trip_start_date,trip_end_date,traveller_ages,quote_create_time,quote_price,platform,discount,boost_1_name,boost_1_start_date,...,boost_6_start_date,boost_6_end_date,boost_7_name,boost_7_start_date,boost_7_end_date,boost_8_name,boost_8_start_date,boost_8_end_date,extra_cancellation,convert
0,Vietnam; Sri Lanka; Portugal; Netherlands; Swi...,30/1/2025,16/10/2025,41;40;11;8;5,2024-24-12 13:20:09,1417,web,0.15,Specified Items,30/1/2025,...,,,,,,,,,0.0,NO
1,New Zealand,2024-07-10 00:00:00,14/10/2024,27;25,2024-10-06 11:47:00,79,app,0.15,Adventure Activities,2024-11-10 00:00:00,...,,,,,,,,,0.0,YES
2,All of Europe; Turkey,14/5/2025,2025-08-06 00:00:00,73;73,2024-11-11 12:13:00,516,web,0.15,,,...,,,,,,,,,,NO
3,USA,2025-02-01 00:00:00,21/1/2025,45;45;14;13;8,2024-12-12 14:32:00,391,web,0.15,Snow Sports,2025-05-01 00:00:00,...,,,,,,,,,40000.0,NO
4,United Kingdom,30/11/2024,2024-06-12 00:00:00,60,2024-30-11 11:57:29,60,web,0.1,Extra Cancellation,30/11/2024,...,,,,,,,,,5000.0,NO


# 2 Data Manipulation

## .1 Creating New Variables

### .1 Parsing Traveller Ages into a List

In [10]:
def _parseAges(ages_str):
    if pd.isna(ages_str):
        return []
    ages_str = str(ages_str)
    if ages_str == '' or ages_str == 'nan':
        return []
    return [int(age) for age in ages_str.split(';') if age.strip()]

# step: convert every raw traveller_ages entry into a list of integer ages
dataRaw['travellerAges'] = dataRaw['traveller_ages'].map(_parseAges)

In [11]:
# step: preview the parsed age lists (first five rows)
dataRaw['travellerAges'].head()

0     [41, 40, 11, 8, 5]
1               [27, 25]
2               [73, 73]
3    [45, 45, 14, 13, 8]
4                   [60]
Name: travellerAges, dtype: object

### .2 Excess Discount 
This calculates the discount applied in excess of the pre-applied discount, which depends on the number of adult travellers:
- 15% for 2 adult travellers
- 20% for 3+ adult travellers

NOTE: It turns out this step is not required — the calculation produces negative discounts.

In [None]:
# NOTE: disabled on purpose and kept for reference only — the accompanying notes
# say this step is not required and that it produced negative discounts.
# The expression subtracts a traveller-count-based rate (0.20 for 3+ travellers
# aged 18 or over, 0.15 for exactly 2, otherwise 0) from the `discount` column.
# dataRaw['excessDiscount'] = dataRaw['discount'] - dataRaw['travellerAges'].apply(
#     lambda ages: 0.20 
#     if 
#         sum(age >=18 for age in ages) >= 3 
#     else 
#         0.15 if sum(age >= 18 for age in ages) == 2 
#     else 
#         0
#     )

### .3 Quote Creation Hour and Day of Week

In [17]:
# step: parse the quote creation timestamp once and reuse it for both features
# (the original parsed the whole column twice with the same format).
# NOTE(review): the data preview shows quote_create_time holding both
# 'YYYY-DD-MM hh:mm:ss' text and values already rendered as timestamps —
# confirm the latter are not day/month-swapped.
_quoteCreateTime = pd.to_datetime(dataRaw['quote_create_time'], format='%Y-%d-%m %H:%M:%S')
# step: hour
dataRaw['quoteCreateHour'] = _quoteCreateTime.dt.hour
# step: day of the week
dataRaw['quoteCreateDay'] = _quoteCreateTime.dt.day_name()

In [18]:
# step: inspect the derived hour and day-of-week columns
dataRaw[['quoteCreateHour', 'quoteCreateDay']]

Unnamed: 0,quoteCreateHour,quoteCreateDay
0,13,Tuesday
1,11,Sunday
2,12,Monday
3,14,Thursday
4,11,Saturday
...,...,...
69995,22,Thursday
69996,13,Wednesday
69997,9,Monday
69998,13,Friday


### .4 Destination Encoding

In [None]:
# step: get a count of all destinations
# Each ';'-separated entry becomes its own row, is trimmed of surrounding
# whitespace, and is then tallied.
allDestinations = (
    dataRaw['destinations']
    .str.split(';')
    .explode()
    .str.strip()
    .value_counts()
)
allDestinations

destinations
Japan                 7721
Bali                  5078
Thailand              4653
Indonesia             4535
Domestic Cruise       4208
                      ... 
Benin                    1
Burkina Faso             1
Chad                     1
Cardiff                  1
Great Barrier Reef       1
Name: count, Length: 407, dtype: int64

In [27]:
# step: get a list of all unique destinations
# Missing entries are dropped before deduplicating: a NaN among the strings
# would make sorted() raise TypeError when it compares float with str.
uniqueDestinations = (
    dataRaw['destinations']
    .str.split(';')
    .explode()
    .str.strip()
    .dropna()
    .unique()
)
sorted(uniqueDestinations)

['Abu Dhabi',
 'Adelaide',
 'Afghanistan',
 'Airlie Beach',
 'Alabama',
 'Alaska',
 'Albania',
 'Alberta',
 'Algeria',
 'Alice Springs',
 'All of Africa',
 'All of Asia (exclude Nepal)',
 'All of Europe',
 'All of Europe (Scandinavia)',
 'All of North America',
 'All of South America',
 'All of South America (Patagonia)',
 'All of UK',
 'All of UK (GBR)',
 'All of UK (Great Britain)',
 'All of UK (Isle of Man)',
 'All of UK (United Kingdom)',
 'All of the Americas',
 'All of the Americas (Central America)',
 'All of the Middle East',
 'All of the Pacific',
 'All of the Pacific (Pacific Islands)',
 'America',
 'American Samoa',
 'Amsterdam',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica (Cruising)',
 'Antarctica-Sightseeing Flight',
 'Antigua and Barbuda',
 'Argentina',
 'Arizona',
 'Armenia',
 'Aruba',
 'Athens',
 'Auckland',
 'Australia',
 'Australia (Domestic Cruise)',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bali',
 'Bangkok',
 'Bangladesh',
 'Barbados',
 'Barcelona',