In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import numpy as np

In [2]:
# Load the raw dataset into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_json("data/data.json")

In [3]:
# Preview the frame as loaded, before any cleaning or encoding.
df

Unnamed: 0,place,country,tags,avg_cost_per_day,trip_duration,best_for,season,climate,popularity
0,Cape Town,South Africa,"[culture, nature, beach, mountains, food, nigh...",4000,5,"[friends, couples, family, solo]","[sep, oct, nov, mar, apr]",Mediterranean,Very High
1,Kruger National Park,South Africa,"[safari, nature, wildlife, adventure, camping,...",4500,4,"[family, friends, couples]","[may, jun, jul, aug, sep]",Savanna,Very High
2,Garden Route,South Africa,"[nature, beach, adventure, forest, lakes, whal...",3500,5,"[family, couples, friends]","[jun, jul, aug, sep, oct]",Oceanic,High
3,Durban,South Africa,"[beach, culture, food, nightlife, water-sports...",3000,3,"[friends, family, couples]","[jun, jul, dec, jan]",Subtropical,High
4,Cairo,Egypt,"[history, heritage, culture, spiritual, nightl...",2500,4,"[friends, solo, family]","[oct, nov, feb, mar, apr]",Desert,Very High
...,...,...,...,...,...,...,...,...,...
1099,Pyin Oo Lwin,Myanmar,"[colonial, gardens, cool, waterfalls, history,...",1800,2,"[relax, history, family]","[nov, dec, jan, feb]",Subtropical,Medium
1100,Koh Rong,Cambodia,"[beach, party, nature, islands, diving, advent...",2000,3,"[backpackers, party, friends]","[nov, dec, jan, feb]",Tropical,High
1101,Phonsavan,Laos,"[history, jars, mystery, offbeat, war-history,...",1800,2,"[history, offbeat, culture]","[nov, dec, jan, feb]",Subtropical,Low
1102,Thakhek,Laos,"[adventure, motorbiking, caves, loop, scenic, ...",1800,3,"[adventure, bikers, friends]","[nov, dec, jan, feb]",Tropical,Medium


In [31]:
# Coarse climate group -> the raw `climate` labels folded into that group.
_climate_groups = {
    "Tropical": ["Tropical", "Savanna", "Tropical highland"],
    "Dry": ["Desert", "Desert coastal", "Arid", "Semi-arid", "Highland desert"],
    "Temperate": ["Temperate", "Moderate", "Oceanic"],
    "Mediterranean": ["Mediterranean"],
    "Subtropical": ["Subtropical", "Subtropical highland"],
    "Continental": ["Continental"],
    "Cold": ["Cold", "Subarctic", "Subpolar"],
    "Alpine": ["Alpine"],
    "Highland": ["Highland", "Montane"],
}

# Flattened lookup: raw climate label -> coarse climate group.
climate_mapping = {
    raw_label: group
    for group, raw_labels in _climate_groups.items()
    for raw_label in raw_labels
}

# Collapse raw climate labels into the coarse groups of climate_mapping.
# Series.map leaves NaN for any label missing from the mapping, so report
# those labels instead of letting them disappear silently.
climate_grouped = df['climate'].map(climate_mapping)
unmapped_climates = df.loc[climate_grouped.isna() & df['climate'].notna(), 'climate'].unique()
print(f"Unmapped climates: {list(unmapped_climates) if len(unmapped_climates) else 'None ✓'}")
df['climate'] = climate_grouped

In [32]:
# Merge the "Emerging" tier into "Offbeat" and the "Low" tier into "Medium";
# every other popularity value is left as it is.
popularity_merges = {"Emerging": "Offbeat", "Low": "Medium"}
df['popularity'] = df['popularity'].replace(popularity_merges)

In [33]:
# One-hot encode popularity: one indicator column per distinct value.
encode = OneHotEncoder(sparse_output=False)
pop_encoded = encode.fit_transform(df[['popularity']])
pop_df = pd.DataFrame(
    pop_encoded,
    columns=encode.get_feature_names_out(['popularity']),
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, pop_df], axis=1)

In [34]:
# Lookup used by clean_tags for tags that are not in the allowed vocabulary:
# each key is a raw tag and each value lists the tag(s) substituted for it.
# An empty list means the raw tag is dropped without a replacement.
tag_mapping = {
    # Adventure-related
    "4x4": ["adventure"],
    "bungee": ["adventure"],
    "cable-car": ["adventure"],
    "climbing": ["adventure", "trekking"],
    "helicopter": ["adventure"],
    "hot-air-balloon": ["adventure"],
    "balloons": ["adventure"],
    "quad-biking": ["adventure"],
    "road-trip": ["adventure"],
    "rock-climbing": ["adventure"],
    "sandboarding": ["adventure", "desert"],
    "self-drive": ["adventure"],
    "skydiving": ["adventure"],
    "sport": ["adventure"],
    "theme-parks": ["adventure"],
    "ziplining": ["adventure"],

    # Beach / ocean
    "coral-reefs": ["beach", "water-sports"],
    "harbour": ["beach", "boating"],
    "lagoon": ["beach"],
    "lagoons": ["beach"],
    "ocean": ["beach"],
    "overwater": ["beach", "luxury"],
    "pink-beach": ["beach"],
    "white-sand": ["beach"],

    # Water-sports
    "diving": ["water-sports"],
    "fishing": ["water-sports"],
    "kayaking": ["water-sports"],
    "kitesurfing": ["water-sports"],
    "snorkeling": ["water-sports"],
    "surfing": ["water-sports"],
    "tubing": ["water-sports"],

    # Boating
    "floating": ["boating"],
    "floating-villages": ["boating", "culture"],
    "mokoro-canoe": ["boating"],

    # Safari / wildlife (animal tags map to "safari", "nature", or both)
    "balloon-safaris": ["safari"],
    "black-rhino": ["safari"],
    "city-safari": ["safari"],
    "dolphins": ["nature", "water-sports"],
    "dragons": ["nature"],
    "elephants": ["safari"],
    "gorillas": ["safari"],
    "koalas": ["nature"],
    "marine-life": ["nature"],
    "migration": ["safari"],
    "rhinos": ["safari"],
    "savanna": ["safari"],
    "seals": ["nature"],
    "waterhole": ["safari", "nature"],
    "whale-sharks": ["nature", "water-sports"],
    "whale-watching": ["nature"],
    "wildlife": ["safari", "nature"],

    # Culture
    "K-pop": ["culture"],
    "aboriginal": ["culture"],
    "art": ["culture"],
    "festivals": ["culture"],
    "german-culture": ["culture"],
    "lanterns": ["culture"],
    "maori": ["culture"],
    "modern": ["culture"],
    "music": ["culture"],
    "opera": ["culture"],
    "skyscrapers": ["culture"],
    "street-art": ["culture"],
    "technology": ["culture"],

    # Food
    "beer": ["food"],
    "coffee": ["food"],
    "markets": ["food", "culture"],
    "night-market": ["food", "nightlife"],
    "pepper-farms": ["food"],
    "spice-tours": ["food"],
    "wine": ["food"],

    # History / heritage
    "architecture": ["heritage"],
    "bridge": ["heritage"],
    "colonial": ["history"],
    "forbidden-city": ["history", "heritage"],
    "great-wall": ["history", "heritage"],
    "lighthouse": ["heritage"],
    "museums": ["history"],
    "old-city": ["history"],
    "ruins": ["history"],
    "silk-road": ["history"],
    "towers": ["heritage"],
    "walled-city": ["history", "heritage"],

    # Spiritual
    "holy-sites": ["spiritual"],
    "mosques": ["spiritual"],
    "temples": ["spiritual"],

    # Nature
    "crater": ["nature"],
    "fjords": ["nature"],
    "gardens": ["nature"],
    "geothermal": ["nature"],
    "granite-boulders": ["nature"],
    "hot-springs": ["nature"],
    "mist": ["nature"],
    "rainbow": ["nature"],
    "rice-terraces": ["nature", "culture"],
    "rock": ["nature"],
    "scenic": ["nature"],
    "stargazing": ["nature"],
    "sunrise": ["nature"],
    "sunsets": ["nature", "romantic"],
    "volcanic": ["nature", "mountains"],
    "volcanoes": ["mountains", "nature"],

    # Forest
    "rainforest": ["forest"],

    # Desert
    "dunes": ["desert"],

    # Mountains / trekking
    "camel-treks": ["trekking", "desert"],
    "hiking": ["trekking"],
    "horse-trekking": ["trekking"],
    "summit": ["mountains", "trekking"],

    # Singular/plural and casing variants ("lakes" and "canyon" are the targets)
    "lake": ["lakes"],
    "lakeS": ["lakes"],
    "canyons": ["canyon"],

    # Luxury / wellness
    "duty-free": ["luxury"],
    "spa": ["luxury"],
    "wellness": ["luxury", "peaceful"],

    # Offbeat
    "hippie": ["offbeat"],
    "no-cars": ["offbeat"],
    "remote": ["offbeat"],
    "unique": ["offbeat"],
    "village": ["offbeat", "culture"],

    # Nightlife
    "party": ["nightlife"],
    "shopping": ["nightlife", "culture"],

    # Camping
    "yurts": ["camping"],

    # Drop (not meaningful destination tags): empty list = no replacement
    "family": [],
    "traffic": [],
    "windy": [],
}

# Load the allowed tag vocabulary. JSON text is UTF-8, so set the encoding
# explicitly instead of relying on the platform's default.
with open("data/tags.json", encoding="utf-8") as f:
    allowed_tags = set(json.load(f))

def clean_tags(tags_list):
    """Reduce one row's tags to the allowed vocabulary.

    A tag found in ``allowed_tags`` is kept as is; otherwise a tag found in
    ``tag_mapping`` is replaced by its mapped tag(s); any other tag is
    dropped. Duplicates are removed and first-seen order is preserved.
    """
    cleaned = {}  # dict keys give ordered de-duplication
    for tag in tags_list:
        replacements = [tag] if tag in allowed_tags else tag_mapping.get(tag, [])
        for replacement in replacements:
            cleaned.setdefault(replacement, None)
    return list(cleaned)

# Normalise every row's tag list.
df['tags'] = df['tags'].apply(clean_tags)

# Sanity check: gather every tag still in use and report any that fall
# outside the allowed vocabulary.
all_tags_after = set().union(*df['tags'])
remaining_extra = all_tags_after - allowed_tags
extra_tags_report = remaining_extra if remaining_extra else 'None ✓'
print(f"Extra tags remaining: {extra_tags_report}")
print(f"Tags now used: {sorted(all_tags_after)}")

Extra tags remaining: None ✓
Tags now used: ['adventure', 'beach', 'bird-watching', 'boating', 'camping', 'canyon', 'caves', 'culture', 'desert', 'food', 'forest', 'heritage', 'history', 'islands', 'lakes', 'luxury', 'mountains', 'nature', 'nightlife', 'offbeat', 'paragliding', 'peaceful', 'rafting', 'river', 'romantic', 'safari', 'skiing', 'spiritual', 'trekking', 'water-sports', 'waterfalls']


In [35]:
# Multi-hot encode the cleaned tag lists: one 0/1 column per distinct tag.
encode = MultiLabelBinarizer()
tags_encoded = encode.fit_transform(df['tags'])
tags_df = pd.DataFrame(
    tags_encoded,
    columns=encode.classes_,
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, tags_df], axis=1)

In [36]:
# The list-valued tags column is replaced by its per-tag indicator columns.
df = df.drop(columns=["tags"])

In [37]:
# Multi-hot encode the season lists: one 0/1 column per distinct entry.
encode = MultiLabelBinarizer()
season_encoded = encode.fit_transform(df['season'])
season_df = pd.DataFrame(
    season_encoded,
    columns=encode.classes_,
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, season_df], axis=1)

In [38]:
# The list-valued season column is replaced by its indicator columns.
df = df.drop(columns=["season"])

In [39]:
# Vocabulary that the best_for column is reduced to.
allowed_roles = {"solo", "couple", "family", "friends"}

# Audience labels outside that vocabulary -> the allowed role(s) used instead.
role_mapping = {
    # Couple variants
    "couples": ["couple"], "honeymoon": ["couple"], "romantic": ["couple"],

    # Solo-leaning activities
    "spiritual": ["solo"],
    "photographers": ["solo", "friends"],
    "business": ["solo"],
    "wellness": ["solo", "couple"],
    "relaxation": ["solo", "couple"],
    "peaceful": ["solo", "couple"],

    # Friends-leaning activities
    "adventure": ["friends", "solo"],
    "thrill": ["friends"], "thrill-seekers": ["friends"],
    "hikers": ["friends", "solo"],
    "divers": ["friends"],
    "offbeat": ["friends", "solo"],
    "self-drive": ["friends", "couple"],
    "short-trip": ["friends", "couple", "family"],
    "boating": ["friends", "family"],
    "beach": ["friends", "couple", "family"],

    # Family-leaning
    "nature": ["family", "couple"],
    "wildlife": ["family", "friends"],
    "waterfalls": ["family", "friends"],

    # Culture / food / history -> broad appeal
    "culture": ["solo", "couple", "friends"],
    "history": ["solo", "couple"],
    "history-buffs": ["solo", "friends"],
    "food": ["friends", "couple"], "food-lovers": ["friends", "couple"],
    "luxury": ["couple", "solo"],
}


def clean_best_for(roles_list):
    """Reduce one row's best_for labels to the allowed roles.

    Non-list input is returned unchanged. Within a list, a label found in
    ``allowed_roles`` is kept; otherwise a label found in ``role_mapping`` is
    replaced by its mapped role(s); any other label is dropped. Duplicates
    are removed and first-seen order is preserved.
    """
    if not isinstance(roles_list, list):
        return roles_list
    cleaned = {}  # dict keys give ordered de-duplication
    for role in roles_list:
        targets = [role] if role in allowed_roles else role_mapping.get(role, [])
        for target in targets:
            cleaned.setdefault(target, None)
    return list(cleaned)

# Normalise every row's best_for list.
df['best_for'] = df['best_for'].apply(clean_best_for)

# Sanity check: gather the roles still in use (skipping non-list cells) and
# report any that fall outside the allowed set.
all_roles_after = {
    role
    for roles in df['best_for']
    if isinstance(roles, list)
    for role in roles
}
remaining = all_roles_after - allowed_roles
extra_roles_report = remaining if remaining else 'None ✓'
print(f"Extra roles remaining: {extra_roles_report}")
print(f"Roles now used: {sorted(all_roles_after)}")

Extra roles remaining: None ✓
Roles now used: ['couple', 'family', 'friends', 'solo']


In [40]:
# Multi-hot encode the cleaned best_for lists: one 0/1 column per role.
encode = MultiLabelBinarizer()
role_encoded = encode.fit_transform(df['best_for'])
role_df = pd.DataFrame(
    role_encoded,
    columns=encode.classes_,
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, role_df], axis=1)

In [41]:
# The list-valued best_for column is replaced by its per-role indicator columns.
df = df.drop(columns=["best_for"])

In [42]:
# Total trip cost = daily cost x trip length, stored on a log(1 + x) scale.
total_cost = df['avg_cost_per_day'].mul(df['trip_duration'])
df['total_cost_log'] = np.log1p(total_cost)

In [43]:
# Standardise each numeric feature with its own StandardScaler and keep every
# fitted scaler in `scalers`. Re-fitting a single scaler per column would
# overwrite the mean/scale learned for the earlier columns, making it
# impossible to re-apply or invert those transforms later.
scalers = {}
for numeric_col in ['total_cost_log', 'avg_cost_per_day', 'trip_duration']:
    scalers[numeric_col] = StandardScaler()
    df[numeric_col] = scalers[numeric_col].fit_transform(df[[numeric_col]])

# `scaler` stays bound to the scaler fitted on trip_duration (the last column
# scaled), so any later cell that uses `scaler` sees the same fitted state.
scaler = scalers['trip_duration']

In [44]:
# Drop the raw popularity column; the popularity_* indicator columns remain.
df = df.drop(columns=["popularity"])

In [45]:
# Region -> the countries assigned to it. Several regions are deliberately
# broader than their name (see the per-entry comments).
_region_countries = {
    "South Asia": ["India"],
    # Southeast Asia (+ East Asia)
    "Southeast Asia": [
        "Cambodia", "China", "Indonesia", "Japan", "Laos", "Malaysia",
        "Myanmar", "Philippines", "Singapore", "South Korea", "Thailand",
        "Vietnam",
    ],
    "Europe": [
        "Albania", "Austria", "Belgium", "Bosnia and Herzegovina", "Bulgaria",
        "Croatia", "Czech Republic", "Denmark", "Estonia", "Finland", "France",
        "Georgia", "Germany", "Greece", "Hungary", "Iceland", "Ireland",
        "Italy", "Latvia", "Lithuania", "Monaco", "Montenegro", "Netherlands",
        "Norway", "Poland", "Portugal", "Romania", "Serbia", "Slovenia",
        "Spain", "Sweden", "Switzerland", "United Kingdom",
    ],
    # North America (+ Central America, Caribbean, South America)
    "North America": [
        "Argentina", "Bahamas", "Belize", "Bolivia", "Brazil", "Canada",
        "Chile", "Colombia", "Costa Rica", "Cuba", "Dominican Republic",
        "Ecuador", "Guatemala", "Honduras", "Jamaica", "Mexico", "Nicaragua",
        "Panama", "Paraguay", "Peru", "Puerto Rico", "United States",
        "Uruguay",
    ],
    # Middle East (+ Central Asia, North Africa)
    "Middle East": [
        "Egypt", "Israel", "Jordan", "Kazakhstan", "Kyrgyzstan", "Morocco",
        "Tajikistan", "Turkey", "Turkmenistan", "UAE", "Uzbekistan",
    ],
    "Africa": [
        "Botswana", "Kenya", "Mauritius", "Namibia", "Rwanda", "Seychelles",
        "South Africa", "Tanzania", "Uganda", "Zambia/Zimbabwe",
    ],
    "Oceania": ["Australia", "Fiji", "French Polynesia", "New Zealand"],
}

# Flattened lookup: country name -> region.
country_to_region = {
    country: region
    for region, countries in _region_countries.items()
    for country in countries
}

# Assign each row its region; countries missing from the lookup become NaN.
df['region'] = df['country'].map(country_to_region)

# Report any country left without a region, then the per-region row counts.
unmapped = df.loc[df['region'].isna(), 'country'].unique()
unmapped_report = list(unmapped) if len(unmapped) else 'None ✓'
print(f"Unmapped countries: {unmapped_report}")
region_counts = df['region'].value_counts()
print(f"\nRegion distribution:\n{region_counts}")

# The raw country column is not kept once the region has been derived.
df = df.drop(columns=["country"])

Unmapped countries: None ✓

Region distribution:
region
South Asia        279
North America     100
Europe             98
Southeast Asia     43
Africa             23
Middle East        22
Oceania            19
Name: count, dtype: int64


In [46]:
# One-hot encode region: one indicator column per distinct value.
encode = OneHotEncoder(sparse_output=False)
region_encoded = encode.fit_transform(df[['region']])
region_df = pd.DataFrame(
    region_encoded,
    columns=encode.get_feature_names_out(['region']),
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, region_df], axis=1)

In [47]:
# Drop the raw region column; the region_* indicator columns remain.
df = df.drop(columns=["region"])

In [48]:
# Inspect the frame after region encoding; climate is still a raw column here.
df

Unnamed: 0,place,avg_cost_per_day,trip_duration,climate,popularity_High,popularity_Medium,popularity_Offbeat,popularity_Very High,adventure,beach,...,friends,solo,total_cost_log,region_Africa,region_Europe,region_Middle East,region_North America,region_Oceania,region_South Asia,region_Southeast Asia
0,Manali,-0.719012,1.708384,Cold,1.0,0.0,0.0,0.0,1,0,...,1,0,-0.067446,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Shimla,-0.762413,-0.336075,Cold,0.0,1.0,0.0,0.0,1,0,...,0,0,-0.815898,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Dharamshala,-0.849214,-0.336075,Cold,0.0,1.0,0.0,0.0,0,0,...,0,1,-1.014866,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,McLeod Ganj,-0.936016,-0.336075,Cold,0.0,1.0,0.0,0.0,1,0,...,0,1,-1.250184,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Kasol,-1.022818,-0.336075,Cold,0.0,1.0,0.0,0.0,1,0,...,1,1,-1.538171,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,Tokyo,0.366009,1.708384,Temperate,0.0,0.0,0.0,1.0,0,0,...,1,1,1.198766,0.0,0.0,0.0,0.0,0.0,0.0,1.0
580,Kyoto,0.149005,0.686154,Temperate,0.0,0.0,0.0,1.0,0,0,...,1,1,0.738302,0.0,0.0,0.0,0.0,0.0,0.0,1.0
581,Seoul,0.149005,0.686154,Temperate,0.0,0.0,0.0,1.0,0,0,...,1,1,0.738302,0.0,0.0,0.0,0.0,0.0,0.0,1.0
582,Beijing,-0.068000,0.686154,Continental,0.0,0.0,0.0,1.0,0,0,...,1,1,0.539297,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [49]:
# One-hot encode the grouped climate: one indicator column per distinct value.
encode = OneHotEncoder(sparse_output=False)
climate_encoded = encode.fit_transform(df[['climate']])
climate_df = pd.DataFrame(
    climate_encoded,
    columns=encode.get_feature_names_out(['climate']),
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    index=df.index,
)
df = pd.concat([df, climate_df], axis=1)

In [50]:
# Drop the raw climate column; the climate_* indicator columns remain.
df = df.drop(columns=["climate"])

In [51]:
# Inspect the frame after climate encoding.
df

Unnamed: 0,place,avg_cost_per_day,trip_duration,popularity_High,popularity_Medium,popularity_Offbeat,popularity_Very High,adventure,beach,bird-watching,...,region_Southeast Asia,climate_Alpine,climate_Cold,climate_Continental,climate_Dry,climate_Highland,climate_Mediterranean,climate_Subtropical,climate_Temperate,climate_Tropical
0,Manali,-0.719012,1.708384,1.0,0.0,0.0,0.0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Shimla,-0.762413,-0.336075,0.0,1.0,0.0,0.0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Dharamshala,-0.849214,-0.336075,0.0,1.0,0.0,0.0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,McLeod Ganj,-0.936016,-0.336075,0.0,1.0,0.0,0.0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Kasol,-1.022818,-0.336075,0.0,1.0,0.0,0.0,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,Tokyo,0.366009,1.708384,0.0,0.0,0.0,1.0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
580,Kyoto,0.149005,0.686154,0.0,0.0,0.0,1.0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
581,Seoul,0.149005,0.686154,0.0,0.0,0.0,1.0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
582,Beijing,-0.068000,0.686154,0.0,0.0,0.0,1.0,0,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:

# Cast 0/1 indicator columns to integer dtype. Missing values are ignored when
# deciding whether a column is binary, so pick a dtype that can hold them:
# plain int64 cannot represent NaN and astype('int64') would raise.
for col in df.columns:
    observed = df[col].dropna()
    # Skip columns with no observed values, or with any value other than 0/1.
    if observed.empty or not observed.isin([0, 1]).all():
        continue
    has_missing = len(observed) != len(df[col])
    # Nullable 'Int64' keeps missing entries as <NA>; complete columns stay int64.
    df[col] = df[col].astype('Int64' if has_missing else 'int64')


In [55]:
# Summarise the final frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 71 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   place                  584 non-null    object 
 1   avg_cost_per_day       584 non-null    float64
 2   trip_duration          584 non-null    float64
 3   popularity_High        584 non-null    int64  
 4   popularity_Medium      584 non-null    int64  
 5   popularity_Offbeat     584 non-null    int64  
 6   popularity_Very High   584 non-null    int64  
 7   adventure              584 non-null    int64  
 8   beach                  584 non-null    int64  
 9   bird-watching          584 non-null    int64  
 10  boating                584 non-null    int64  
 11  camping                584 non-null    int64  
 12  canyon                 584 non-null    int64  
 13  caves                  584 non-null    int64  
 14  culture                584 non-null    int64  
 15  desert