# Travel Destination Analysis & Recommendation System

# Imports

# In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns

# Load and clean data

# In [7]:
# Path to the destinations CSV, relative to the current working directory.
csv_path = '../data/destinations_cleaned.csv'
# Load the CSV into the DataFrame, decoding the file as UTF-8.
df = pd.read_csv(filepath_or_buffer=csv_path, encoding="utf-8")


# Replace missing or empty descriptions

# In [8]:
# Normalise the "Description" column in a single pass:
#   1. missing values become empty strings,
#   2. empty / whitespace-only strings become the placeholder text,
#   3. anything that is still not a string (e.g. a number) becomes the placeholder too.
df["Description"] = (
    df["Description"]
    .fillna("")
    .replace(r"^\s*$", "Description not found.", regex=True)
    .apply(lambda text: text if isinstance(text, str) else "Description not found.")
)

# Clean approximate annual tourists column

# In [9]:
def parse_tourist_count(value):
    """Convert a free-form tourist-count cell into a number of millions.

    Accepted string forms (case-insensitive, thousands separators ignored):

    * ``"1.5 million"``          -> ``1.5``
    * ``"2-4 million"``          -> ``3.0`` (mean of the range bounds)
    * ``"14,000,000"``           -> ``14.0`` (raw count divided by 1e6)
    * ``"1,000,000-3,000,000"``  -> ``2.0``

    Ranges may be written with a hyphen, an en dash or an em dash.

    Args:
        value: The raw cell value. ``int``/``float`` values are returned
            unchanged; missing values (per ``pd.isna``) give ``None``;
            anything else is converted with ``str()`` and parsed.

    Returns:
        The count in millions as a ``float``, the untouched input for
        numeric values, or ``None`` when the value is missing or cannot
        be parsed.

    NOTE(review): numeric inputs are passed through without dividing by
    1e6, unlike numeric *strings* without the word "million" -- confirm
    that numeric cells are already expressed in millions.
    """
    if isinstance(value, (int, float)):
        return value
    if pd.isna(value):
        return None

    text = str(value).lower().strip().replace(',', '')
    # Treat typographic dashes like a plain hyphen so "2–4 million" is
    # recognised as a range instead of being rejected.
    for dash in ("\u2013", "\u2014"):
        text = text.replace(dash, "-")

    if "million" in text:
        # Value is already expressed in millions.
        text = text.replace("million", "").strip()
        divisor = 1.0
    else:
        # Plain count: scale down to millions.
        divisor = 1e6

    # A value without a dash yields a single piece, so ranges and single
    # numbers share the same code path.
    pieces = [piece.strip() for piece in text.split('-')]
    try:
        numbers = [float(piece) for piece in pieces if piece]
    except ValueError:
        return None

    # Inputs such as "-" or "million" leave nothing to average; report
    # them as unparseable rather than dividing by zero.
    if not numbers:
        return None

    return sum(numbers) / len(numbers) / divisor

# Derive a numeric "(million)" column from the free-form tourist counts,
# but only when the source column is present in the loaded data.
if "Approximate Annual Tourists" in df:
    raw_counts = df["Approximate Annual Tourists"]
    df["Approximate Annual Tourists(million)"] = raw_counts.apply(parse_tourist_count)

# EDA: Tourist Distribution by Country