In [16]:
import pandas as pd
import numpy as np
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier

In [17]:
# Load the places dataset
places_df = pd.read_csv("places.csv")
# Load the plans dataset
plans_df = pd.read_csv("plans.csv")

In [18]:
places_df['budget'] = places_df['budget'].apply(lambda x: x.split(','))
budget_categories = ['low', 'medium', 'high']
for category in budget_categories:
    places_df[f'budget_{category}'] = places_df['budget'].apply(lambda x: 1 if category in x else 0)

#drop budget
places_df.drop('budget', axis=1, inplace=True)

In [19]:
# Step 1: Parse the plans
def extract_place_ids(plan_str):
    day_plan = json.loads(plan_str)
    all_places = []
    for day_places in day_plan.values():
        all_places.extend(day_places)
    return all_places

plans_df["place_ids"] = plans_df["plan"].apply(extract_place_ids)

In [20]:
plans_df.head(3)

Unnamed: 0,no_of_days,plan,place_ids
0,1,"{""1"": [429, 545, 627, 500]}","[429, 545, 627, 500]"
1,1,"{""1"": [505, 443, 574, 555]}","[505, 443, 574, 555]"
2,2,"{""1"": [429, 627, 605], ""2"": [489, 500, 432]}","[429, 627, 605, 489, 500, 432]"


In [21]:
# Step 2: Create a lookup dict for place_id -> metadata
places_df["place_id"] = places_df["place_id"].astype(int)
places_meta = places_df.set_index("place_id").to_dict(orient="index")

In [48]:

def extract_features_from_plan(place_ids, no_of_days):
    places = [places_meta[pid] for pid in place_ids if pid in places_meta]
    if not places:
        return None

    df = pd.DataFrame(places)
    feature_dict = defaultdict(float)

    # Categorical percentages for single-valued columns
    for col in ["category", "tourism_type"]:
        value_counts = df[col].value_counts(normalize=True)
        for cat, pct in value_counts.items():
            feature_dict[f"{col}_{cat.lower().strip()}"] = pct

    # Handle multi-label "budget" column (e.g., "medium, high")
    if "budget" in df.columns:
        all_budgets = df["budget"].dropna().str.split(",\s*").explode()
        budget_counts = all_budgets.value_counts(normalize=True)
        for budget, pct in budget_counts.items():
            feature_dict[f"budget_{budget.lower().strip()}"] = pct

    # One-hot encode most common popularity category
    if "popularity_category" in df.columns:
        try:
            most_common_pop = df["popularity_category"].mode()[0]
            feature_dict[f"popularity_{most_common_pop.lower().strip()}"] = 1
        except IndexError:
            pass

    # # One-hot encode most common city_id
    # if "city_id" in df.columns:
    #     try:
    #         most_common_city = df["city_id"].mode()[0]
    #         feature_dict[f"city_id_{int(most_common_city)}"] = 1
    #     except IndexError:
    #         pass
    # One-hot encode "with_who" if available
    if "with_who" in df.columns:
        try:
            most_common_with_who = df['with_who'].mode()[0]
            parts = re.split(r",\s*|\s+and\s+", most_common_with_who.lower().strip())
            for part in parts:
                clean_part = part.strip()
                if clean_part:
                    feature_dict[f"with_who_{clean_part}"] = 1
        except IndexError:
            pass  # no mode value found



    # Add number of days
    feature_dict["no_of_days"] = no_of_days

    return dict(feature_dict)


In [49]:
features = []
targets = []

for _, row in plans_df.iterrows():
    f = extract_features_from_plan(row["place_ids"], row["no_of_days"])
    if f:
        features.append(f)
        targets.append(row["place_ids"])

In [24]:
features[0]

{'category_library': 0.25,
 'category_museum': 0.25,
 'category_theater': 0.25,
 'category_garden': 0.25,
 'tourism_type_cultural and historical attractions': 0.75,
 'tourism_type_entertainment and modern attractions': 0.25,
 'popularity_medium': 1,
 'with_who_couple': 1,
 'with_who_friends': 1,
 'no_of_days': 1}

In [50]:

# Convert features and targets
X = pd.DataFrame(features).fillna(0)
mlb = MultiLabelBinarizer()
Y = pd.DataFrame(mlb.fit_transform(targets), columns=mlb.classes_)

# Split and train XGBoost multi-label classifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [26]:
model = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
model.fit(X_train, Y_train)

In [27]:
# Evaluate
Y_pred = model.predict(X_test)
report = classification_report(Y_test, Y_pred, target_names=[str(c) for c in Y.columns], zero_division=0, output_dict=True)


In [28]:

# Return top-level evaluation summary
summary_report = {
    "micro avg": report["micro avg"],
    "macro avg": report["macro avg"],
    "weighted avg": report["weighted avg"]
}

X.shape, Y.shape, summary_report


((85, 28),
 (85, 25),
 {'micro avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 165},
  'macro avg': {'precision': 0.96,
   'recall': 0.96,
   'f1-score': 0.96,
   'support': 165},
  'weighted avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 165}})

In [None]:
def predict_places_from_input(no_of_days, category_weights=None, tourism_type_weights=None, budget=None, popularity=None, with_who=None):
    # Create a feature dict in the same way the training data was processed
    feature_dict = {}

    # 1. Categorical features for category and tourism type
    if category_weights:
        for cat, pct in category_weights.items():
            feature_dict[f"category_{cat.lower().strip().replace(' ', '_')}"] = pct

    if tourism_type_weights:
        for ttype, pct in tourism_type_weights.items():
            feature_dict[f"tourism_type_{ttype.lower().strip().replace(' ', '_')}"] = pct

    # 2. Budget (multi-label, should be normalized like during training)
    if budget:
        feature_dict[f"budget_{budget.lower().strip().replace(' ', '_')}"] = 1

    # 3. Popularity category (one-hot encoded)
    if popularity:
        feature_dict[f"popularity_{popularity.lower().strip().replace(' ', '_')}"] = 1


    if with_who:
        parts = re.split(r",\s*|\s+and\s+", with_who.lower().strip())
        for part in parts:
            clean_part = part.strip()
            if clean_part:
                feature_dict[f"with_who_{clean_part.replace(' ', '_')}"] = 1
    # 5. Number of days
    feature_dict["no_of_days"] = no_of_days

    # Convert the feature dict into a DataFrame, using the same columns as the training data
    x_input = pd.DataFrame([feature_dict], columns=X.columns).fillna(0)

    # Make the prediction
    y_pred = model.predict(x_input)

    # Convert the multi-label output back to place IDs
    predicted_place_ids = mlb.inverse_transform(y_pred)[0]  # Will give a list of place_ids

    return predicted_place_ids


In [52]:
x=predicted_places = predict_places_from_input(
    no_of_days=3,
    category_weights={"zoo": 1,"shopping":1,'garden':1},
    tourism_type_weights={"cultural": 1, "entertainment": 1},
    budget="medium",
    popularity="high",
    with_who="solo"
)

print(predicted_places)


(432, 487, 489, 500, 545, 620, 692)


In [53]:
filtered = places_df[places_df["place_id"].isin(x
)]
filtered = filtered[["place_id","name"]]
filtered

Unnamed: 0,place_id,name
3,500,Alexandria Montaza Gardens
4,620,Alexandria National Museum
6,692,Alexandria Zoo
9,545,Bibliotheca Alexandrina Antiquities Museum
10,487,City Centre Alexandria
17,489,Montaza Palace
23,432,Royal Jewelry Museum


In [54]:
no_of_days=3,

In [42]:
!pip install -U google-genai




[notice] A new release of pip is available: 23.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [55]:
from google import genai

client = genai.Client(api_key="------")

response = client.models.generate_content(
    model="gemini-2.0-flash", contents=
    f"""
    You're an expert tour guide. Design an itinerary planner, arranging the places optimally based on the provided places data.
    Given the following places ({filtered} the result as JSON with a structure like:
    {{
        "no_of_days": {no_of_days}
        "plan": {{
            "1": [place_id1, place_id2],
            "2": [place_id3, place_id4],
            "3": [place_id5, place_id6]
        "description": simple inspiring paragraph for the plan, 1 sentence + emoji
        }}
    }}
    Ensure the output is optimal for a {no_of_days}-day itinerary (or other number of days as specified in the input), where each day includes a set of place_ids for the plan. Return the result as JSON.
    Do not include any extra text, explanations, or formatting. Only return the JSON output.

    """
)
print(response.text)

```json
{
    "no_of_days": 3,
    "plan": {
        "1": [
            487,
            692,
            620
        ],
        "2": [
            500,
            489
        ],
        "3": [
            432,
            545
        ]
    },
    "description": "Explore Alexandria's historical treasures and beautiful gardens in this amazing 3-day trip! 🏛️"
}
```


In [57]:
import joblib

# Save model and other necessary components
joblib.dump(model, "model.pkl")
joblib.dump(X.columns.tolist(), "feature_columns.pkl")
joblib.dump(mlb, "mlb.pkl")


['mlb.pkl']