## Training ML Model to Generate Packages

## Setup & Imports

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import joblib
import random
import warnings
import os
# Suppress every warning category so cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG and the stdlib RNG with the same value so that
# repeated runs start from the same random state.
for seed_rng in (np.random.seed, random.seed):
    seed_rng(42)

# None removes the cap on how many columns pandas shows when displaying frames.
pd.options.display.max_columns = None

## Load Training Data

Load the training dataset, which contains historical customer behavior and outcomes.

In [6]:
# Location of the training CSV, relative to the notebook's working directory.
TRAINING_DATA_PATH = './PACKAGE_TRAINING_DATA.csv'  # UPDATE THIS PATH!

# Read the whole file into a DataFrame.
training_df = pd.read_csv(filepath_or_buffer=TRAINING_DATA_PATH)

# Preview the first three rows as a quick sanity check of the load.
display(training_df.iloc[:3])

Unnamed: 0,TrainingID,CustomerID,CustomerAge,CustomerGender,TotalPageViews,TotalSearches,PackageViewCount,HotelViewCount,FlightSearchCount,DaysActiveLast30,AvgSessionDuration,PreviousPurchaseCount,AvgPreviousPackagePrice,DaysSinceLastPurchase,PackageID,DestinationCountry,DestinationCity,TripDurationDays,HotelID,HotelRating,HotelRoomType,HotelBedType,HotelPricePerNight,NumOfNights,NumOfGuests,OutboundFlightNumber,OutboundDepartureCity,OutboundDepartureCountry,OutboundAirline,OutboundFlightPrice,InboundFlightNumber,InboundAirline,InboundFlightPrice,NumOfPassengers,HasCar,CarLicenseNo,CarMaxSpeed,CarNumOfPassengers,CarPrice,TotalPackagePrice,HotelCostTotal,FlightCostTotal,CarCostTotal,BookingDate,DayOfWeek,Month,Quarter,DaysInAdvance,IsWeekend,IsHoliday,Season,WasPublished,HasReview,ReviewRating
0,1,144,49,Male,,,,,,0,0.0,0,0.0,,326,USA,Chicago,7,146.0,3.0,Superior Double Room,1 full bed,118.0,12,2,SQ4152,Dallas,USA,Alitalia,918,SQ6637,Alitalia,465,2,1,MA-5094-RR,102.0,4.0,65,1844.0,1416,2766,65,2025-09-06,6,9,3,-208,1,0,Fall,1,1,5.0
1,2,144,49,Male,,,,,,0,0.0,0,0.0,,326,USA,Chicago,7,146.0,3.0,Superior Double Room,1 full bed,118.0,4,1,SQ4152,Dallas,USA,Alitalia,918,SQ6637,Alitalia,465,1,1,MA-5094-RR,102.0,4.0,65,1636.0,472,1383,65,2025-09-05,5,9,3,-207,0,0,Fall,1,1,5.0
2,3,69,52,Female,,,,,,0,0.0,1,1748.0,90.0,327,USA,Seattle,7,5.0,4.0,Junior Suite,1 queen bed,243.0,10,1,SQ3494,Charlotte,USA,British Airways,347,UA2796,ANA,997,3,1,IL-6641-ER,103.0,4.0,67,2132.0,2430,4032,67,2025-10-27,1,10,4,-253,0,0,Fall,1,1,5.0


## Data Cleaning & Preparation

In [7]:
# Remove exact duplicate rows.
training_df = training_df.drop_duplicates()

# Replace negative DaysInAdvance values with their absolute value.
# NOTE(review): this assumes negative values are a sign-convention artifact
# rather than invalid records — confirm against how the column is produced.
if 'DaysInAdvance' in training_df.columns:
    neg_days = (training_df['DaysInAdvance'] < 0).sum()
    if neg_days > 0:
        training_df['DaysInAdvance'] = training_df['DaysInAdvance'].abs()

# Impute missing values in every numeric column with that column's median.
# The filled Series is assigned back to the frame instead of calling
# `training_df[col].fillna(..., inplace=True)`: that form mutates a temporary
# Series obtained by chained indexing, which pandas deprecates and which has
# no effect on `training_df` when Copy-on-Write is enabled, silently leaving
# the NaNs in place.
numeric_cols = training_df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if training_df[col].isnull().any():
        training_df[col] = training_df[col].fillna(training_df[col].median())

## Feature Engineering

In [8]:
# Columns describing customer behavior; these are the clustering inputs.
behavioral_features = [
    'CustomerAge',
    'TotalPageViews',
    'TotalSearches',
    'DaysActiveLast30',
    'AvgSessionDuration',
    'PreviousPurchaseCount',
    'AvgPreviousPackagePrice',
]

# Take an independent frame holding only the behavioral columns and replace
# any remaining missing values with 0.
X_train = training_df.loc[:, behavioral_features].fillna(0)

# Summary statistics of the feature matrix (displayed as the cell output).
X_train.describe()

Unnamed: 0,CustomerAge,TotalPageViews,TotalSearches,DaysActiveLast30,AvgSessionDuration,PreviousPurchaseCount,AvgPreviousPackagePrice
count,2009.0,2009.0,2009.0,2009.0,2009.0,2009.0,2009.0
mean,45.433549,10.980587,5.036834,6.205575,0.300163,0.044798,78.032106
std,16.578279,8.562875,4.437328,9.714656,0.481114,0.220882,385.865102
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,31.0,9.0,4.0,0.0,0.0,0.0,0.0
50%,45.0,9.0,4.0,0.0,0.0,0.0,0.0
75%,60.0,9.0,4.0,11.0,0.5,0.0,0.0
max,75.0,59.0,29.0,30.0,4.0,2.0,2615.0


## Feature Scaling

In [9]:
# Fit the scaler on the training features; the fitted object is kept so it
# can be saved and reused later.
scaler = StandardScaler().fit(X_train)

# Standardized feature matrix used as input to clustering.
X_train_scaled = scaler.transform(X_train)

## Train K-Means Clustering Model

In [10]:
# Candidate cluster counts to evaluate: 2 through 7 inclusive.
K_range = range(2, 8)

# Fit one K-Means model per candidate k and record its silhouette score.
silhouette_scores = []
for k in K_range:
    candidate_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    candidate_labels = candidate_model.fit_predict(X_train_scaled)
    silhouette_scores.append(silhouette_score(X_train_scaled, candidate_labels))

# Choose the k with the highest score; on ties the smallest k wins because
# argmax returns the first maximum.
best_position = np.argmax(silhouette_scores)
best_k = K_range[best_position]
best_score = silhouette_scores[best_position]
print(f"Optimal k: {best_k}")

# Refit the final model with the selected k and keep its cluster assignments.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_train_scaled)
segment_labels = kmeans.labels_

# Attach each row's cluster label to the training frame.
training_df['Segment'] = segment_labels

Optimal k: 3


## Analyze Segment Profiles

In [11]:
# Profile key -> source column, averaged for every segment.
base_metric_columns = {
    'avg_age': 'CustomerAge',
    'avg_page_views': 'TotalPageViews',
    'avg_searches': 'TotalSearches',
    'avg_session_duration': 'AvgSessionDuration',
    'avg_previous_purchases': 'PreviousPurchaseCount',
}

# Profile key -> source column, averaged only when TotalPackagePrice exists.
package_metric_columns = {
    'avg_package_price': 'TotalPackagePrice',
    'avg_trip_duration': 'TripDurationDays',
    'avg_guests': 'NumOfGuests',
}

# Maps each segment label to its summary profile.
segment_profiles = {}

for segment in sorted(training_df['Segment'].unique()):
    # Rows assigned to this cluster.
    segment_data = training_df[training_df['Segment'] == segment]

    profile = {'segment_id': segment, 'size': len(segment_data)}
    for metric_key, source_column in base_metric_columns.items():
        profile[metric_key] = segment_data[source_column].mean()

    if 'TotalPackagePrice' in segment_data.columns:
        for metric_key, source_column in package_metric_columns.items():
            profile[metric_key] = segment_data[source_column].mean()

    # Name the segment by the first rule that matches, checked in priority
    # order: purchase history, then package price, then browsing volume.
    segment_name = (
        "Loyal Customer" if profile['avg_previous_purchases'] >= 2
        else "High-Value Customer" if profile.get('avg_package_price', 0) > 2000
        else "Window Shopper" if profile['avg_page_views'] > 10
        else "Potential Customer"
    )

    profile['name'] = segment_name
    segment_profiles[segment] = profile

    print(f"Segment {segment}: {segment_name}")

Segment 0: Window Shopper
Segment 1: Potential Customer
Segment 2: Potential Customer


## Save Trained Model

In [12]:
# Make sure the output directory exists; no error if it is already there.
os.makedirs('./models', exist_ok=True)

# Persist the fitted model, the fitted scaler, and the feature-name list.
for artifact, target_path in (
    (kmeans, './models/kmeans_model.pkl'),
    (scaler, './models/scaler.pkl'),
    (behavioral_features, './models/feature_names.pkl'),
):
    joblib.dump(artifact, target_path)

# The segment profiles are saved last, as the cell's trailing expression, so
# the notebook echoes the path that was written.
joblib.dump(segment_profiles, './models/segment_profiles.pkl')

['./models/segment_profiles.pkl']