## Customer Clustering

### Imports

In [1]:
import gc
import os
import re
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs

from sklearn.cluster import KMeans, DBSCAN, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# import warnings
# warnings.filterwarnings('ignore')

### Data

In [2]:
# data = pl.concat([train, pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/test.parquet')])
data = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/train.parquet').drop('selected')

In [3]:
test_data = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/test.parquet')

In [8]:
new_ids = test_data.join(data, on='profileId', how='anti')
print(f"The number of profiles in train data is {len(data.select('profileId').unique())}")
print(f"The number of profiles in test data is {len(test_data.select('profileId').unique())}")
print(f"the number of new profiles in test data is {len(new_ids.select("profileId").unique())}")

The number of profiles in train data is 32922
The number of profiles in test data is 18981
the number of new profiles in test data is 8281


In [6]:
len(test_data)

6897776

### Utils

In [35]:
def camel_to_snake(name):
    """Convert camelCase or PascalCase to snake_case"""
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1)
    return s2.lower()

def convert_columns_to_snake_case(df):
    """Convert all column names to snake_case"""
    return df.rename({col: camel_to_snake(col) for col in df.columns})

In [36]:
def prepare_features_for_clustering(df, drop_cols=None):
    # Features to drop
    if drop_cols is None:
        drop_cols = ['id', 'ranker_id', 'request_date']

    # Identify categorical features (non-numeric columns)
    categorical_features = [col for col, dtype in df.schema.items() if not dtype.is_numeric()]

    # Using lazy computations to avoid memory issues
    lazy_df = df.lazy()

    # Encode categorical features
    encoding_expressions = []
    encoders = {}  # Mapping of feature name to encoding dictionary so we can evaluate cluster assignments

    for feature in categorical_features:
        # Get unique values for the feature
        unique_values = df[feature].fill_null('MISSING').unique().sort().to_list()

        # Create mapping dictionary
        encoders[feature] = {val: idx for idx, val in enumerate(unique_values)}

        # Convert to categorical then to physical (integer codes)
        encoding_expr = (
            pl.col(feature)
            .fill_null('MISSING')
            .cast(pl.Categorical)
            .to_physical()
            .alias(f'{feature}_encoded')
        )
        encoding_expressions.append(encoding_expr)

    # Apply encodings and exclude original categorical features
    result_df = lazy_df.with_columns(encoding_expressions).select(pl.exclude(categorical_features + drop_cols)).fill_null(0).collect()

    return result_df, encoders

### Engineer Customer Features

In [37]:
# Customer attributes for clustering analysis
# CUSTOMER_ATTRIBUTES = ['profileId', 'companyID', 'sex', 'nationality', 'frequentFlyer', 'isVip', 'bySelf', 'corporateTariffCode']
UNNEEDED_ATTRIBUTES = [
    'ranker_id', 'isAccess3D', 'totalPrice', 'taxes', 'legs0_arrivalAt', 'legs0_duration', 'frequent_flyer',
    r'^legs1_(departureAt|arrivalAt|duration)$'
    r'^legs[01]_segments[0-3]_(operatingCarrier_code|aircraft_code|flightNumber)$',
    r'^legs[01]_segments[0-3]_(arrivalTo|baggage|seats).*$'
]
POLARS_INDEX_COL = ['__index_level_0__']
MAJOR_HUBS = ['ATL','DXB','DFW','HND','LHR','DEN','ORD','IST','PVG','ICN','CDG', 'JFK','CLT','MEX','SFO','EWR','MIA','BKK','GRU','HKG']


def get_cabin_class_columns(df: pl.DataFrame) -> List[str]:
    """Get all cabin class columns from the dataframe."""
    columns = df.columns
    return [col for col in columns if col.startswith('legs') and col.endswith('_cabinClass')]


def create_customer_aggregation_features() -> List[pl.Expr]:
    """Create customer aggregation expressions for basic attributes and search behavior."""
    return [
        # Basic customer attributes (take first non-null value per customer)
        pl.col('companyID').drop_nulls().first().alias('companyID'),
        pl.col('sex').drop_nulls().first().alias('sex'),
        pl.col('nationality').drop_nulls().first().alias('nationality'),
        pl.col('frequentFlyer').drop_nulls().first().alias('frequentFlyer'),
        pl.col('isVip').drop_nulls().first().alias('isVip'),
        pl.col('bySelf').drop_nulls().first().alias('bySelf'),
        pl.col('corporateTariffCode').drop_nulls().first().alias('corporateTariffCode'),

        # Normalized frequentFlyer program, addressing null values as null strings, and translating UT program
        pl.col('frequentFlyer').drop_nulls().first().str.replace('- ЮТэйр ЗАО', 'UT').fill_null('').alias('ff_normalized'),

        # Search behavior metrics
        pl.len().alias('total_searches'),
        pl.col('legs1_departureAt').is_not_null().mean().alias('roundtrip_preference'),
        pl.col('searchRoute').drop_nulls().n_unique().alias('unique_routes_searched'),
    ]


def create_booking_lead_time_features() -> List[pl.Expr]:
    """Create booking lead time statistics."""
    # Calculate booking lead time in days
    booking_lead_expr = (
        (pl.col('legs0_departureAt').str.to_datetime() -
         pl.col('requestDate').cast(pl.Datetime)) / pl.duration(days=1)
    ).cast(pl.Int32)

    return [
        booking_lead_expr.min().alias('min_booking_lead_days'),
        booking_lead_expr.max().alias('max_booking_lead_days'),
        booking_lead_expr.mean().alias('avg_booking_lead_days'),
        booking_lead_expr.median().alias('median_booking_lead_days'),
    ]


def create_travel_preference_features() -> List[pl.Expr]:
    """Create travel preference features for most common airports and carriers."""
    return [
        # Most common departure airport
        pl.col('legs0_segments0_departureFrom_airport_iata').drop_nulls().mode().first().alias('most_common_departure_airport'),
        pl.col('legs0_segments0_departureFrom_airport_iata').drop_nulls().n_unique().alias('unique_departure_airports'),

        # Most common marketing carrier
        pl.col('legs0_segments0_marketingCarrier_code').drop_nulls().mode().first().alias('most_common_carrier'),
        pl.col('legs0_segments0_marketingCarrier_code').drop_nulls().n_unique().alias('unique_carriers_used'),
    ]


def create_cabin_class_features(cabin_class_cols: List[str]) -> List[pl.Expr]:
    """Create cabin class preference statistics."""
    if not cabin_class_cols:
        # Return default values if no cabin class columns found
        return [
            pl.lit(None).alias('min_cabin_class'),
            pl.lit(None).alias('max_cabin_class'),
            pl.lit(None).alias('avg_cabin_class'),
        ]

    return [
        # Cabin class statistics across all segments
        pl.min_horizontal([pl.col(col) for col in cabin_class_cols]).min().alias('min_cabin_class'),
        pl.max_horizontal([pl.col(col) for col in cabin_class_cols]).max().alias('max_cabin_class'),
        pl.mean_horizontal([pl.col(col) for col in cabin_class_cols]).mean().alias('avg_cabin_class'),
    ]


def create_temporal_preference_features() -> List[pl.Expr]:
    """Create temporal preference features for departure patterns."""
    return [
        # Weekday preference (most common day of week for departures)
        pl.col('legs0_departureAt').str.to_datetime().dt.weekday()
          .mode().first().alias('weekday_preference'),

        # Weekend travel rate (percentage of weekend departures - 5=Sat, 6=Sun)
        pl.col('legs0_departureAt').str.to_datetime().dt.weekday()
          .map_elements(lambda x: 1 if x >= 5 else 0, return_dtype=pl.Int8)
          .mean().alias('weekend_travel_rate'),

        # Time of day variance (how consistent are their departure times)
        pl.col('legs0_departureAt').str.to_datetime().dt.hour()
          .std().alias('time_of_day_variance'),

        # Night flight preference (flights departing 22:00-06:00)
        pl.col('legs0_departureAt').str.to_datetime().dt.hour()
          .map_elements(lambda x: 1 if (x >= 22 or x < 6) else 0, return_dtype=pl.Int8)
          .mean().alias('night_flight_preference')
    ]


def create_route_specific_features() -> List[pl.Expr]:
    """Create features related to route preferences and characteristics."""

    return [
        # Route loyalty (how frequently they search the same routes)
        (pl.col('searchRoute').n_unique() / pl.len())
          .map_elements(lambda x: 1 - x if x > 0 else 0)  # Invert so higher = more loyal
          .alias('route_loyalty'),

        # Hub preference (preference for major hub airports)
        pl.concat_list([
            pl.col('legs0_segments0_departureFrom_airport_iata').is_in(MAJOR_HUBS),
            pl.col('legs0_segments0_arrivalTo_airport_iata').is_in(MAJOR_HUBS)
        ]).list.mean().alias('hub_preference'),

        # Connection tolerance (preference for flights with connections)
        pl.col('total_segments').mean().alias('connection_tolerance'),

        # Short haul preference
        (1 - (pl.col('legs0_duration').str.extract(r'^(\d+):(\d+)', 1).cast(pl.Int32) / 12))
          .clip(0, 1).alias('short_haul_preference'),

        # Domestic/international ratio based on route length
        # Assuming routes with same first letter in IATA codes are likely domestic
        pl.col('searchRoute').map_elements(
            lambda route: 1 if route and route[:1] == route[3:4] else 0,
            return_dtype=pl.Int8
        ).mean().alias('domestic_international_ratio')
    ]


def create_price_sensitivity_features() -> List[pl.Expr]:
    """Create features related to price sensitivity and patterns."""
    return [
        # Price position preference (typical percentile chosen)
        pl.col('price_percentile').mean().alias('price_position_preference'),

        # Price to duration sensitivity
        # Higher values mean more willing to pay for shorter flights
        pl.covar(
            pl.col('totalPrice'),
            pl.col('total_duration') * -1  # Negative so higher = more sensitive
        ).alias('price_to_duration_sensitivity'),

        # Premium economy preference (assuming cabin class 2 is premium economy)
        pl.mean_horizontal([
            pl.col(f'legs0_segments{i}_cabinClass') == 2
            for i in range(4)
        ]).mean().alias('premium_economy_preference'),

        # Consistent price tier (lower variance = more consistent)
        pl.col('price_tier').std().map_elements(
            lambda x: 1 - min(x / 3, 1) if x is not None else 0.5  # Invert and normalize
        ).alias('consistent_price_tier')
    ]


def create_service_preference_features() -> List[pl.Expr]:
    """Create features related to service preferences."""
    return [
        # Baggage preference (average selected baggage allowance)
        pl.concat_list([
            pl.col(f'legs0_segments{i}_baggageAllowance_quantity')
            for i in range(4)
        ]).list.mean().alias('baggage_preference'),

        # Loyalty program utilization
        pl.col('frequentFlyer').is_not_null().mean().alias('loyalty_program_utilization')
    ]


def create_derived_metrics() -> List[pl.Expr]:
    """Create complex derived metrics from combinations of features."""
    return [
        # Price flexibility index (higher price variance / booking rate = more flexible)
        (pl.col('totalPrice').std() /
         pl.col('booking_rate').clip(0.01, 1))
        .alias('price_flexibility_index'),

        # Convenience priority score (higher = more emphasis on convenient times)
        ((1 - pl.col('time_of_day_variance')) * 10 +
         pl.col('price_to_duration_sensitivity') * 5)
        .alias('convenience_priority_score'),

        # Loyalty vs price index (higher = more loyal, less price sensitive)
        (pl.col('loyalty_program_utilization') * 10 -
         pl.col('price_position_preference') / 10)
        .alias('loyalty_vs_price_index'),

        # Planning consistency score (inverse of lead time variance)
        (1 / (pl.col('max_booking_lead_days') - pl.col('min_booking_lead_days') + 1))
        .alias('planning_consistency_score'),

        # Luxury index (combination of cabin class and price tier)
        (pl.col('avg_cabin_class') * 20 +
         pl.col('price_position_preference') / 2)
        .alias('luxury_index')
    ]


def extract_customer_features(df: pl.DataFrame) -> pl.DataFrame:
    """
    Extract customer features for clustering analysis.
    Aggregates by profileId to create customer-level features.
    """
    # Check if already processed
    if df.height > 0 and 'total_searches' in df.columns:
        return df

    # Get cabin class columns
    cabin_class_cols = [col for col in df.columns if col.startswith('legs') and col.endswith('_cabinClass')]

    # Create lazy frame and group by profileId
    lazy_df = df.lazy().group_by('profileId')

    # Apply feature groups
    customer_features = lazy_df.agg([
        *create_customer_aggregation_features(),
        *create_booking_lead_time_features(),
        *create_travel_preference_features(),
        *create_cabin_class_features(cabin_class_cols),
        *create_temporal_preference_features(),
        *create_route_specific_features(),
        *create_price_sensitivity_features(),
        *create_service_preference_features()
    ])

    # Materialize to generate the basic features
    base_features = customer_features.collect()

    # Add the derived metrics that depend on the generated features
    enhanced_features = base_features.with_columns(create_derived_metrics())

    print(f"Generated {len(enhanced_features.columns)} customer features for {len(enhanced_features)} customers")
    return enhanced_features


In [None]:
def preprocess_selection_data(df: pl.DataFrame, selections_df=None) -> pl.DataFrame:
    """
    Preprocess the data to add selection information.
    If selections_df is not provided, uses price as a proxy for selection.
    """
    if selections_df is not None and 'selected' in selections_df.columns:
        df_with_selections = df.with_columns(pl.Series('selected', selections_df['selected']))
    else:
        # Use price as a proxy - assume cheaper flights are more likely to be selected
        df_with_selections = df.with_columns([
            (pl.col('price_percentile') < 30).cast(pl.Int32).alias('selected')
        ])

    # Add search session ID
    df_with_search = df_with_selections.with_columns([
        pl.concat_str([
            pl.col('profileId').cast(pl.Str),
            pl.lit('_'),
            pl.col('ranker_id'),
            pl.lit('_'),
            pl.col('requestDate')
        ]).alias('search_session_id')
    ])

    return df_with_search


In [None]:
def generate_customer_features(flight_df: pl.DataFrame, selections_df=None) -> pl.DataFrame:
    """
    Generate comprehensive customer features for flight recommendation.

    Args:
        flight_df: Raw flight search data
        selections_df: Optional dataframe with 'selected' column indicating chosen flights

    Returns:
        Customer-level feature dataframe
    """
    print("Preprocessing flight data...")
    processed_df = preprocess_selection_data(flight_df, selections_df)

    print("Extracting customer features...")
    customer_features = extract_customer_features(processed_df)

    print("Finalizing features...")
    # Fill nulls and convert to appropriate types
    final_features = customer_features.with_columns([
        pl.all().fill_null(0)
    ])

    return final_features

In [38]:
cust_data = extract_customer_features(data)

In [39]:
print(f'cust_data shape: {cust_data.shape}')
cust_data.head(100)

cust_data shape: (32922, 23)


profileId,companyID,sex,nationality,frequentFlyer,isVip,bySelf,corporateTariffCode,ff_normalized,total_searches,roundtrip_preference,unique_routes_searched,min_booking_lead_days,max_booking_lead_days,avg_booking_lead_days,median_booking_lead_days,most_common_departure_airport,unique_departure_airports,most_common_carrier,unique_carriers_used,min_cabin_class,max_cabin_class,avg_cabin_class
i64,i64,bool,i64,str,bool,bool,i64,str,u32,f64,u32,i32,i32,f64,f64,str,u32,str,u32,f64,f64,f64
2877208,61061,true,36,"""SU""",false,true,161,"""SU""",110,1.0,2,6,17,7.054545,7.0,"""NBC""",1,"""SU""",1,1.0,2.0,1.436364
2430627,25515,false,36,,false,true,166,"""""",33,1.0,1,14,14,14.0,14.0,"""ARH""",1,"""SU""",1,1.0,1.0,1.0
995557,44071,true,36,"""SU/S7/N4""",false,true,113,"""SU/S7/N4""",60,1.0,1,28,28,28.0,28.0,"""KJA""",1,"""SU""",3,1.0,1.0,1.0
3392490,42620,false,36,,false,true,108,"""""",63,1.0,1,10,11,10.746032,11.0,"""KUF""",1,"""SU""",1,1.0,1.0,1.0
3580062,56912,false,36,,false,true,75,"""""",117,1.0,1,1,2,1.726496,2.0,"""SVO""",2,"""SU""",3,1.0,1.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1422625,36948,false,36,,false,true,153,"""""",105,1.0,1,6,20,13.314286,12.0,"""SVO""",1,"""DP""",2,1.0,2.0,1.057143
3421718,60482,true,47,,false,true,,"""""",5,1.0,1,18,18,18.0,18.0,"""KJA""",1,"""SU""",2,1.0,1.0,1.0
2924922,42702,true,36,,false,true,108,"""""",1116,1.0,1,5,6,5.491935,5.0,"""VKO""",3,"""SU""",5,1.0,2.0,1.241935
2101869,54163,false,36,,false,true,101,"""""",487,0.889117,4,0,21,3.753593,3.0,"""SVO""",7,"""SU""",8,1.0,2.0,1.36653


In [8]:
cust_data.columns

['profileId',
 'companyID',
 'sex',
 'nationality',
 'frequentFlyer',
 'isVip',
 'bySelf',
 'corporateTarrifCode',
 'ff_normalized',
 'total_searches',
 'roundtrip_preference',
 'unique_routes_searched',
 'min_booking_lead_days',
 'max_booking_lead_days',
 'avg_booking_lead_days',
 'median_booking_lead_days',
 'most_common_departure_airport',
 'unique_departure_airports',
 'most_common_carrier',
 'unique_carriers_used',
 'min_cabin_class',
 'max_cabin_class',
 'avg_cabin_class']

### Feature Summary

In [None]:
def display_customer_feature_summary(customer_df: pl.DataFrame) -> None:
    """Display summary statistics for customer features."""
    print("="*60)
    print("CUSTOMER FEATURE SUMMARY")
    print("="*60)

    # Basic info
    print(f"Total unique customers: {customer_df.height:,}")
    print(f"Total features: {len(customer_df.columns)}")

    # Categorical features summary
    categorical_features = ['sex', 'nationality', 'frequentFlyer', 'corporateTarrifCode',
                           'most_common_departure_airport', 'most_common_carrier']

    print("\nCATEGORICAL FEATURES:")
    for feature in categorical_features:
        if feature in customer_df.columns:
            value_counts = customer_df[feature].value_counts().head(5)
            print(f"\n{feature}:")
            for row in value_counts.iter_rows():
                print(f"  {row[0]}: {row[1]:,}")

    # Numerical features summary
    numerical_features = ['total_searches', 'avg_booking_lead_days', 'unique_routes_searched',
                         'unique_departure_airports', 'unique_carriers_used',
                         'min_cabin_class', 'max_cabin_class', 'avg_cabin_class']

    print(f"\nNUMERICAL FEATURES:")
    for feature in numerical_features:
        if feature in customer_df.columns:
            stats = customer_df[feature].describe()
            print(f"\n{feature}:")
            for row in stats.iter_rows():
                print(f"  {row[0]}: {row[1]:.2f}" if row[1] is not None else f"  {row[0]}: None")

    # Boolean features summary
    boolean_features = ['isVip', 'bySelf']
    print(f"\nBOOLEAN FEATURES:")
    for feature in boolean_features:
        if feature in customer_df.columns:
            true_pct = customer_df[feature].mean() * 100 if customer_df[feature].mean() is not None else 0
            print(f"{feature}: {true_pct:.1f}% True")

    print(f"\nRoundtrip preference: {customer_df['roundtrip_preference'].mean() * 100:.1f}% of searches are roundtrip")

In [None]:
display_customer_feature_summary(cust_data)

In [40]:
cust_data = convert_columns_to_snake_case(cust_data)
cust_data.head(20)

profile_id,company_id,sex,nationality,frequent_flyer,is_vip,by_self,corporate_tariff_code,ff_normalized,total_searches,roundtrip_preference,unique_routes_searched,min_booking_lead_days,max_booking_lead_days,avg_booking_lead_days,median_booking_lead_days,most_common_departure_airport,unique_departure_airports,most_common_carrier,unique_carriers_used,min_cabin_class,max_cabin_class,avg_cabin_class
i64,i64,bool,i64,str,bool,bool,i64,str,u32,f64,u32,i32,i32,f64,f64,str,u32,str,u32,f64,f64,f64
2877208,61061,true,36,"""SU""",false,true,161,"""SU""",110,1.0,2,6,17,7.054545,7.0,"""NBC""",1,"""SU""",1,1.0,2.0,1.436364
2430627,25515,false,36,,false,true,166,"""""",33,1.0,1,14,14,14.0,14.0,"""ARH""",1,"""SU""",1,1.0,1.0,1.0
995557,44071,true,36,"""SU/S7/N4""",false,true,113,"""SU/S7/N4""",60,1.0,1,28,28,28.0,28.0,"""KJA""",1,"""SU""",3,1.0,1.0,1.0
3392490,42620,false,36,,false,true,108,"""""",63,1.0,1,10,11,10.746032,11.0,"""KUF""",1,"""SU""",1,1.0,1.0,1.0
3580062,56912,false,36,,false,true,75,"""""",117,1.0,1,1,2,1.726496,2.0,"""SVO""",2,"""SU""",3,1.0,1.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2089156,27937,true,36,"""SU/S7/TK""",false,true,24,"""SU/S7/TK""",1699,0.676869,5,1,22,13.703943,11.0,"""LED""",4,"""TK""",14,1.0,2.0,1.222091
1248945,43507,true,36,,false,true,112,"""""",376,0.68617,3,7,16,13.162234,15.0,"""MRV""",3,"""S7""",9,1.0,2.0,1.117465
3226670,44729,true,36,,false,true,161,"""""",42,0.0,2,5,13,11.547619,12.5,"""TJM""",4,"""SU""",8,1.0,1.0,1.0
3570892,53359,true,36,,false,true,,"""""",4,1.0,1,6,6,6.0,6.0,"""ODO""",1,"""IO""",1,1.0,1.0,1.0


### Encode data

In [None]:
# Encode categorical features
drop_cust_cols = ['profile_id']
cust_clustering_df, cust_encoders = prepare_features_for_clustering(cust_data, drop_cust_cols)

# Convert to Pandas DataFrame for StandardScaler
cust_clustering_pd_df = cust_clustering_df.to_pandas().fillna(0)

### Scale Data

In [None]:
# Scale the features for use with KMeans
scaler = StandardScaler()
cust_features_scaled = scaler.fit_transform(cust_clustering_pd_df)

del cust_clustering_pd_df
gc.collect()

In [None]:
cust_clustering_df.head(100)

### Run Cluster Modeling

In [None]:
# Use MiniBatchKMeans to cluster the data (more efficient for very large datasets)
kmeans = MiniBatchKMeans(n_clusters=30, batch_size=50000, random_state=42)
cust_clusters = kmeans.fit_predict(cust_features_scaled)

In [None]:
cust_clusters[:100]

### Analyze Clustering Results

In [None]:
def analyze_cluster_characteristics(customer_df: pl.DataFrame, cluster_col: str = 'cluster') -> Dict:
    """
    Analyze characteristics of each cluster in customer features dataframe.

    Args:
        customer_df: DataFrame with customer features and cluster assignments
        cluster_col: Name of the column containing cluster labels

    Returns:
        Dictionary with detailed cluster analysis
    """
    if cluster_col not in customer_df.columns:
        raise ValueError(f"Cluster column '{cluster_col}' not found in dataframe")

    clusters = sorted(customer_df[cluster_col].unique().to_list())
    analysis = {}

    print(f"Analyzing {len(clusters)} clusters...")

    for cluster_id in clusters:
        cluster_data = customer_df.filter(pl.col(cluster_col) == cluster_id)
        cluster_size = len(cluster_data)
        cluster_pct = (cluster_size / len(customer_df)) * 100

        print(f"Processing Cluster {cluster_id}: {cluster_size:,} customers ({cluster_pct:.1f}%)")

        analysis[f'cluster_{cluster_id}'] = {
            'size': cluster_size,
            'percentage': cluster_pct,
            'demographics': _analyze_demographics(cluster_data),
            'search_behavior': _analyze_search_behavior(cluster_data),
            'booking_patterns': _analyze_booking_patterns(cluster_data),
            'travel_preferences': _analyze_travel_preferences(cluster_data),
            'service_preferences': _analyze_service_preferences(cluster_data)
        }

    return analysis


def _analyze_demographics(cluster_data: pl.DataFrame) -> Dict:
    """Analyze demographic characteristics of a cluster."""
    demographics = {}

    # Gender distribution
    if 'sex' in cluster_data.columns:
        sex_dist = cluster_data['sex'].value_counts().to_dict()
        demographics['gender_distribution'] = sex_dist
        demographics['dominant_gender'] = max(sex_dist, key=sex_dist.get) if sex_dist else None

    # Nationality patterns
    if 'nationality' in cluster_data.columns:
        nationality_counts = cluster_data['nationality'].value_counts().head(5)
        demographics['top_nationalities'] = dict(zip(
            nationality_counts['nationality'].to_list(),
            nationality_counts['count'].to_list()
        ))
        total_customers = len(cluster_data)
        top_nationality_pct = (nationality_counts['count'][0] / total_customers * 100) if len(nationality_counts) > 0 else 0
        demographics['nationality_concentration'] = top_nationality_pct

    # VIP status
    if 'is_vip' in cluster_data.columns:
        vip_pct = cluster_data['is_vip'].mean() * 100 if cluster_data['is_vip'].mean() is not None else 0
        demographics['vip_percentage'] = vip_pct
        demographics['customer_tier'] = 'Premium' if vip_pct > 50 else 'Standard'

    # Travel companion preference
    if 'by_self' in cluster_data.columns:
        solo_pct = cluster_data['by_self'].mean() * 100 if cluster_data['by_self'].mean() is not None else 0
        demographics['solo_traveler_percentage'] = solo_pct
        demographics['travel_style'] = 'Solo-oriented' if solo_pct > 60 else 'Group-oriented' if solo_pct < 40 else 'Mixed'

    # Corporate vs leisure
    if 'corporate_tariff_code' in cluster_data.columns:
        corporate_pct = (cluster_data['corporate_tariff_code'].is_not_null().sum() / len(cluster_data)) * 100
        demographics['corporate_percentage'] = corporate_pct
        demographics['customer_type'] = 'Corporate' if corporate_pct > 70 else 'Leisure' if corporate_pct < 30 else 'Mixed'

    return demographics


def _analyze_search_behavior(cluster_data: pl.DataFrame) -> Dict:
    """Analyze search behavior patterns of a cluster."""
    behavior = {}

    # Search frequency
    if 'total_searches' in cluster_data.columns:
        search_stats = cluster_data['total_searches'].describe()
        behavior['avg_searches'] = search_stats.filter(pl.col('statistic') == 'mean')['value'][0]
        behavior['median_searches'] = search_stats.filter(pl.col('statistic') == '50%')['value'][0]
        behavior['max_searches'] = search_stats.filter(pl.col('statistic') == 'max')['value'][0]

        # Categorize search intensity
        avg_searches = behavior['avg_searches']
        if avg_searches < 2:
            behavior['search_intensity'] = 'Infrequent'
        elif avg_searches < 5:
            behavior['search_intensity'] = 'Moderate'
        elif avg_searches < 10:
            behavior['search_intensity'] = 'Active'
        else:
            behavior['search_intensity'] = 'Power User'

    # Route diversity
    if 'unique_routes_searched' in cluster_data.columns:
        route_stats = cluster_data['unique_routes_searched'].describe()
        behavior['avg_unique_routes'] = route_stats.filter(pl.col('statistic') == 'mean')['value'][0]
        behavior['route_exploration'] = 'Explorer' if behavior['avg_unique_routes'] > 3 else 'Routine' if behavior['avg_unique_routes'] < 1.5 else 'Moderate'

    # Trip type preference
    if 'roundtrip_preference' in cluster_data.columns:
        roundtrip_pct = cluster_data['roundtrip_preference'].mean() * 100
        behavior['roundtrip_percentage'] = roundtrip_pct
        behavior['trip_type_preference'] = 'Roundtrip-focused' if roundtrip_pct > 70 else 'One-way-focused' if roundtrip_pct < 30 else 'Mixed'

    return behavior


def _analyze_booking_patterns(cluster_data: pl.DataFrame) -> Dict:
    """Analyze booking lead time patterns of a cluster."""
    patterns = {}

    lead_time_cols = ['min_booking_lead_days', 'max_booking_lead_days', 'avg_booking_lead_days', 'median_booking_lead_days']

    for col in lead_time_cols:
        if col in cluster_data.columns:
            col_stats = cluster_data[col].describe()
            patterns[col] = col_stats.filter(pl.col('statistic') == 'mean')['value'][0]

    # Categorize booking behavior
    if 'avg_booking_lead_days' in patterns:
        avg_lead = patterns['avg_booking_lead_days']
        if avg_lead < 7:
            patterns['booking_style'] = 'Last-minute'
        elif avg_lead < 30:
            patterns['booking_style'] = 'Short-term planner'
        elif avg_lead < 90:
            patterns['booking_style'] = 'Advance planner'
        else:
            patterns['booking_style'] = 'Long-term planner'

    # Booking consistency
    if 'min_booking_lead_days' in patterns and 'max_booking_lead_days' in patterns:
        lead_range = patterns['max_booking_lead_days'] - patterns['min_booking_lead_days']
        patterns['booking_consistency'] = 'Consistent' if lead_range < 30 else 'Variable' if lead_range < 90 else 'Highly Variable'

    return patterns


def _analyze_travel_preferences(cluster_data: pl.DataFrame) -> Dict:
    """Analyze travel preferences including airports and carriers."""
    preferences = {}

    # Airport preferences
    if 'most_common_departure_airport' in cluster_data.columns:
        airport_counts = cluster_data['most_common_departure_airport'].value_counts().head(10)
        preferences['top_departure_airports'] = dict(zip(
            airport_counts['most_common_departure_airport'].to_list(),
            airport_counts['count'].to_list()
        ))

    # Airport loyalty
    if 'unique_departure_airports' in cluster_data.columns:
        avg_airports = cluster_data['unique_departure_airports'].mean()
        preferences['avg_airports_used'] = avg_airports
        preferences['airport_loyalty'] = 'High' if avg_airports < 2 else 'Moderate' if avg_airports < 4 else 'Low'

    # Carrier preferences
    if 'most_common_carrier' in cluster_data.columns:
        carrier_counts = cluster_data['most_common_carrier'].value_counts().head(10)
        preferences['top_carriers'] = dict(zip(
            carrier_counts['most_common_carrier'].to_list(),
            carrier_counts['count'].to_list()
        ))

    # Carrier loyalty
    if 'unique_carriers_used' in cluster_data.columns:
        avg_carriers = cluster_data['unique_carriers_used'].mean()
        preferences['avg_carriers_used'] = avg_carriers
        preferences['carrier_loyalty'] = 'High' if avg_carriers < 2 else 'Moderate' if avg_carriers < 4 else 'Low'

    # Frequent flyer programs
    if 'ff_normalized' in cluster_data.columns:
        ff_counts = cluster_data['ff_normalized'].value_counts().head(5)
        preferences['frequent_flyer_programs'] = dict(zip(
            ff_counts['ff_normalized'].to_list(),
            ff_counts['count'].to_list()
        ))
        ff_participation = (cluster_data['ff_normalized'].is_not_null().sum() / len(cluster_data)) * 100
        preferences['ff_participation_rate'] = ff_participation

    return preferences


def _analyze_service_preferences(cluster_data: pl.DataFrame) -> Dict:
    """Analyze service level preferences (cabin class)."""
    service = {}

    cabin_cols = ['min_cabin_class', 'max_cabin_class', 'avg_cabin_class']

    for col in cabin_cols:
        if col in cluster_data.columns:
            col_stats = cluster_data[col].describe()
            service[col] = col_stats.filter(pl.col('statistic') == 'mean')['value'][0]

    # Service level categorization
    if 'avg_cabin_class' in service:
        avg_cabin = service['avg_cabin_class']
        if avg_cabin < 1.5:
            service['service_preference'] = 'Economy-focused'
        elif avg_cabin < 2.5:
            service['service_preference'] = 'Premium Economy preferred'
        elif avg_cabin < 3.5:
            service['service_preference'] = 'Business class oriented'
        else:
            service['service_preference'] = 'First class oriented'

    # Service consistency
    if 'min_cabin_class' in service and 'max_cabin_class' in service:
        cabin_range = service['max_cabin_class'] - service['min_cabin_class']
        service['service_consistency'] = 'Consistent' if cabin_range < 1 else 'Flexible' if cabin_range < 2 else 'Highly Variable'

    return service


def display_cluster_interpretation(cluster_analysis: Dict) -> None:
    """Display comprehensive cluster interpretation."""
    print("="*80)
    print("CUSTOMER CLUSTER INTERPRETATION")
    print("="*80)

    total_clusters = len(cluster_analysis)

    for cluster_name, analysis in cluster_analysis.items():
        cluster_id = cluster_name.split('_')[1]

        print(f"\n{'='*60}")
        print(f"CLUSTER {cluster_id}")
        print(f"{'='*60}")
        print(f"Size: {analysis['size']:,} customers ({analysis['percentage']:.1f}% of total)")

        # Demographics
        demo = analysis['demographics']
        print(f"\n📊 DEMOGRAPHICS:")
        if 'dominant_gender' in demo:
            print(f"  • Gender: {demo['dominant_gender']} dominant")
        if 'nationality_concentration' in demo:
            print(f"  • Nationality: {demo['nationality_concentration']:.1f}% concentration in top nationality")
        if 'customer_tier' in demo:
            print(f"  • Customer Tier: {demo['customer_tier']} ({demo.get('vip_percentage', 0):.1f}% VIP)")
        if 'travel_style' in demo:
            print(f"  • Travel Style: {demo['travel_style']} ({demo.get('solo_traveler_percentage', 0):.1f}% solo)")
        if 'customer_type' in demo:
            print(f"  • Customer Type: {demo['customer_type']} ({demo.get('corporate_percentage', 0):.1f}% corporate)")

        # Search Behavior
        behavior = analysis['search_behavior']
        print(f"\n🔍 SEARCH BEHAVIOR:")
        if 'search_intensity' in behavior:
            print(f"  • Search Intensity: {behavior['search_intensity']} ({behavior.get('avg_searches', 0):.1f} avg searches)")
        if 'route_exploration' in behavior:
            print(f"  • Route Exploration: {behavior['route_exploration']} ({behavior.get('avg_unique_routes', 0):.1f} avg routes)")
        if 'trip_type_preference' in behavior:
            print(f"  • Trip Type: {behavior['trip_type_preference']} ({behavior.get('roundtrip_percentage', 0):.1f}% roundtrip)")

        # Booking Patterns
        booking = analysis['booking_patterns']
        print(f"\n📅 BOOKING PATTERNS:")
        if 'booking_style' in booking:
            print(f"  • Booking Style: {booking['booking_style']} ({booking.get('avg_booking_lead_days', 0):.0f} days avg lead time)")
        if 'booking_consistency' in booking:
            print(f"  • Booking Consistency: {booking['booking_consistency']}")

        # Travel Preferences
        travel = analysis['travel_preferences']
        print(f"\n✈️ TRAVEL PREFERENCES:")
        if 'airport_loyalty' in travel:
            print(f"  • Airport Loyalty: {travel['airport_loyalty']} ({travel.get('avg_airports_used', 0):.1f} airports used)")
        if 'carrier_loyalty' in travel:
            print(f"  • Carrier Loyalty: {travel['carrier_loyalty']} ({travel.get('avg_carriers_used', 0):.1f} carriers used)")
        if 'ff_participation_rate' in travel:
            print(f"  • Frequent Flyer Participation: {travel['ff_participation_rate']:.1f}%")

        # Top preferences
        if 'top_departure_airports' in travel and travel['top_departure_airports']:
            top_airport = list(travel['top_departure_airports'].items())[0]
            print(f"  • Top Departure Airport: {top_airport[0]} ({top_airport[1]} customers)")
        if 'top_carriers' in travel and travel['top_carriers']:
            top_carrier = list(travel['top_carriers'].items())[0]
            print(f"  • Top Carrier: {top_carrier[0]} ({top_carrier[1]} customers)")

        # Service Preferences
        service = analysis['service_preferences']
        print(f"\n🛂 SERVICE PREFERENCES:")
        if 'service_preference' in service:
            print(f"  • Service Level: {service['service_preference']} ({service.get('avg_cabin_class', 0):.2f} avg cabin class)")
        if 'service_consistency' in service:
            print(f"  • Service Consistency: {service['service_consistency']}")


def generate_cluster_personas(cluster_analysis: Dict) -> Dict:
    """Generate business personas for each cluster based on analysis."""
    personas = {}

    for cluster_name, analysis in cluster_analysis.items():
        cluster_id = cluster_name.split('_')[1]

        # Extract key characteristics
        demo = analysis['demographics']
        behavior = analysis['search_behavior']
        booking = analysis['booking_patterns']
        travel = analysis['travel_preferences']
        service = analysis['service_preferences']

        # Generate persona name and description
        persona_elements = []

        # Add customer tier
        if demo.get('customer_tier') == 'Premium':
            persona_elements.append('Premium')

        # Add booking style
        booking_style = booking.get('booking_style', '')
        if 'Last-minute' in booking_style:
            persona_elements.append('Spontaneous')
        elif 'Long-term' in booking_style:
            persona_elements.append('Strategic')
        elif 'Advance' in booking_style:
            persona_elements.append('Planned')

        # Add search intensity
        search_intensity = behavior.get('search_intensity', '')
        if search_intensity == 'Power User':
            persona_elements.append('Power')
        elif search_intensity == 'Infrequent':
            persona_elements.append('Occasional')

        # Add travel type
        if demo.get('customer_type') == 'Corporate':
            persona_elements.append('Business')
        elif demo.get('travel_style') == 'Solo-oriented':
            persona_elements.append('Independent')

        # Add service preference
        service_pref = service.get('service_preference', '')
        if 'First class' in service_pref or 'Business class' in service_pref:
            persona_elements.append('Luxury')
        elif 'Economy' in service_pref:
            persona_elements.append('Value')

        # Create persona name
        persona_name = ' '.join(persona_elements[:3]) + ' Traveler'
        if not persona_elements:
            persona_name = f'Segment {cluster_id} Traveler'

        # Generate description
        description_parts = []

        if behavior.get('search_intensity'):
            description_parts.append(f"{behavior['search_intensity']} searchers")

        if booking.get('booking_style'):
            description_parts.append(f"who prefer {booking['booking_style'].lower()} booking")

        if service.get('service_preference'):
            description_parts.append(f"with {service['service_preference'].lower()}")

        if travel.get('carrier_loyalty'):
            description_parts.append(f"and {travel['carrier_loyalty'].lower()} carrier loyalty")

        description = ', '.join(description_parts[:4])

        personas[cluster_name] = {
            'persona_name': persona_name,
            'description': description,
            'size_percentage': analysis['percentage'],
            'key_characteristics': {
                'search_pattern': behavior.get('search_intensity', 'Unknown'),
                'booking_style': booking.get('booking_style', 'Unknown'),
                'service_preference': service.get('service_preference', 'Unknown'),
                'loyalty_level': travel.get('carrier_loyalty', 'Unknown'),
                'customer_tier': demo.get('customer_tier', 'Unknown')
            }
        }

    return personas


def display_cluster_personas(personas: Dict) -> None:
    """Display business personas for clusters."""
    print("\n" + "="*80)
    print("CUSTOMER PERSONAS")
    print("="*80)

    for cluster_name, persona in personas.items():
        cluster_id = cluster_name.split('_')[1]

        print(f"\n🎯 CLUSTER {cluster_id}: {persona['persona_name']}")
        print(f"   Size: {persona['size_percentage']:.1f}% of customer base")
        print(f"   Profile: {persona['description']}")

        chars = persona['key_characteristics']
        print(f"   • Search: {chars['search_pattern']}")
        print(f"   • Booking: {chars['booking_style']}")
        print(f"   • Service: {chars['service_preference']}")
        print(f"   • Loyalty: {chars['loyalty_level']}")
        print(f"   • Tier: {chars['customer_tier']}")


# Usage example:
# cluster_analysis = analyze_cluster_characteristics(customer_features_with_clusters_df)
# display_cluster_interpretation(cluster_analysis)
# personas = generate_cluster_personas(cluster_analysis)
# display_cluster_personas(personas)

In [None]:
# Step 1: Add cluster labels back to your original customer data
def add_clusters_to_customer_data(original_df: pl.DataFrame, cluster_labels: np.ndarray) -> pl.DataFrame:
    """Add cluster labels to the original customer features dataframe."""

    if len(cluster_labels) != len(original_df):
        raise ValueError(f"Cluster labels length ({len(cluster_labels)}) doesn't match dataframe length ({len(original_df)})")

    # Add cluster column
    clustered_df = original_df.with_columns([
        pl.Series(name='cluster', values=cluster_labels.astype(int))
    ])

    print(f"Added cluster labels to {len(clustered_df)} customers")
    print(f"Cluster distribution:")
    cluster_counts = clustered_df['cluster'].value_counts().sort('cluster')
    for row in cluster_counts.iter_rows():
        print(f"  Cluster {row[0]}: {row[1]:,} customers ({row[1]/len(clustered_df)*100:.1f}%)")

    return clustered_df


# Step 2: Quick cluster validation metrics
def evaluate_clustering_quality(scaled_features: np.ndarray, cluster_labels: np.ndarray) -> Dict:
    """Evaluate the quality of clustering using standard metrics."""
    from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

    metrics = {}

    # Calculate metrics
    metrics['silhouette_score'] = silhouette_score(scaled_features, cluster_labels)
    metrics['calinski_harabasz_score'] = calinski_harabasz_score(scaled_features, cluster_labels)
    metrics['davies_bouldin_score'] = davies_bouldin_score(scaled_features, cluster_labels)

    # Number of clusters
    metrics['n_clusters'] = len(np.unique(cluster_labels))

    # Cluster sizes
    unique, counts = np.unique(cluster_labels, return_counts=True)
    metrics['cluster_sizes'] = dict(zip(unique, counts))
    metrics['min_cluster_size'] = counts.min()
    metrics['max_cluster_size'] = counts.max()
    metrics['avg_cluster_size'] = counts.mean()

    return metrics


def display_clustering_metrics(metrics: Dict) -> None:
    """Display clustering quality metrics."""
    print("="*50)
    print("CLUSTERING QUALITY METRICS")
    print("="*50)
    print(f"Number of clusters: {metrics['n_clusters']}")
    print(f"Silhouette Score: {metrics['silhouette_score']:.4f} (higher is better, range: -1 to 1)")
    print(f"Calinski-Harabasz Score: {metrics['calinski_harabasz_score']:.2f} (higher is better)")
    print(f"Davies-Bouldin Score: {metrics['davies_bouldin_score']:.4f} (lower is better)")
    print(f"\nCluster Size Statistics:")
    print(f"  Min cluster size: {metrics['min_cluster_size']:,}")
    print(f"  Max cluster size: {metrics['max_cluster_size']:,}")
    print(f"  Avg cluster size: {metrics['avg_cluster_size']:.0f}")

    # Interpretation
    print(f"\nInterpretation:")
    sil_score = metrics['silhouette_score']
    if sil_score > 0.7:
        sil_quality = "Excellent"
    elif sil_score > 0.5:
        sil_quality = "Good"
    elif sil_score > 0.25:
        sil_quality = "Reasonable"
    else:
        sil_quality = "Poor"

    print(f"  Silhouette score indicates {sil_quality} cluster separation")

    if metrics['min_cluster_size'] < 100:
        print(f"  Warning: Some clusters are very small (min: {metrics['min_cluster_size']})")


# Step 3: Decode cluster characteristics for interpretation
def decode_categorical_features(clustered_df: pl.DataFrame, encoders: Dict) -> pl.DataFrame:
    """Decode categorical features back to original values for interpretation."""

    # Create reverse mapping for each encoder
    reverse_encoders = {}
    for feature, encoder_dict in encoders.items():
        reverse_encoders[f'{feature}_encoded'] = {v: k for k, v in encoder_dict.items()}

    # Decode categorical features
    decode_expressions = []
    for encoded_col, reverse_mapping in reverse_encoders.items():
        # Create mapping expression
        mapping_expr = pl.col(encoded_col).map_elements(
            lambda x: reverse_mapping.get(x, 'Unknown'),
            return_dtype=pl.Utf8
        ).alias(encoded_col.replace('_encoded', '_decoded'))

        decode_expressions.append(mapping_expr)

    # Add decoded columns
    if decode_expressions:
        result_df = clustered_df.with_columns(decode_expressions)
    else:
        result_df = clustered_df

    return result_df


# Step 4: Complete workflow function
def run_cluster_evaluation_workflow(original_customer_df: pl.DataFrame,
                                  scaled_features: np.ndarray,
                                  cluster_labels: np.ndarray,
                                  encoders: Dict) -> Tuple[pl.DataFrame, Dict, Dict]:
    """
    Complete workflow to evaluate and interpret clustering results.

    Args:
        original_customer_df: Original customer features dataframe
        scaled_features: Scaled features used for clustering
        cluster_labels: Cluster assignments from your clustering algorithm
        encoders: Encoding dictionary from prepare_features_for_clustering

    Returns:
        Tuple of (clustered_dataframe, quality_metrics, cluster_analysis)
    """

    print("Starting cluster evaluation workflow...")

    # Step 1: Add clusters to original data
    clustered_df = add_clusters_to_customer_data(original_customer_df, cluster_labels)

    # Step 2: Evaluate clustering quality
    print("\nEvaluating clustering quality...")
    quality_metrics = evaluate_clustering_quality(scaled_features, cluster_labels)
    display_clustering_metrics(quality_metrics)

    # Step 3: Decode categorical features for interpretation
    print("\nDecoding categorical features...")
    decoded_df = decode_categorical_features(clustered_df, encoders)

    # Step 4: Analyze cluster characteristics
    print("\nAnalyzing cluster characteristics...")
    cluster_analysis = analyze_cluster_characteristics(decoded_df, cluster_col='cluster')

    # Step 5: Display interpretation
    display_cluster_interpretation(cluster_analysis)

    # Step 6: Generate personas
    print("\nGenerating customer personas...")
    personas = generate_cluster_personas(cluster_analysis)
    display_cluster_personas(personas)

    return decoded_df, quality_metrics, cluster_analysis


### Add Cluster Labels to Encoded DataFrame

In [None]:
# Add cluster labels to your ENCODED clustering dataframe
clustered_encoded_df = add_clusters_to_customer_data(cust_clustering_df, cust_clusters)

### Eval Cluster Quality

In [None]:
# Evaluate clustering quality
quality_metrics = evaluate_clustering_quality(cust_features_scaled, cust_clusters)
display_clustering_metrics(quality_metrics)

# Decode categorical features for better interpretation
decoded_customer_df = decode_categorical_features(clustered_encoded_df, cust_encoders)

# Run the full cluster analysis
cluster_analysis = analyze_cluster_characteristics(decoded_customer_df, cluster_col='cluster')
display_cluster_interpretation(cluster_analysis)

# Generate business personas
personas = generate_cluster_personas(cluster_analysis)
display_cluster_personas(personas)

### Cluster Optimization

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

class MiniBatchKMeansOptimizer:
    """
    Optimizer specifically for MiniBatchKMeans on large flight datasets
    """

    def __init__(self, scaled_features, current_labels=None):
        """
        Initialize with your scaled features and current clustering results

        Parameters:
        scaled_features: numpy array or DataFrame of scaled customer features
        current_labels: existing cluster labels (optional)
        """
        self.scaled_features = scaled_features
        self.current_labels = current_labels
        self.n_samples, self.n_features = scaled_features.shape

        print(f"Initialized with {self.n_samples:,} samples and {self.n_features} features")

    def analyze_current_clustering(self):
        """Analyze your current clustering results"""
        if self.current_labels is None:
            print("No current clustering labels provided")
            return None

        print("\n" + "="*60)
        print("CURRENT CLUSTERING ANALYSIS")
        print("="*60)

        # Basic statistics
        unique_labels, counts = np.unique(self.current_labels, return_counts=True)
        n_clusters = len(unique_labels)

        # Calculate metrics (on a sample for efficiency)
        sample_size = min(100000, self.n_samples)
        sample_indices = np.random.choice(self.n_samples, sample_size, replace=False)
        sample_features = self.scaled_features[sample_indices]
        sample_labels = self.current_labels[sample_indices]

        sil_score = silhouette_score(sample_features, sample_labels)
        ch_score = calinski_harabasz_score(sample_features, sample_labels)
        db_score = davies_bouldin_score(sample_features, sample_labels)

        print(f"Metrics (calculated on {sample_size:,} sample):")
        print(f"  Silhouette Score: {sil_score:.4f}")
        print(f"  Calinski-Harabasz Score: {ch_score:.2f}")
        print(f"  Davies-Bouldin Score: {db_score:.4f}")

        # Cluster balance analysis
        cv = counts.std() / counts.mean()
        print(f"\nCluster Balance:")
        print(f"  Number of clusters: {n_clusters}")
        print(f"  Min cluster size: {counts.min():,}")
        print(f"  Max cluster size: {counts.max():,}")
        print(f"  Avg cluster size: {counts.mean():.0f}")
        print(f"  Cluster imbalance (CV): {cv:.3f}")

        # Identify problematic clusters
        large_threshold = np.percentile(counts, 90)
        small_threshold = np.percentile(counts, 10)

        large_clusters = unique_labels[counts > large_threshold]
        small_clusters = unique_labels[counts < small_threshold]

        print(f"\nProblematic Clusters:")
        print(f"  Large clusters (>{large_threshold:.0f}): {large_clusters}")
        print(f"  Small clusters (<{small_threshold:.0f}): {small_clusters}")

        return {
            'metrics': {'silhouette': sil_score, 'ch_score': ch_score, 'db_score': db_score},
            'cluster_sizes': counts,
            'large_clusters': large_clusters,
            'small_clusters': small_clusters,
            'cluster_balance': cv
        }

    def optimize_hyperparameters(self, n_clusters_range=None, batch_sizes=None, n_trials=5):
        """
        Optimize MiniBatchKMeans hyperparameters

        Parameters:
        n_clusters_range: list of cluster numbers to try
        batch_sizes: list of batch sizes to try
        n_trials: number of random initializations per configuration
        """
        if n_clusters_range is None:
            n_clusters_range = [20, 25, 30, 35, 40, 45, 50]

        if batch_sizes is None:
            batch_sizes = [10000, 25000, 50000, 100000]

        print(f"\n🔍 HYPERPARAMETER OPTIMIZATION")
        print(f"Testing {len(n_clusters_range)} cluster counts × {len(batch_sizes)} batch sizes")
        print("="*60)

        results = []
        best_score = -1
        best_config = None

        # Use sample for evaluation to speed up
        eval_sample_size = min(50000, self.n_samples)
        eval_indices = np.random.choice(self.n_samples, eval_sample_size, replace=False)
        eval_features = self.scaled_features[eval_indices]

        for n_clusters in n_clusters_range:
            for batch_size in batch_sizes:
                print(f"Testing n_clusters={n_clusters}, batch_size={batch_size:,}")

                trial_scores = []

                for trial in range(n_trials):
                    # Fit MiniBatchKMeans
                    mbk = MiniBatchKMeans(
                        n_clusters=n_clusters,
                        batch_size=min(batch_size, self.n_samples),
                        random_state=42 + trial,
                        n_init=1,
                        max_iter=100
                    )

                    # Fit on full data, evaluate on sample
                    mbk.fit(self.scaled_features)
                    eval_labels = mbk.predict(eval_features)

                    # Calculate silhouette score
                    sil_score = silhouette_score(eval_features, eval_labels)
                    trial_scores.append(sil_score)

                avg_score = np.mean(trial_scores)
                std_score = np.std(trial_scores)

                results.append({
                    'n_clusters': n_clusters,
                    'batch_size': batch_size,
                    'avg_silhouette': avg_score,
                    'std_silhouette': std_score,
                    'stability': 1 - (std_score / abs(avg_score)) if avg_score != 0 else 0
                })

                print(f"  Avg silhouette: {avg_score:.4f} ± {std_score:.4f}")

                if avg_score > best_score:
                    best_score = avg_score
                    best_config = {
                        'n_clusters': n_clusters,
                        'batch_size': batch_size,
                        'silhouette': avg_score
                    }

        print(f"\n🏆 BEST CONFIGURATION:")
        print(f"  Clusters: {best_config['n_clusters']}")
        print(f"  Batch size: {best_config['batch_size']:,}")
        print(f"  Silhouette score: {best_config['silhouette']:.4f}")

        return results, best_config

    def apply_optimized_clustering(self, n_clusters, batch_size=50000, n_init=20):
        """
        Apply optimized MiniBatchKMeans clustering

        Parameters:
        n_clusters: optimal number of clusters
        batch_size: optimal batch size
        n_init: number of initializations for better stability
        """
        print(f"\n🚀 APPLYING OPTIMIZED CLUSTERING")
        print(f"Parameters: {n_clusters} clusters, batch_size={batch_size:,}, n_init={n_init}")
        print("="*60)

        # Use multiple initializations for better results
        best_mbk = None
        best_inertia = float('inf')

        for init in range(n_init):
            mbk = MiniBatchKMeans(
                n_clusters=n_clusters,
                batch_size=min(batch_size, self.n_samples),
                random_state=42 + init,
                n_init=1,
                max_iter=300,
                tol=1e-6
            )

            mbk.fit(self.scaled_features)

            if mbk.inertia_ < best_inertia:
                best_inertia = mbk.inertia_
                best_mbk = mbk

            if (init + 1) % 5 == 0:
                print(f"  Completed {init + 1}/{n_init} initializations")

        # Get final labels
        optimized_labels = best_mbk.predict(self.scaled_features)

        # Calculate final metrics on sample
        sample_size = min(50000, self.n_samples)
        sample_indices = np.random.choice(self.n_samples, sample_size, replace=False)
        sample_features = self.scaled_features[sample_indices]
        sample_labels = optimized_labels[sample_indices]

        final_sil = silhouette_score(sample_features, sample_labels)
        final_ch = calinski_harabasz_score(sample_features, sample_labels)
        final_db = davies_bouldin_score(sample_features, sample_labels)

        print(f"\n📊 FINAL METRICS:")
        print(f"  Silhouette Score: {final_sil:.4f}")
        print(f"  Calinski-Harabasz: {final_ch:.2f}")
        print(f"  Davies-Bouldin: {final_db:.4f}")
        print(f"  Inertia: {best_inertia:.2f}")

        return optimized_labels, best_mbk

    def post_process_clusters(self, labels, min_cluster_size=500):
        """
        Post-process clusters to handle imbalances

        Parameters:
        labels: cluster labels
        min_cluster_size: minimum allowed cluster size
        """
        print(f"\n🔧 POST-PROCESSING CLUSTERS")
        print(f"Minimum cluster size: {min_cluster_size}")
        print("="*60)

        unique_labels, counts = np.unique(labels, return_counts=True)

        # Identify small clusters
        small_clusters = unique_labels[counts < min_cluster_size]

        if len(small_clusters) == 0:
            print("No small clusters found. No post-processing needed.")
            return labels

        print(f"Found {len(small_clusters)} small clusters: {small_clusters}")

        # Strategy: Merge small clusters with nearest larger clusters
        processed_labels = labels.copy()

        # Calculate cluster centers
        cluster_centers = {}
        for cluster_id in unique_labels:
            mask = labels == cluster_id
            cluster_centers[cluster_id] = np.mean(self.scaled_features[mask], axis=0)

        # For each small cluster, find the nearest large cluster
        for small_cluster in small_clusters:
            if counts[unique_labels == small_cluster][0] < min_cluster_size:
                small_center = cluster_centers[small_cluster]

                # Find nearest large cluster
                min_distance = float('inf')
                nearest_large_cluster = None

                for large_cluster in unique_labels:
                    if large_cluster != small_cluster and counts[unique_labels == large_cluster][0] >= min_cluster_size:
                        large_center = cluster_centers[large_cluster]
                        distance = np.linalg.norm(small_center - large_center)

                        if distance < min_distance:
                            min_distance = distance
                            nearest_large_cluster = large_cluster

                # Merge small cluster into nearest large cluster
                if nearest_large_cluster is not None:
                    processed_labels[labels == small_cluster] = nearest_large_cluster
                    print(f"  Merged cluster {small_cluster} -> {nearest_large_cluster}")

        # Report final cluster distribution
        final_unique, final_counts = np.unique(processed_labels, return_counts=True)
        print(f"\nFinal cluster distribution:")
        print(f"  Number of clusters: {len(final_unique)}")
        print(f"  Min cluster size: {final_counts.min():,}")
        print(f"  Max cluster size: {final_counts.max():,}")
        print(f"  Avg cluster size: {final_counts.mean():.0f}")

        return processed_labels

    def compare_clusterings(self, old_labels, new_labels, sample_size=50000):
        """
        Compare old vs new clustering results

        Parameters:
        old_labels: original cluster labels
        new_labels: new cluster labels
        sample_size: sample size for metric calculation
        """
        print(f"\n📈 CLUSTERING COMPARISON")
        print("="*60)

        # Sample for comparison
        sample_indices = np.random.choice(len(old_labels), min(sample_size, len(old_labels)), replace=False)
        sample_features = self.scaled_features[sample_indices]

        # Calculate metrics for both
        old_sample_labels = old_labels[sample_indices]
        new_sample_labels = new_labels[sample_indices]

        old_sil = silhouette_score(sample_features, old_sample_labels)
        new_sil = silhouette_score(sample_features, new_sample_labels)

        old_ch = calinski_harabasz_score(sample_features, old_sample_labels)
        new_ch = calinski_harabasz_score(sample_features, new_sample_labels)

        old_db = davies_bouldin_score(sample_features, old_sample_labels)
        new_db = davies_bouldin_score(sample_features, new_sample_labels)

        # Cluster balance
        old_unique, old_counts = np.unique(old_labels, return_counts=True)
        new_unique, new_counts = np.unique(new_labels, return_counts=True)

        old_cv = old_counts.std() / old_counts.mean()
        new_cv = new_counts.std() / new_counts.mean()

        print("Metric Comparison:")
        print(f"{'Metric':<20} {'Original':<12} {'Optimized':<12} {'Improvement':<12}")
        print("-" * 60)
        print(f"{'Silhouette':<20} {old_sil:<12.4f} {new_sil:<12.4f} {((new_sil-old_sil)/abs(old_sil)*100):+.1f}%")
        print(f"{'Calinski-Harabasz':<20} {old_ch:<12.2f} {new_ch:<12.2f} {((new_ch-old_ch)/old_ch*100):+.1f}%")
        print(f"{'Davies-Bouldin':<20} {old_db:<12.4f} {new_db:<12.4f} {((old_db-new_db)/old_db*100):+.1f}%")
        print(f"{'Cluster Balance':<20} {old_cv:<12.4f} {new_cv:<12.4f} {((old_cv-new_cv)/old_cv*100):+.1f}%")
        print(f"{'Num Clusters':<20} {len(old_unique):<12} {len(new_unique):<12} {len(new_unique)-len(old_unique):+}")

        return {
            'old_metrics': {'silhouette': old_sil, 'ch': old_ch, 'db': old_db, 'balance': old_cv},
            'new_metrics': {'silhouette': new_sil, 'ch': new_ch, 'db': new_db, 'balance': new_cv}
        }

def quick_fixes_for_current_clustering(scaled_features, current_labels):
    """
    Quick fixes you can apply to your current clustering without re-running everything
    """
    print("⚡ QUICK FIXES FOR CURRENT CLUSTERING")
    print("="*50)

    optimizer = MiniBatchKMeansOptimizer(scaled_features, current_labels)
    analysis = optimizer.analyze_current_clustering()

    if analysis is None:
        return

    # Apply post-processing
    processed_labels = optimizer.post_process_clusters(current_labels, min_cluster_size=500)

    # Compare results
    if not np.array_equal(current_labels, processed_labels):
        comparison = optimizer.compare_clusterings(current_labels, processed_labels)
        return processed_labels, comparison
    else:
        print("No changes needed in post-processing.")
        return current_labels, None

In [None]:
# Example usage with your existing setup:
# Your existing code:
# kmeans = MiniBatchKMeans(n_clusters=30, batch_size=50000, random_state=42)
# cust_clusters = kmeans.fit_predict(cust_features_scaled)

# Initialize optimizer
optimizer = MiniBatchKMeansOptimizer(cust_features_scaled, cust_clusters)

# Analyze current results
current_analysis = optimizer.analyze_current_clustering()

# Find better hyperparameters
optimization_results, best_config = optimizer.optimize_hyperparameters()

# Apply optimized clustering
new_labels, new_model = optimizer.apply_optimized_clustering(
    n_clusters=best_config['n_clusters'],
    batch_size=best_config['batch_size']
)

# Post-process to handle cluster imbalances
final_labels = optimizer.post_process_clusters(new_labels, min_cluster_size=500)

# Compare old vs new
comparison = optimizer.compare_clusterings(cust_clusters, final_labels)

# Quick fix for current clustering (alternative approach)
# quick_labels, quick_comparison = quick_fixes_for_current_clustering(cust_features_scaled, cust_clusters)

### Additional Optimization Efforts

In [9]:
cust_data.head()

profileId,companyID,sex,nationality,frequentFlyer,isVip,bySelf,corporateTarrifCode,ff_normalized,total_searches,roundtrip_preference,unique_routes_searched,min_booking_lead_days,max_booking_lead_days,avg_booking_lead_days,median_booking_lead_days,most_common_departure_airport,unique_departure_airports,most_common_carrier,unique_carriers_used,min_cabin_class,max_cabin_class,avg_cabin_class
i64,i64,bool,i64,str,bool,bool,i64,str,u32,f64,u32,i32,i32,f64,f64,str,u32,str,u32,f64,f64,f64
3539699,62795,True,36,,False,True,139.0,"""""",11,0.0,1,5,39,32.818182,39.0,"""NER""",1,"""S7""",3,1.0,2.0,1.272727
3518450,60537,True,36,,False,True,,"""""",86,1.0,1,4,5,4.209302,4.0,"""PEE""",1,"""DP""",4,1.0,1.0,1.0
3483976,55341,True,36,,False,True,175.0,"""""",20,1.0,1,3,3,3.0,3.0,"""SVO""",1,"""SU""",1,1.0,2.0,1.6
653973,42620,False,36,"""SU""",True,True,108.0,"""SU""",1819,1.0,1,5,6,5.410665,5.0,"""SVO""",3,"""SU""",5,1.0,1.0,1.0
1421964,36948,True,36,"""SU/S7""",False,True,153.0,"""SU/S7""",244,0.0,1,18,19,18.217213,18.0,"""SVO""",3,"""SU""",10,1.0,4.0,1.622951


In [10]:
cust_data = convert_columns_to_snake_case(cust_data)

profile_id,company_id,sex,nationality,frequent_flyer,is_vip,by_self,corporate_tarrif_code,ff_normalized,total_searches,roundtrip_preference,unique_routes_searched,min_booking_lead_days,max_booking_lead_days,avg_booking_lead_days,median_booking_lead_days,most_common_departure_airport,unique_departure_airports,most_common_carrier,unique_carriers_used,min_cabin_class,max_cabin_class,avg_cabin_class
i64,i64,bool,i64,str,bool,bool,i64,str,u32,f64,u32,i32,i32,f64,f64,str,u32,str,u32,f64,f64,f64
3539699,62795,true,36,,false,true,139,"""""",11,0.0,1,5,39,32.818182,39.0,"""NER""",1,"""S7""",3,1.0,2.0,1.272727
3518450,60537,true,36,,false,true,,"""""",86,1.0,1,4,5,4.209302,4.0,"""PEE""",1,"""DP""",4,1.0,1.0,1.0
3483976,55341,true,36,,false,true,175,"""""",20,1.0,1,3,3,3.0,3.0,"""SVO""",1,"""SU""",1,1.0,2.0,1.6
653973,42620,false,36,"""SU""",true,true,108,"""SU""",1819,1.0,1,5,6,5.410665,5.0,"""SVO""",3,"""SU""",5,1.0,1.0,1.0
1421964,36948,true,36,"""SU/S7""",false,true,153,"""SU/S7""",244,0.0,1,18,19,18.217213,18.0,"""SVO""",3,"""SU""",10,1.0,4.0,1.622951
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3094797,42622,true,36,,false,true,108,"""""",237,0.0,2,0,2,1.274262,1.0,"""SVX""",5,"""U6""",7,1.0,2.0,1.092827
1244613,24728,true,36,"""SU/UT""",false,true,161,"""SU/UT""",110,0.309091,6,2,9,3.781818,3.0,"""VKO""",5,"""SU""",10,1.0,1.0,1.0
1076167,43648,false,36,"""SU""",false,true,,"""SU""",1005,1.0,1,18,19,18.39005,18.0,"""SVO""",3,"""SU""",6,1.0,4.0,2.449254
3531306,60628,true,36,,false,true,,"""""",12,0.0,1,6,7,6.166667,6.0,"""NSK""",1,"""S7""",2,1.0,1.0,1.0


In [13]:
print(f'cust_data shape {cust_data.shape}')
cust_data.head(20)

cust_data shape (32922, 22)


profile_id,company_id,sex,nationality,is_vip,by_self,corporate_tarrif_code,ff_normalized,total_searches,roundtrip_preference,unique_routes_searched,min_booking_lead_days,max_booking_lead_days,avg_booking_lead_days,median_booking_lead_days,most_common_departure_airport,unique_departure_airports,most_common_carrier,unique_carriers_used,min_cabin_class,max_cabin_class,avg_cabin_class
i64,i64,bool,i64,bool,bool,i64,str,u32,f64,u32,i32,i32,f64,f64,str,u32,str,u32,f64,f64,f64
3539699,62795,true,36,false,true,139,"""""",11,0.0,1,5,39,32.818182,39.0,"""NER""",1,"""S7""",3,1.0,2.0,1.272727
3518450,60537,true,36,false,true,,"""""",86,1.0,1,4,5,4.209302,4.0,"""PEE""",1,"""DP""",4,1.0,1.0,1.0
3483976,55341,true,36,false,true,175,"""""",20,1.0,1,3,3,3.0,3.0,"""SVO""",1,"""SU""",1,1.0,2.0,1.6
653973,42620,false,36,true,true,108,"""SU""",1819,1.0,1,5,6,5.410665,5.0,"""SVO""",3,"""SU""",5,1.0,1.0,1.0
1421964,36948,true,36,false,true,153,"""SU/S7""",244,0.0,1,18,19,18.217213,18.0,"""SVO""",3,"""SU""",10,1.0,4.0,1.622951
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3094797,42622,true,36,false,true,108,"""""",237,0.0,2,0,2,1.274262,1.0,"""SVX""",5,"""U6""",7,1.0,2.0,1.092827
1244613,24728,true,36,false,true,161,"""SU/UT""",110,0.309091,6,2,9,3.781818,3.0,"""VKO""",5,"""SU""",10,1.0,1.0,1.0
1076167,43648,false,36,false,true,,"""SU""",1005,1.0,1,18,19,18.39005,18.0,"""SVO""",3,"""SU""",6,1.0,4.0,2.449254
3531306,60628,true,36,false,true,,"""""",12,0.0,1,6,7,6.166667,6.0,"""NSK""",1,"""S7""",2,1.0,1.0,1.0


### Another Approach

In [28]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.ensemble import IsolationForest
import umap
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')

class FlightCustomerClusteringAnalyzer:
    def __init__(self, df, use_existing_features=True):
        self.df = df
        self.use_existing_features = use_existing_features
        self.processed_df = None
        self.scaled_features = None
        self.cluster_labels = None

    def engineer_advanced_features(self):
        """Create more sophisticated features for better clustering"""

        if not self.use_existing_features:
            # Original feature engineering for raw data
            lazy_df = self.df.lazy()

            # 1. BEHAVIORAL PATTERNS
            behavioral_features = [
                # Search intensity patterns
                (pl.col('total_searches') / pl.col('unique_routes_searched')).alias('search_intensity_per_route'),

                # Planning behavior
                (pl.col('max_booking_lead_days') - pl.col('min_booking_lead_days')).alias('lead_time_variance'),
                (pl.col('avg_booking_lead_days') / pl.col('median_booking_lead_days')).alias('lead_time_skew'),

                # Loyalty patterns
                (pl.col('unique_carriers_used') / pl.col('total_searches')).alias('carrier_diversity'),
                (pl.col('unique_departure_airports') / pl.col('total_searches')).alias('airport_diversity'),

                # Service preferences
                (pl.col('max_cabin_class') - pl.col('min_cabin_class')).alias('cabin_class_range'),

                # VIP and corporate indicators combined
                (pl.col('is_vip').cast(pl.Int8) * 2 + pl.col('corporate_tariff_code').is_not_null().cast(pl.Int8)).alias('customer_tier'),
            ]

            processed_df = lazy_df.with_columns(behavioral_features).collect()

        else:
            # Work with your existing aggregated features
            print("Using existing customer features - adding enhanced behavioral patterns...")

            # Check which columns exist in your dataset
            available_cols = self.df.columns

            # Create advanced features based on what's available
            enhanced_features = []

            if all(col in available_cols for col in ['total_searches', 'unique_routes_searched']):
                enhanced_features.append(
                    (pl.col('total_searches') / pl.col('unique_routes_searched').clip(1)).alias('search_intensity_per_route')
                )

            if all(col in available_cols for col in ['max_booking_lead_days', 'min_booking_lead_days']):
                enhanced_features.append(
                    (pl.col('max_booking_lead_days') - pl.col('min_booking_lead_days')).alias('lead_time_variance')
                )

            if all(col in available_cols for col in ['avg_booking_lead_days', 'median_booking_lead_days']):
                enhanced_features.append(
                    (pl.col('avg_booking_lead_days') / pl.col('median_booking_lead_days').clip(1)).alias('lead_time_skew')
                )

            if all(col in available_cols for col in ['unique_carriers_used', 'total_searches']):
                enhanced_features.append(
                    (pl.col('unique_carriers_used') / pl.col('total_searches').clip(1)).alias('carrier_diversity')
                )

            if all(col in available_cols for col in ['unique_departure_airports', 'total_searches']):
                enhanced_features.append(
                    (pl.col('unique_departure_airports') / pl.col('total_searches').clip(1)).alias('airport_diversity')
                )

            if all(col in available_cols for col in ['max_cabin_class', 'min_cabin_class']):
                enhanced_features.append(
                    (pl.col('max_cabin_class') - pl.col('min_cabin_class')).alias('cabin_class_range')
                )

            if 'is_vip' in available_cols and 'corporate_tariff_code' in available_cols:
                enhanced_features.append(
                    (pl.col('is_vip').cast(pl.Int8) * 2 + pl.col('corporate_tariff_code').is_not_null().cast(pl.Int8)).alias('customer_tier')
                )

            # Apply enhancements
            if enhanced_features:
                processed_df = self.df.with_columns(enhanced_features)
            else:
                processed_df = self.df.clone()
                print("Warning: Could not create enhanced features - using original dataset")

        return processed_df

    def create_interaction_features(self, df):
        """Create interaction features between key variables"""

        interaction_features = []

        # VIP interactions
        vip_interactions = [
            'search_intensity_per_route * is_vip',
            'carrier_diversity * is_vip',
            'avg_cabin_class * is_vip'
        ]

        # Corporate interactions
        corp_interactions = [
            'total_searches * (corporate_tariff_code.is_not_null())',
            'roundtrip_preference * (corporate_tariff_code.is_not_null())',
            'lead_time_variance * (corporate_tariff_code.is_not_null())'
        ]

        # Convert to pandas for easier interaction creation
        df_pd = df.to_pandas()

        # Create VIP interactions
        df_pd['vip_search_intensity'] = df_pd['search_intensity_per_route'] * df_pd['is_vip']
        df_pd['vip_carrier_diversity'] = df_pd['carrier_diversity'] * df_pd['is_vip']
        df_pd['vip_cabin_preference'] = df_pd['avg_cabin_class'] * df_pd['is_vip']

        # Create corporate interactions
        has_corp = df_pd['corporate_tariff_code'].notna().astype(int)
        df_pd['corp_search_volume'] = df_pd['total_searches'] * has_corp
        df_pd['corp_roundtrip_pref'] = df_pd['roundtrip_preference'] * has_corp
        df_pd['corp_planning_variance'] = df_pd['lead_time_variance'] * has_corp

        return pl.from_pandas(df_pd)

    def remove_outliers(self, df, method='isolation_forest', contamination=0.05):
        """Remove outliers using various methods"""

        # Get numeric columns only
        numeric_cols = [col for col, dtype in df.schema.items() if dtype.is_numeric()]
        df_numeric = df.select(numeric_cols)

        if method == 'isolation_forest':
            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            outlier_labels = iso_forest.fit_predict(df_numeric.to_pandas())
            mask = outlier_labels == 1

        elif method == 'zscore':
            # Remove rows where any feature has |z-score| > 3
            df_pd = df_numeric.to_pandas()
            z_scores = np.abs(zscore(df_pd, nan_policy='omit'))
            mask = (z_scores < 3).all(axis=1)

        elif method == 'iqr':
            # Remove rows outside 1.5*IQR for any feature
            df_pd = df_numeric.to_pandas()
            Q1 = df_pd.quantile(0.25)
            Q3 = df_pd.quantile(0.75)
            IQR = Q3 - Q1
            mask = ~((df_pd < (Q1 - 1.5 * IQR)) | (df_pd > (Q3 + 1.5 * IQR))).any(axis=1)

        cleaned_df = df.filter(pl.Series('mask', mask))
        print(f"Removed {len(df) - len(cleaned_df):,} outliers ({(len(df) - len(cleaned_df))/len(df)*100:.1f}%)")

        return cleaned_df

    def advanced_feature_encoding(self, df):
        """More sophisticated encoding for categorical features"""

        # Get categorical columns
        categorical_cols = [col for col, dtype in df.schema.items() if not dtype.is_numeric() and col not in ['id', 'ranker_id', 'request_date']]

        encoded_df = df.clone()
        encoders = {}

        for col in categorical_cols:
            if col == 'ff_normalized':
                # Special handling for frequent flyer programs (list feature)
                # Create binary features for most common programs
                all_programs = []
                for programs in df[col].to_list():
                    if isinstance(programs, list):
                        all_programs.extend(programs)

                from collections import Counter
                top_programs = Counter(all_programs).most_common(10)

                for program, _ in top_programs:
                    encoded_df = encoded_df.with_columns([
                        pl.col(col).map_elements(
                            lambda x: 1 if isinstance(x, list) and program in x else 0,
                            return_dtype=pl.Int8
                        ).alias(f'ff_{program}')
                    ])

                # Add count of total programs
                encoded_df = encoded_df.with_columns([
                    pl.col(col).map_elements(
                        lambda x: len(x) if isinstance(x, list) else 0,
                        return_dtype=pl.Int8
                    ).alias('ff_program_count')
                ])

            else:
                # Target encoding for high-cardinality categoricals like airports/carriers
                if col in ['most_common_departure_airport', 'most_common_carrier']:
                    # Use frequency encoding
                    value_counts = df[col].value_counts()
                    freq_map = {row[0]: row[1] for row in value_counts.rows()}

                    encoded_df = encoded_df.with_columns([
                        pl.col(col).replace(freq_map, default=0).alias(f'{col}_frequency')
                    ])
                else:
                    # Standard label encoding for low-cardinality features
                    unique_values = df[col].fill_null('MISSING').unique().sort().to_list()
                    encoders[col] = {val: idx for idx, val in enumerate(unique_values)}

                    encoded_df = encoded_df.with_columns([
                        pl.col(col).fill_null('MISSING').replace(encoders[col]).alias(f'{col}_encoded')

                    ])

        # Remove original categorical columns
        final_df = encoded_df.select(pl.exclude(categorical_cols + ['id', 'ranker_id', 'request_date']))

        return final_df.fill_null(0), encoders

    def dimensionality_reduction(self, scaled_features, method='pca', n_components=50):
        """Apply dimensionality reduction before clustering"""

        if method == 'pca':
            reducer = PCA(n_components=n_components, random_state=42)

        elif method == 'truncated_svd':
            reducer = TruncatedSVD(n_components=n_components, random_state=42)

        elif method == 'umap':
            reducer = umap.UMAP(n_components=n_components, random_state=42, n_neighbors=15)

        reduced_features = reducer.fit_transform(scaled_features)

        if hasattr(reducer, 'explained_variance_ratio_'):
            total_variance = reducer.explained_variance_ratio_.sum()
            print(f"{method.upper()} retained {total_variance:.3f} of total variance with {n_components} components")

        return reduced_features, reducer

    def alternative_clustering_methods(self, features):
        """Try different clustering algorithms"""

        results = {}

        # 1. Gaussian Mixture Models
        print("Testing Gaussian Mixture Models...")
        best_gmm_score = -1
        best_gmm_n = 0

        for n_clusters in [15, 20, 25, 30, 35, 40]:
            gmm = GaussianMixture(n_components=n_clusters, random_state=42, covariance_type='full')
            labels = gmm.fit_predict(features)

            if len(set(labels)) > 1:  # Ensure we have multiple clusters
                score = silhouette_score(features, labels)
                if score > best_gmm_score:
                    best_gmm_score = score
                    best_gmm_n = n_clusters

        # Fit best GMM
        best_gmm = GaussianMixture(n_components=best_gmm_n, random_state=42, covariance_type='full')
        gmm_labels = best_gmm.fit_predict(features)
        results['gmm'] = {
            'labels': gmm_labels,
            'silhouette': silhouette_score(features, gmm_labels),
            'n_clusters': len(set(gmm_labels))
        }

        # 2. DBSCAN
        print("Testing DBSCAN...")
        # Test different eps values
        best_dbscan_score = -1
        best_dbscan_eps = 0

        for eps in [0.3, 0.5, 0.7, 1.0, 1.5]:
            dbscan = DBSCAN(eps=eps, min_samples=50)
            labels = dbscan.fit_predict(features)

            if len(set(labels)) > 1 and -1 not in labels:  # Ensure valid clustering
                score = silhouette_score(features, labels)
                if score > best_dbscan_score:
                    best_dbscan_score = score
                    best_dbscan_eps = eps

        if best_dbscan_eps > 0:
            best_dbscan = DBSCAN(eps=best_dbscan_eps, min_samples=50)
            dbscan_labels = best_dbscan.fit_predict(features)
            results['dbscan'] = {
                'labels': dbscan_labels,
                'silhouette': silhouette_score(features, dbscan_labels),
                'n_clusters': len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
            }

        # 3. Agglomerative Clustering
        print("Testing Agglomerative Clustering...")
        best_agg_score = -1
        best_agg_n = 0

        for n_clusters in [10, 15, 20, 25, 30, 35]:
            agg = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = agg.fit_predict(features)

            score = silhouette_score(features, labels)
            if score > best_agg_score:
                best_agg_score = score
                best_agg_n = n_clusters

        best_agg = AgglomerativeClustering(n_clusters=best_agg_n, linkage='ward')
        agg_labels = best_agg.fit_predict(features)
        results['agglomerative'] = {
            'labels': agg_labels,
            'silhouette': silhouette_score(features, agg_labels),
            'n_clusters': len(set(agg_labels))
        }

        return results

    def comprehensive_clustering_pipeline(self):
        """Complete pipeline with all improvements"""

        print("🚀 Starting comprehensive clustering analysis...")

        # 1. Feature Engineering
        print("\n1️⃣ Advanced feature engineering...")
        enhanced_df = self.engineer_advanced_features()
        enhanced_df = self.create_interaction_features(enhanced_df)

        # 2. Remove outliers
        print("\n2️⃣ Removing outliers...")
        cleaned_df = self.remove_outliers(enhanced_df, method='isolation_forest')

        # 3. Advanced encoding
        print("\n3️⃣ Advanced feature encoding...")
        encoded_df, encoders = self.advanced_feature_encoding(cleaned_df)

        # 4. Feature scaling with robust scaler
        print("\n4️⃣ Feature scaling...")
        scaler = RobustScaler()  # Less sensitive to outliers than StandardScaler
        scaled_features = scaler.fit_transform(encoded_df.to_pandas())

        # 5. Dimensionality reduction
        print("\n5️⃣ Dimensionality reduction...")
        reduced_features, reducer = self.dimensionality_reduction(scaled_features, method='pca', n_components=30)

        # 6. Try alternative clustering methods
        print("\n6️⃣ Testing clustering algorithms...")
        clustering_results = self.alternative_clustering_methods(reduced_features)

        # 7. Compare results
        print("\n🏆 CLUSTERING RESULTS COMPARISON:")
        print("=" * 60)

        for method, results in clustering_results.items():
            print(f"{method.upper():15s} | Silhouette: {results['silhouette']:.4f} | Clusters: {results['n_clusters']:2d}")

        # Select best method
        best_method = max(clustering_results.items(), key=lambda x: x[1]['silhouette'])
        print(f"\n🥇 Best method: {best_method[0].upper()} (Silhouette: {best_method[1]['silhouette']:.4f})")

        # Store results
        self.processed_df = encoded_df
        self.scaled_features = reduced_features
        self.cluster_labels = best_method[1]['labels']
        self.encoders = encoders
        self.scaler = scaler
        self.reducer = reducer

        return {
            'cleaned_df': cleaned_df,
            'processed_df': encoded_df,
            'scaled_features': reduced_features,
            'cluster_labels': best_method[1]['labels'],
            'best_method': best_method[0],
            'best_silhouette': best_method[1]['silhouette'],
            'encoders': encoders,
            'scaler': scaler,
            'reducer': reducer
        }

    def analyze_clusters(self, original_df=None):
        """Analyze the final clusters"""
        if self.cluster_labels is None:
            print("No clustering results found. Run comprehensive_clustering_pipeline first.")
            return

        # Add cluster labels to original dataframe for analysis
        if original_df is not None:
            # Check if sizes match
            if len(self.cluster_labels) != len(original_df):
                print(f"Warning: Cluster labels size ({len(self.cluster_labels)}) doesn't match DataFrame size ({len(original_df)})")
                print("This likely happened because outliers were removed during clustering.")

                # Create a mask of rows that were kept after outlier removal
                if hasattr(self, 'outlier_indices'):
                    # If we tracked which rows were removed
                    mask = pl.Series(pl.UInt32, range(len(original_df))).is_in(self.outlier_indices)
                    analysis_df = original_df.filter(mask)
                else:
                    # If we didn't track indices, just use the first n rows
                    # This is not ideal but will avoid the error
                    print("Warning: Cannot map exact rows. Using first rows of original dataframe.")
                    analysis_df = original_df.head(len(self.cluster_labels))
            else:
                # Sizes match, proceed normally
                analysis_df = original_df

            # Now add cluster labels safely
            analysis_df = analysis_df.with_columns(pl.Series('cluster', self.cluster_labels))

            # Cluster profiling
            print("\n📊 CLUSTER PROFILES:")
            print("=" * 50)

            cluster_profiles = analysis_df.group_by('cluster').agg([
                pl.col('total_searches').mean().alias('avg_searches'),
                pl.col('is_vip').mean().alias('vip_rate'),
                pl.col('roundtrip_preference').mean().alias('roundtrip_rate'),
                pl.col('avg_booking_lead_days').mean().alias('avg_lead_days'),
                pl.col('unique_carriers_used').mean().alias('avg_carriers'),
                pl.count().alias('size')
            ]).sort('cluster')

            print(cluster_profiles)

        return self.cluster_labels




In [41]:
# USAGE OPTIONS:

# OPTION 1: Use your existing customer-level aggregated features (RECOMMENDED)

# Initialize analyzer for existing features
analyzer = FlightCustomerClusteringAnalyzer(cust_data.drop('profile_id'), use_existing_features=True)

# Run comprehensive analysis
results = analyzer.comprehensive_clustering_pipeline()

# Analyze clusters
cluster_labels = analyzer.analyze_clusters(results['cleaned_df'])


🚀 Starting comprehensive clustering analysis...

1️⃣ Advanced feature engineering...
Using existing customer features - adding enhanced behavioral patterns...

2️⃣ Removing outliers...
Removed 1,647 outliers (5.0%)

3️⃣ Advanced feature encoding...

4️⃣ Feature scaling...

5️⃣ Dimensionality reduction...
PCA retained 1.000 of total variance with 30 components

6️⃣ Testing clustering algorithms...
Testing Gaussian Mixture Models...
Testing DBSCAN...
Testing Agglomerative Clustering...

🏆 CLUSTERING RESULTS COMPARISON:
GMM             | Silhouette: -0.1461 | Clusters: 35
AGGLOMERATIVE   | Silhouette: 0.8143 | Clusters: 10

🥇 Best method: AGGLOMERATIVE (Silhouette: 0.8143)

📊 CLUSTER PROFILES:
shape: (10, 7)
┌─────────┬──────────────┬──────────┬────────────────┬───────────────┬──────────────┬───────┐
│ cluster ┆ avg_searches ┆ vip_rate ┆ roundtrip_rate ┆ avg_lead_days ┆ avg_carriers ┆ size  │
│ ---     ┆ ---          ┆ ---      ┆ ---            ┆ ---           ┆ ---          ┆ ---   │
│ i

In [21]:
# OPTION 2: Create fresh customer aggregation from raw flight data

# Create customer aggregation with richer behavioral features
def create_enhanced_customer_features(df_raw):
    # Advanced customer-level aggregation from raw search data
    customer_features = df_raw.group_by('profileId').agg([
        # Basic stats
        pl.count().alias('total_searches'),
        pl.col('ranker_id').n_unique().alias('unique_sessions'),

        # Route behavior
        pl.col('searchRoute').n_unique().alias('unique_routes'),
        pl.col('searchRoute').str.contains('/').mean().alias('roundtrip_preference'),
        pl.col('searchRoute').drop_nulls().n_unique().alias('unique_routes_searched'),

        # Pricing behavior
        pl.col('totalPrice').mean().alias('avg_total_price'),
        pl.col('totalPrice').median().alias('median_total_price'),
        pl.col('totalPrice').std().alias('price_std'),

        # Timing patterns
        pl.col('legs0_departureAt').str.to_datetime().dt.hour().mode().first().alias('preferred_departure_hour'),

        # Booking lead time (days between request and departure)
        ((pl.col('legs0_departureAt').str.to_datetime() - pl.col('requestDate').cast(pl.Datetime)) / pl.duration(days=1))
        .cast(pl.Int32).mean().alias('avg_booking_lead_days'),
        ((pl.col('legs0_departureAt').str.to_datetime() - pl.col('requestDate').cast(pl.Datetime)) / pl.duration(days=1))
        .cast(pl.Int32).median().alias('median_booking_lead_days'),
        ((pl.col('legs0_departureAt').str.to_datetime() - pl.col('requestDate').cast(pl.Datetime)) / pl.duration(days=1))
        .cast(pl.Int32).min().alias('min_booking_lead_days'),
        ((pl.col('legs0_departureAt').str.to_datetime() - pl.col('requestDate').cast(pl.Datetime)) / pl.duration(days=1))
        .cast(pl.Int32).max().alias('max_booking_lead_days'),

        # Airline preferences
        pl.col('legs0_segments0_marketingCarrier_code').mode().first().alias('most_common_carrier'),
        pl.col('legs0_segments0_marketingCarrier_code').n_unique().alias('unique_carriers_used'),

        # Airport preferences
        pl.col('legs0_segments0_departureFrom_airport_iata').mode().first().alias('most_common_departure'),
        pl.col('legs0_segments0_departureFrom_airport_iata').n_unique().alias('unique_departure_airports'),

        # Service class preferences
        pl.col('legs0_segments0_cabinClass').mean().alias('avg_cabin_class'),
        pl.col('legs0_segments0_cabinClass').min().alias('min_cabin_class'),
        pl.col('legs0_segments0_cabinClass').max().alias('max_cabin_class'),

        # Selection behavior (from training data)
        pl.col('selected').sum().alias('total_bookings'),
        pl.col('selected').mean().alias('booking_rate'),

        # User attributes (should be constant per user)
        pl.col('sex').first(),
        pl.col('nationality').first(),
        pl.col('isVip').first().alias('is_vip'),
        pl.col('bySelf').first().alias('self_type'),
        pl.col('frequentFlyer').drop_nulls().first().str.replace('- ЮТэйр ЗАО', 'UT').fill_null('').alias('ff_normalized'),
        pl.col('corporateTariffCode').first().alias('corporate_tariff_code'),
        pl.col('companyID').first().alias('company_id'),
    ])

    return customer_features.drop('profileId')


In [None]:
data = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/train.parquet')

In [22]:
# Create Customer Features
cust_features = create_enhanced_customer_features(data)

In [23]:
cust_features.head(20)

total_searches,unique_sessions,unique_routes,roundtrip_preference,unique_routes_searched,avg_total_price,median_total_price,price_std,preferred_departure_hour,avg_booking_lead_days,median_booking_lead_days,min_booking_lead_days,max_booking_lead_days,most_common_carrier,unique_carriers_used,most_common_departure,unique_departure_airports,avg_cabin_class,min_cabin_class,max_cabin_class,total_bookings,booking_rate,sex,nationality,is_vip,self_type,ff_normalized,corporate_tariff_code,company_id
u32,u32,u32,f64,u32,f64,f64,f64,i8,f64,f64,i32,i32,str,u32,str,u32,f64,f64,f64,i64,f64,bool,i64,bool,bool,str,i64,i64
76,1,1,0.0,1,28523.697368,19168.0,28463.330899,23,2.565789,3.0,2,3,"""SU""",6,"""KUF""",1,1.328947,1.0,2.0,1,0.013158,true,36,false,true,"""""",,42622
1740,2,2,1.0,2,46557.237356,28648.0,35101.445349,8,6.274138,7.0,4,8,"""SU""",7,"""SVO""",4,1.337931,1.0,2.0,2,0.001149,true,36,false,true,"""S7""",,54218
316,1,1,1.0,1,17435.933544,15323.0,5943.099361,18,11.297468,11.0,11,12,"""SU""",4,"""LED""",1,1.0,1.0,1.0,1,0.003165,false,36,false,true,"""SU/S7""",,42620
6079,4,4,0.971377,4,37307.698306,23034.0,30056.837301,8,11.717059,11.0,11,27,"""SU""",11,"""SVO""",4,1.405494,1.0,2.0,4,0.000658,true,36,false,true,"""SU/S7""",,43648
69,1,1,1.0,1,22435.869565,20148.0,6695.33547,5,13.782609,14.0,13,14,"""SU""",2,"""VOG""",1,1.0,1.0,1.0,1,0.014493,true,36,false,true,"""""",,40253
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
186,2,2,0.0,2,24369.075269,14801.0,17799.56644,10,21.860215,23.0,18,24,"""U6""",13,"""AER""",5,1.021505,1.0,2.0,2,0.010753,false,36,false,true,"""""",,54218
4102,19,11,0.004388,11,27593.840322,18662.0,24349.139436,20,10.326914,10.0,2,32,"""SU""",14,"""LED""",9,1.39883,1.0,4.0,19,0.004632,false,36,false,true,"""""",,63418
447,2,2,1.0,2,106717.610738,81378.0,63709.57609,20,25.836689,31.0,15,31,"""HU""",4,"""SVO""",1,1.131991,1.0,4.0,2,0.004474,false,36,false,true,"""""",,59766
19,1,1,1.0,1,14530.947368,12595.0,4469.00146,17,9.631579,10.0,9,10,"""SU""",2,"""NBC""",1,1.0,1.0,1.0,1,0.052632,true,36,false,true,"""UT/S7""",,61061


In [29]:
# Run clustering analysis
analyzer = FlightCustomerClusteringAnalyzer(cust_features, use_existing_features=True)
results = analyzer.comprehensive_clustering_pipeline()


🚀 Starting comprehensive clustering analysis...

1️⃣ Advanced feature engineering...
Using existing customer features - adding enhanced behavioral patterns...

2️⃣ Removing outliers...
Removed 1,647 outliers (5.0%)

3️⃣ Advanced feature encoding...

4️⃣ Feature scaling...

5️⃣ Dimensionality reduction...
PCA retained 1.000 of total variance with 30 components

6️⃣ Testing clustering algorithms...
Testing Gaussian Mixture Models...
Testing DBSCAN...
Testing Agglomerative Clustering...

🏆 CLUSTERING RESULTS COMPARISON:
GMM             | Silhouette: -0.0259 | Clusters: 20
AGGLOMERATIVE   | Silhouette: 0.5304 | Clusters: 10

🥇 Best method: AGGLOMERATIVE (Silhouette: 0.5304)


In [30]:
results['cleaned_df'].head(20)

total_searches,unique_sessions,unique_routes,roundtrip_preference,unique_routes_searched,avg_total_price,median_total_price,price_std,preferred_departure_hour,avg_booking_lead_days,median_booking_lead_days,min_booking_lead_days,max_booking_lead_days,most_common_carrier,unique_carriers_used,most_common_departure,unique_departure_airports,avg_cabin_class,min_cabin_class,max_cabin_class,total_bookings,booking_rate,sex,nationality,is_vip,self_type,ff_normalized,corporate_tariff_code,company_id,search_intensity_per_route,lead_time_variance,lead_time_skew,carrier_diversity,airport_diversity,cabin_class_range,customer_tier,vip_search_intensity,vip_carrier_diversity,vip_cabin_preference,corp_search_volume,corp_roundtrip_pref,corp_planning_variance
u32,u32,u32,f64,u32,f64,f64,f64,i8,f64,f64,i32,i32,str,u32,str,u32,f64,f64,f64,i64,f64,bool,i64,bool,bool,str,f64,i64,f64,i32,f64,f64,f64,f64,i8,f64,f64,f64,i64,f64,i64
76,1,1,0.0,1,28523.697368,19168.0,28463.330899,23,2.565789,3.0,2,3,"""SU""",6,"""KUF""",1,1.328947,1.0,2.0,1,0.013158,true,36,false,true,"""""",,42622,76.0,1,0.855263,0.078947,0.013158,1.0,0,0.0,0.0,0.0,0,0.0,0
1740,2,2,1.0,2,46557.237356,28648.0,35101.445349,8,6.274138,7.0,4,8,"""SU""",7,"""SVO""",4,1.337931,1.0,2.0,2,0.001149,true,36,false,true,"""S7""",,54218,870.0,4,0.896305,0.004023,0.002299,1.0,0,0.0,0.0,0.0,0,0.0,0
316,1,1,1.0,1,17435.933544,15323.0,5943.099361,18,11.297468,11.0,11,12,"""SU""",4,"""LED""",1,1.0,1.0,1.0,1,0.003165,false,36,false,true,"""SU/S7""",,42620,316.0,1,1.027043,0.012658,0.003165,0.0,0,0.0,0.0,0.0,0,0.0,0
6079,4,4,0.971377,4,37307.698306,23034.0,30056.837301,8,11.717059,11.0,11,27,"""SU""",11,"""SVO""",4,1.405494,1.0,2.0,4,0.000658,true,36,false,true,"""SU/S7""",,43648,1519.75,16,1.065187,0.00181,0.000658,1.0,0,0.0,0.0,0.0,0,0.0,0
69,1,1,1.0,1,22435.869565,20148.0,6695.33547,5,13.782609,14.0,13,14,"""SU""",2,"""VOG""",1,1.0,1.0,1.0,1,0.014493,true,36,false,true,"""""",,40253,69.0,1,0.984472,0.028986,0.014493,0.0,0,0.0,0.0,0.0,0,0.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
186,2,2,0.0,2,24369.075269,14801.0,17799.56644,10,21.860215,23.0,18,24,"""U6""",13,"""AER""",5,1.021505,1.0,2.0,2,0.010753,false,36,false,true,"""""",,54218,93.0,6,0.950444,0.069892,0.026882,1.0,0,0.0,0.0,0.0,0,0.0,0
447,2,2,1.0,2,106717.610738,81378.0,63709.57609,20,25.836689,31.0,15,31,"""HU""",4,"""SVO""",1,1.131991,1.0,4.0,2,0.004474,false,36,false,true,"""""",,59766,223.5,16,0.833442,0.008949,0.002237,3.0,0,0.0,0.0,0.0,0,0.0,0
19,1,1,1.0,1,14530.947368,12595.0,4469.00146,17,9.631579,10.0,9,10,"""SU""",2,"""NBC""",1,1.0,1.0,1.0,1,0.052632,true,36,false,true,"""UT/S7""",,61061,19.0,1,0.963158,0.105263,0.052632,0.0,0,0.0,0.0,0.0,0,0.0,0
135,8,2,0.0,2,47595.37037,37695.0,28414.224536,10,20.444444,11.0,11,37,"""SU""",3,"""GDX""",2,1.111111,1.0,2.0,8,0.059259,true,36,false,true,"""S7""",,60734,67.5,26,1.858586,0.022222,0.014815,1.0,0,0.0,0.0,0.0,0,0.0,0


In [31]:
# Analyze clusters
cluster_labels = analyzer.analyze_clusters(results['cleaned_df'])


📊 CLUSTER PROFILES:
shape: (10, 7)
┌─────────┬──────────────┬──────────┬────────────────┬───────────────┬──────────────┬───────┐
│ cluster ┆ avg_searches ┆ vip_rate ┆ roundtrip_rate ┆ avg_lead_days ┆ avg_carriers ┆ size  │
│ ---     ┆ ---          ┆ ---      ┆ ---            ┆ ---           ┆ ---          ┆ ---   │
│ i64     ┆ f64          ┆ f64      ┆ f64            ┆ f64           ┆ f64          ┆ u32   │
╞═════════╪══════════════╪══════════╪════════════════╪═══════════════╪══════════════╪═══════╡
│ 0       ┆ 442.77542    ┆ 0.002867 ┆ 0.571826       ┆ 14.718218     ┆ 5.135479     ┆ 27901 │
│ 1       ┆ 1435.933333  ┆ 1.0      ┆ 0.964778       ┆ 21.626827     ┆ 8.533333     ┆ 15    │
│ 2       ┆ 1373.199488  ┆ 0.0      ┆ 0.850782       ┆ 13.295817     ┆ 7.219096     ┆ 1173  │
│ 3       ┆ 205.142484   ┆ 0.0      ┆ 0.471143       ┆ 21.47051      ┆ 4.988518     ┆ 1916  │
│ 4       ┆ 1938.75      ┆ 1.0      ┆ 1.0            ┆ 9.135501      ┆ 6.375        ┆ 8     │
│ 5       ┆ 5332.0      

In [32]:
results['cleaned_df'].columns

['total_searches',
 'unique_sessions',
 'unique_routes',
 'roundtrip_preference',
 'unique_routes_searched',
 'avg_total_price',
 'median_total_price',
 'price_std',
 'preferred_departure_hour',
 'avg_booking_lead_days',
 'median_booking_lead_days',
 'min_booking_lead_days',
 'max_booking_lead_days',
 'most_common_carrier',
 'unique_carriers_used',
 'most_common_departure',
 'unique_departure_airports',
 'avg_cabin_class',
 'min_cabin_class',
 'max_cabin_class',
 'total_bookings',
 'booking_rate',
 'sex',
 'nationality',
 'is_vip',
 'self_type',
 'ff_normalized',
 'corporate_tariff_code',
 'company_id',
 'search_intensity_per_route',
 'lead_time_variance',
 'lead_time_skew',
 'carrier_diversity',
 'airport_diversity',
 'cabin_class_range',
 'customer_tier',
 'vip_search_intensity',
 'vip_carrier_diversity',
 'vip_cabin_preference',
 'corp_search_volume',
 'corp_roundtrip_pref',
 'corp_planning_variance']