In [None]:
def calculate_competitive_landscape(self, df, selections_df=None):
        """Score every route against the other routes leaving the same origin.

        The rows returned by ``self.extract_route_info(df)`` are grouped by
        their ``origin`` column; each row counts as one search.  Origins that
        serve fewer than two distinct routes are skipped because there is
        nothing to compare them with.

        Args:
            df: Raw search records, handed unchanged to ``extract_route_info``.
            selections_df: Not used; accepted so the signature matches the
                other ``calculate_*`` methods.

        Returns:
            DataFrame indexed by route with the columns ``origin``,
            ``market_share``, ``relative_popularity_score``,
            ``price_competitiveness``, ``searches``, ``competing_routes`` and
            ``avg_price``.  Empty when no origin has two or more routes.
        """
        routes_df = self.extract_route_info(df)

        popularity_scores = {}

        for origin, origin_routes in routes_df.groupby('origin'):
            # Count distinct routes, not search rows: a single route that was
            # searched many times has no competitors.
            competing_routes = origin_routes['route'].nunique() - 1
            if competing_routes < 1:
                continue

            # One row per route; each source row is one search
            route_metrics = origin_routes.groupby('route').agg({
                'route': 'count',
                'total_price': ['mean', 'std'],
                'passenger_count': 'sum'
            }).round(2)

            route_metrics.columns = ['searches', 'avg_price', 'price_std', 'total_passengers']

            # Share of this origin's searches (the shares sum to ~100)
            route_metrics['market_share'] = (
                route_metrics['searches'] / route_metrics['searches'].sum() * 100
            ).round(1)

            # Searches relative to the busiest route of this origin (busiest = 100)
            max_searches = route_metrics['searches'].max()
            route_metrics['relative_popularity'] = (
                route_metrics['searches'] / max_searches * 100
            ).round(1)

            # Price competitiveness: the cheapest route scores 100; the others
            # lose points in proportion to their premium over the cheapest,
            # measured against the most expensive average price.
            min_price = route_metrics['avg_price'].min()
            max_price = route_metrics['avg_price'].max()
            route_metrics['price_competitiveness'] = (
                (1 - (route_metrics['avg_price'] - min_price) / max_price) * 100
            ).round(1)

            for route in route_metrics.index:
                popularity_scores[route] = {
                    'origin': origin,
                    'market_share': route_metrics.loc[route, 'market_share'],
                    'relative_popularity_score': route_metrics.loc[route, 'relative_popularity'],
                    'price_competitiveness': route_metrics.loc[route, 'price_competitiveness'],
                    'searches': route_metrics.loc[route, 'searches'],
                    'competing_routes': competing_routes,
                    'avg_price': route_metrics.loc[route, 'avg_price']
                }

        # Outer keys are routes, so transpose to get one row per route
        return pd.DataFrame(popularity_scores).T

    def calculate_advance_booking_patterns(self, df, selections_df=None):
        """Summarise searches and bookings per route and advance-booking window.

        Advance days are ``departure_date - request_date`` in whole days and
        are bucketed into five labelled windows.  The outer windows are
        open-ended (anything up to 7 days is 'Same Week', anything above 60
        days is '2+ Months') so that every row with valid dates lands in a
        bucket.

        Args:
            df: Raw search records; must contain ``totalPrice`` when no
                selections are supplied.
            selections_df: Optional frame with a ``selected`` column aligned
                on ``df``'s index.  Without it, rows priced at or below the
                30th percentile of ``totalPrice`` are treated as selected.

        Returns:
            Tuple ``(advance_stats, route_optimal_advance)``:
            ``advance_stats`` is indexed by (route, advance_category) with the
            columns ``searches``, ``bookings``, ``conversion_rate``,
            ``avg_price`` and ``avg_advance_days``; only combinations that
            actually occur are listed.  ``route_optimal_advance`` maps each
            route to the window label with the highest conversion rate.
        """
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            # Synthetic stand-in for bookings: the cheapest 30% of offers
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Calculate advance booking days
        routes_df['advance_days'] = (routes_df['departure_date'] - routes_df['request_date']).dt.days

        # Open-ended outer edges keep very late/very early bookings in a bucket
        # instead of silently turning them into NaN categories.
        unbounded = float('inf')
        routes_df['advance_category'] = pd.cut(
            routes_df['advance_days'],
            bins=[-unbounded, 7, 14, 30, 60, unbounded],
            labels=['Same Week', '1-2 Weeks', '2-4 Weeks', '1-2 Months', '2+ Months']
        )

        # observed=True: skip (route, window) combinations with no rows, whose
        # NaN conversion rates would otherwise reach idxmax below.
        advance_stats = routes_df.groupby(['route', 'advance_category'], observed=True).agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean',
            'advance_days': 'mean'
        }).round(2)

        advance_stats.columns = [
            'searches',
            'bookings',
            'conversion_rate',
            'avg_price',
            'avg_advance_days'
        ]

        # Best-converting window per route; idxmax yields (route, category) keys
        conversion = advance_stats['conversion_rate'].dropna()
        route_best_advance = conversion.groupby(level='route').idxmax()
        route_optimal_advance = {route: key[1] for route, key in route_best_advance.items()}

        return advance_stats, route_optimal_advance

import pandas as pd

import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class RoutePopularityCalculator:
    def __init__(self):
        self.route_stats = {}
        self.seasonal_patterns = {}
        self.time_decay_factor = 0.95  # Recent data weighted more heavily

    def extract_route_info(self, df):
        """Extract route information from flight data"""
        routes_df = pd.DataFrame(index=df.index)

        # Basic route extraction
        routes_df['route'] = df['searchRoute']

        # Extract origin and destination from route string
        # Assuming format like "TLKKJA/KJATLK" means TLK->KJA outbound, KJA->TLK return
        route_parts = df['searchRoute'].str.split('/', expand=True)

        # Outbound route
        outbound = route_parts[0] if 0 in route_parts.columns else ''
        routes_df['origin'] = outbound.str[:3] if len(outbound.iloc[0]) >= 3 else outbound
        routes_df['destination'] = outbound.str[3:6] if len(outbound.iloc[0]) >= 6 else outbound.str[3:]

        # Return route (if exists)
        if 1 in route_parts.columns:
            return_route = route_parts[1]
            routes_df['return_origin'] = return_route.str[:3]
            routes_df['return_destination'] = return_route.str[3:6]
            routes_df['is_roundtrip'] = True
        else:
            routes_df['is_roundtrip'] = False

        # Additional route characteristics
        routes_df['request_date'] = pd.to_datetime(df['requestDate'])
        routes_df['departure_date'] = pd.to_datetime(df['legs0_departureAt'])
        routes_df['total_price'] = df['totalPrice']
        routes_df['passenger_count'] = df['pricingInfo_passengerCount']

        return routes_df

    def calculate_search_volume_popularity(self, df, time_window_days=90):
        """Calculate popularity based on search volume"""
        routes_df = self.extract_route_info(df)

        # Filter to recent searches
        cutoff_date = routes_df['request_date'].max() - timedelta(days=time_window_days)
        recent_searches = routes_df[routes_df['request_date'] >= cutoff_date]

        # Count searches per route
        search_volume = recent_searches.groupby('route').agg({
            'route': 'count',
            'passenger_count': 'sum',
            'request_date': ['min', 'max'],
            'total_price': ['mean', 'median']
        }).round(2)

        search_volume.columns = [
            'search_count',
            'total_passengers',
            'first_search',
            'last_search',
            'avg_price',
            'median_price'
        ]

        # Calculate popularity score
        max_searches = search_volume['search_count'].max()
        search_volume['search_popularity_score'] = (
            search_volume['search_count'] / max_searches * 100
        ).round(1)

        # Weight by passenger volume
        max_passengers = search_volume['total_passengers'].max()
        search_volume['passenger_popularity_score'] = (
            search_volume['total_passengers'] / max_passengers * 100
        ).round(1)

        # Combined popularity score
        search_volume['combined_popularity_score'] = (
            (search_volume['search_popularity_score'] * 0.6 +
             search_volume['passenger_popularity_score'] * 0.4)
        ).round(1)

        return search_volume.sort_values('combined_popularity_score', ascending=False)

    def calculate_booking_conversion_popularity(self, df, selections_df=None):
        """Calculate popularity based on booking conversion rates"""
        if selections_df is None:
            print("Note: Using synthetic selection data. Provide actual selections for better results.")
            # Create synthetic selections based on price (cheaper flights more likely to be selected)
            selections_df = df.copy()
            price_threshold = df['totalPrice'].quantile(0.3)  # Bottom 30% price range
            selections_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        routes_df = self.extract_route_info(df)
        routes_df['selected'] = selections_df['selected'] if 'selected' in selections_df.columns else 0

        # Calculate conversion metrics per route
        conversion_stats = routes_df.groupby('route').agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean',
            'passenger_count': 'sum'
        }).round(3)

        conversion_stats.columns = [
            'total_searches',
            'total_bookings',
            'conversion_rate',
            'avg_price',
            'total_passengers'
        ]

        # Filter routes with sufficient data
        min_searches = 10
        conversion_stats = conversion_stats[conversion_stats['total_searches'] >= min_searches]

        # Calculate booking popularity score
        max_bookings = conversion_stats['total_bookings'].max()
        conversion_stats['booking_volume_score'] = (
            conversion_stats['total_bookings'] / max_bookings * 100
        ).round(1)

        # Conversion rate score
        max_conversion = conversion_stats['conversion_rate'].max()
        conversion_stats['conversion_score'] = (
            conversion_stats['conversion_rate'] / max_conversion * 100
        ).round(1)

        # Combined booking popularity
        conversion_stats['booking_popularity_score'] = (
            (conversion_stats['booking_volume_score'] * 0.7 +
             conversion_stats['conversion_score'] * 0.3)
        ).round(1)

        return conversion_stats.sort_values('booking_popularity_score', ascending=False)

    def calculate_monthly_trends(self, df, selections_df=None):
        """Calculate monthly trend patterns (better for limited timeframe)"""
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            # Synthetic selection data
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Add time features
        routes_df['year_month'] = routes_df['departure_date'].dt.to_period('M')
        routes_df['month'] = routes_df['departure_date'].dt.month

        # Calculate monthly patterns for each route
        monthly_stats = routes_df.groupby(['route', 'year_month']).agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean'
        }).round(2)

        monthly_stats.columns = [
            'searches',
            'bookings',
            'conversion_rate',
            'avg_price'
        ]

        # Calculate trend (is route growing or declining?)
        trend_stats = monthly_stats.groupby('route').agg({
            'searches': ['count', 'mean', 'std'],
            'conversion_rate': 'mean'
        }).round(3)

        trend_stats.columns = [
            'months_active',
            'avg_monthly_searches',
            'search_volatility',
            'avg_conversion_rate'
        ]

        # Calculate growth trend using linear regression on monthly searches
        route_trends = {}
        for route in routes_df['route'].unique():
            route_monthly = monthly_stats.loc[route] if route in monthly_stats.index else None
            if route_monthly is not None and len(route_monthly) >= 2:
                # Simple trend calculation: (last_month - first_month) / months
                searches = route_monthly['searches'].values
                if len(searches) >= 2:
                    trend_slope = (searches[-1] - searches[0]) / (len(searches) - 1)
                    route_trends[route] = trend_slope
                else:
                    route_trends[route] = 0
            else:
                route_trends[route] = 0

        trend_stats['monthly_growth_trend'] = pd.Series(route_trends)

        return trend_stats

    def calculate_time_based_popularity(self, df, selections_df=None):
        """Calculate popularity with time decay (recent data weighted more)"""
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Calculate days since each search
        max_date = routes_df['request_date'].max()
        routes_df['days_ago'] = (max_date - routes_df['request_date']).dt.days

        # Apply time decay weight (recent searches weighted more heavily)
        routes_df['time_weight'] = self.time_decay_factor ** routes_df['days_ago']

        # Calculate weighted popularity metrics
        weighted_stats = routes_df.groupby('route').apply(
            lambda x: pd.Series({
                'weighted_searches': x['time_weight'].sum(),
                'weighted_bookings': (x['selected'] * x['time_weight']).sum(),
                'recent_searches': x[x['days_ago'] <= 30]['route'].count(),
                'recent_bookings': x[x['days_ago'] <= 30]['selected'].sum(),
                'avg_price': x['total_price'].mean(),
                'total_searches': len(x)
            })
        ).round(2)

        # Calculate popularity scores
        max_weighted = weighted_stats['weighted_searches'].max()
        weighted_stats['time_weighted_popularity'] = (
            weighted_stats['weighted_searches'] / max_weighted * 100
        ).round(1)

        # Recent activity score
        max_recent = weighted_stats['recent_searches'].max()
        if max_recent > 0:
            weighted_stats['recent_activity_score'] = (
                weighted_stats['recent_searches'] / max_recent * 100
            ).round(1)
        else:
            weighted_stats['recent_activity_score'] = 0

        return weighted_stats.sort_values('time_weighted_popularity', ascending=False)

    def calculate_day_of_week_patterns(self, df, selections_df=None):
        """Calculate day-of-week booking patterns (good for limited timeframe)"""
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Add day of week features
        routes_df['departure_dow'] = routes_df['departure_date'].dt.day_name()
        routes_df['search_dow'] = routes_df['request_date'].dt.day_name()

        # Calculate patterns by departure day of week
        dow_stats = routes_df.groupby(['route', 'departure_dow']).agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean'
        }).round(2)

        dow_stats.columns = ['searches', 'bookings', 'conversion_rate', 'avg_price']

        # Find most popular departure days for each route
        route_best_days = dow_stats.groupby('route')['searches'].idxmax()
        route_best_days = {route: day[1] for route, day in route_best_days.items()}  # Extract day from tuple

        return dow_stats, route_best_days

    def generate_popularity_features(self, df, selections_df=None):
        """Build one row of route-popularity features per input record.

        Runs every ``calculate_*`` metric, then maps the per-route results
        back onto the individual rows of ``df`` through each row's route.
        Routes missing from a metric fall back to a neutral default (0, or 50
        for ``price_competitiveness``).

        Args:
            df: Raw search records.
            selections_df: Optional frame with a ``selected`` column, passed
                through to the metrics that use booking information.

        Returns:
            DataFrame on ``df``'s index with the columns
            ``search_popularity_score``, ``booking_popularity_score``,
            ``time_weighted_popularity_score``, ``monthly_growth_trend``,
            ``search_volatility``, ``market_share``,
            ``price_competitiveness``, ``competing_routes_count``,
            ``is_popular_departure_day``, ``is_optimal_advance_booking`` and
            ``composite_popularity_score``.
        """
        print("Calculating route popularity metrics...")

        # Calculate all popularity metrics (updated for limited timeframe)
        search_popularity = self.calculate_search_volume_popularity(df)
        booking_popularity = self.calculate_booking_conversion_popularity(df, selections_df)
        monthly_trends = self.calculate_monthly_trends(df, selections_df)
        time_weighted_popularity = self.calculate_time_based_popularity(df, selections_df)
        competitive_landscape = self.calculate_competitive_landscape(df, selections_df)
        _, best_days = self.calculate_day_of_week_patterns(df, selections_df)
        _, optimal_advance = self.calculate_advance_booking_patterns(df, selections_df)

        # Create feature dataframe
        routes_df = self.extract_route_info(df)
        features = pd.DataFrame(index=df.index)
        route_per_row = routes_df['route']

        # Map popularity scores back to individual records
        route_to_search_pop = search_popularity['combined_popularity_score'].to_dict()
        features['search_popularity_score'] = route_per_row.map(route_to_search_pop).fillna(0)

        if not booking_popularity.empty:
            route_to_booking_pop = booking_popularity['booking_popularity_score'].to_dict()
            features['booking_popularity_score'] = route_per_row.map(route_to_booking_pop).fillna(0)
        else:
            features['booking_popularity_score'] = 0

        route_to_time_pop = time_weighted_popularity['time_weighted_popularity'].to_dict()
        features['time_weighted_popularity_score'] = route_per_row.map(route_to_time_pop).fillna(0)

        # Monthly trend features
        route_to_growth = monthly_trends['monthly_growth_trend'].to_dict() if not monthly_trends.empty else {}
        features['monthly_growth_trend'] = route_per_row.map(route_to_growth).fillna(0)

        route_to_volatility = monthly_trends['search_volatility'].to_dict() if not monthly_trends.empty else {}
        features['search_volatility'] = route_per_row.map(route_to_volatility).fillna(0)

        # Competitive landscape features
        if not competitive_landscape.empty:
            route_to_market_share = competitive_landscape['market_share'].to_dict()
            features['market_share'] = route_per_row.map(route_to_market_share).fillna(0)

            route_to_price_comp = competitive_landscape['price_competitiveness'].to_dict()
            features['price_competitiveness'] = route_per_row.map(route_to_price_comp).fillna(50)

            route_to_comp_routes = competitive_landscape['competing_routes'].to_dict()
            features['competing_routes_count'] = route_per_row.map(route_to_comp_routes).fillna(0)
        else:
            features['market_share'] = 0
            features['price_competitiveness'] = 50
            features['competing_routes_count'] = 0

        # Day of week preference (is this a popular departure day for this route?)
        # Vectorized: compare each row's weekday with its route's best weekday.
        # Rows with an unknown route or missing date compare unequal -> 0.
        departure_day = routes_df['departure_date'].dt.day_name()
        best_day_per_row = route_per_row.map(best_days)
        features['is_popular_departure_day'] = (departure_day == best_day_per_row).astype(int)

        # Advance booking optimality: bucket each row's advance days with
        # open-ended outer windows (<= 7 days is 'Same Week', > 60 days is
        # '2+ Months') and compare with the route's optimal window.
        advance_days = (routes_df['departure_date'] - routes_df['request_date']).dt.days
        unbounded = float('inf')
        advance_category = pd.cut(
            advance_days,
            bins=[-unbounded, 7, 14, 30, 60, unbounded],
            labels=['Same Week', '1-2 Weeks', '2-4 Weeks', '1-2 Months', '2+ Months']
        ).astype(object)
        optimal_per_row = route_per_row.map(optimal_advance)
        features['is_optimal_advance_booking'] = (advance_category == optimal_per_row).astype(int)

        # Create composite popularity score (updated weights for limited timeframe data)
        features['composite_popularity_score'] = (
            features['search_popularity_score'] * 0.25 +
            features['booking_popularity_score'] * 0.25 +
            features['time_weighted_popularity_score'] * 0.20 +
            features['market_share'] * 0.15 +
            features['price_competitiveness'] * 0.10 +
            (features['monthly_growth_trend'] * 10).clip(-50, 50) * 0.05  # Normalize growth trend
        ).round(1)

        print(f"Generated popularity features for {len(features)} records")
        return features

# Example usage and analysis
def analyze_route_popularity(df, selections_df=None):
    """Print a route popularity report and return the per-record features.

    The third report section uses the calculator's seasonal breakdown when
    the class provides ``calculate_seasonal_popularity`` and falls back to
    ``calculate_monthly_trends`` otherwise, so the function works with both
    the limited-timeframe and the seasonality variant of the calculator.

    Args:
        df: Raw search records.
        selections_df: Optional frame with a ``selected`` column.

    Returns:
        The DataFrame produced by
        ``RoutePopularityCalculator.generate_popularity_features``.
    """
    calculator = RoutePopularityCalculator()

    print("=== ROUTE POPULARITY ANALYSIS ===\n")

    # Search volume popularity
    print("1. Most searched routes:")
    search_pop = calculator.calculate_search_volume_popularity(df)
    print(search_pop.head(10)[['search_count', 'combined_popularity_score']])

    # Booking conversion popularity
    print("\n2. Best converting routes:")
    booking_pop = calculator.calculate_booking_conversion_popularity(df, selections_df)
    if not booking_pop.empty:
        print(booking_pop.head(10)[['conversion_rate', 'booking_popularity_score']])

    # Seasonal patterns when available, monthly trends otherwise
    if hasattr(calculator, 'calculate_seasonal_popularity'):
        print("\n3. Seasonal popularity patterns (top 5 routes):")
        seasonal_pop = calculator.calculate_seasonal_popularity(df, selections_df)
        print(seasonal_pop.head())
    else:
        print("\n3. Monthly trend patterns (top 5 routes):")
        monthly_trends = calculator.calculate_monthly_trends(df, selections_df)
        print(monthly_trends.head())

    # Generate features for ML
    print("\n4. Generating ML features...")
    popularity_features = calculator.generate_popularity_features(df, selections_df)

    print("\nPopularity feature summary:")
    print(popularity_features.describe().round(2))

    return popularity_features

# Example usage
if __name__ == "__main__":
    # Running this file directly only prints a readiness message.
    # The commented lines below sketch how to run the analysis on real data;
    # the selections file is optional.
    # This would be called with your actual data
    # df = pd.read_csv('your_flight_data.csv')
    # selections_df = pd.read_csv('your_selections_data.csv')  # Optional
    # popularity_features = analyze_route_popularity(df, selections_df)
    print("Route popularity calculator ready. Use analyze_route_popularity(df) with your data.")

Variant with seasonality features, for use if the dataset spans a longer period

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class RoutePopularityCalculator:
    def __init__(self):
        self.route_stats = {}
        self.seasonal_patterns = {}
        self.time_decay_factor = 0.95  # Recent data weighted more heavily

    def extract_route_info(self, df):
        """Extract route information from flight data"""
        routes_df = pd.DataFrame(index=df.index)

        # Basic route extraction
        routes_df['route'] = df['searchRoute']

        # Extract origin and destination from route string
        # Assuming format like "TLKKJA/KJATLK" means TLK->KJA outbound, KJA->TLK return
        route_parts = df['searchRoute'].str.split('/', expand=True)

        # Outbound route
        outbound = route_parts[0] if 0 in route_parts.columns else ''
        routes_df['origin'] = outbound.str[:3] if len(outbound.iloc[0]) >= 3 else outbound
        routes_df['destination'] = outbound.str[3:6] if len(outbound.iloc[0]) >= 6 else outbound.str[3:]

        # Return route (if exists)
        if 1 in route_parts.columns:
            return_route = route_parts[1]
            routes_df['return_origin'] = return_route.str[:3]
            routes_df['return_destination'] = return_route.str[3:6]
            routes_df['is_roundtrip'] = True
        else:
            routes_df['is_roundtrip'] = False

        # Additional route characteristics
        routes_df['request_date'] = pd.to_datetime(df['requestDate'])
        routes_df['departure_date'] = pd.to_datetime(df['legs0_departureAt'])
        routes_df['total_price'] = df['totalPrice']
        routes_df['passenger_count'] = df['pricingInfo_passengerCount']

        return routes_df

    def calculate_search_volume_popularity(self, df, time_window_days=90):
        """Calculate popularity based on search volume"""
        routes_df = self.extract_route_info(df)

        # Filter to recent searches
        cutoff_date = routes_df['request_date'].max() - timedelta(days=time_window_days)
        recent_searches = routes_df[routes_df['request_date'] >= cutoff_date]

        # Count searches per route
        search_volume = recent_searches.groupby('route').agg({
            'route': 'count',
            'passenger_count': 'sum',
            'request_date': ['min', 'max'],
            'total_price': ['mean', 'median']
        }).round(2)

        search_volume.columns = [
            'search_count',
            'total_passengers',
            'first_search',
            'last_search',
            'avg_price',
            'median_price'
        ]

        # Calculate popularity score
        max_searches = search_volume['search_count'].max()
        search_volume['search_popularity_score'] = (
            search_volume['search_count'] / max_searches * 100
        ).round(1)

        # Weight by passenger volume
        max_passengers = search_volume['total_passengers'].max()
        search_volume['passenger_popularity_score'] = (
            search_volume['total_passengers'] / max_passengers * 100
        ).round(1)

        # Combined popularity score
        search_volume['combined_popularity_score'] = (
            (search_volume['search_popularity_score'] * 0.6 +
             search_volume['passenger_popularity_score'] * 0.4)
        ).round(1)

        return search_volume.sort_values('combined_popularity_score', ascending=False)

    def calculate_booking_conversion_popularity(self, df, selections_df=None):
        """Calculate popularity based on booking conversion rates"""
        if selections_df is None:
            print("Note: Using synthetic selection data. Provide actual selections for better results.")
            # Create synthetic selections based on price (cheaper flights more likely to be selected)
            selections_df = df.copy()
            price_threshold = df['totalPrice'].quantile(0.3)  # Bottom 30% price range
            selections_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        routes_df = self.extract_route_info(df)
        routes_df['selected'] = selections_df['selected'] if 'selected' in selections_df.columns else 0

        # Calculate conversion metrics per route
        conversion_stats = routes_df.groupby('route').agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean',
            'passenger_count': 'sum'
        }).round(3)

        conversion_stats.columns = [
            'total_searches',
            'total_bookings',
            'conversion_rate',
            'avg_price',
            'total_passengers'
        ]

        # Filter routes with sufficient data
        min_searches = 10
        conversion_stats = conversion_stats[conversion_stats['total_searches'] >= min_searches]

        # Calculate booking popularity score
        max_bookings = conversion_stats['total_bookings'].max()
        conversion_stats['booking_volume_score'] = (
            conversion_stats['total_bookings'] / max_bookings * 100
        ).round(1)

        # Conversion rate score
        max_conversion = conversion_stats['conversion_rate'].max()
        conversion_stats['conversion_score'] = (
            conversion_stats['conversion_rate'] / max_conversion * 100
        ).round(1)

        # Combined booking popularity
        conversion_stats['booking_popularity_score'] = (
            (conversion_stats['booking_volume_score'] * 0.7 +
             conversion_stats['conversion_score'] * 0.3)
        ).round(1)

        return conversion_stats.sort_values('booking_popularity_score', ascending=False)

    def calculate_seasonal_popularity(self, df, selections_df=None):
        """Calculate seasonal popularity patterns"""
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            # Synthetic selection data
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Add time features
        routes_df['month'] = routes_df['departure_date'].dt.month
        routes_df['quarter'] = routes_df['departure_date'].dt.quarter
        routes_df['season'] = routes_df['quarter'].map({
            1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'
        })

        # Calculate seasonal patterns
        seasonal_stats = routes_df.groupby(['route', 'season']).agg({
            'route': 'count',
            'selected': ['sum', 'mean'],
            'total_price': 'mean'
        }).round(2)

        seasonal_stats.columns = [
            'searches',
            'bookings',
            'conversion_rate',
            'avg_price'
        ]

        # Calculate seasonal popularity scores
        seasonal_pivot = seasonal_stats.reset_index().pivot(
            index='route', columns='season', values='searches'
        ).fillna(0)

        # Normalize by total searches per route
        seasonal_normalized = seasonal_pivot.div(seasonal_pivot.sum(axis=1), axis=0) * 100

        return seasonal_normalized.round(1)

    def calculate_time_based_popularity(self, df, selections_df=None):
        """Calculate popularity with time decay (recent data weighted more)"""
        routes_df = self.extract_route_info(df)

        if selections_df is not None and 'selected' in selections_df.columns:
            routes_df['selected'] = selections_df['selected']
        else:
            price_threshold = df['totalPrice'].quantile(0.3)
            routes_df['selected'] = (df['totalPrice'] <= price_threshold).astype(int)

        # Calculate days since each search
        max_date = routes_df['request_date'].max()
        routes_df['days_ago'] = (max_date - routes_df['request_date']).dt.days

        # Apply time decay weight (recent searches weighted more heavily)
        routes_df['time_weight'] = self.time_decay_factor ** routes_df['days_ago']

        # Calculate weighted popularity metrics
        weighted_stats = routes_df.groupby('route').apply(
            lambda x: pd.Series({
                'weighted_searches': x['time_weight'].sum(),
                'weighted_bookings': (x['selected'] * x['time_weight']).sum(),
                'recent_searches': x[x['days_ago'] <= 30]['route'].count(),
                'recent_bookings': x[x['days_ago'] <= 30]['selected'].sum(),
                'avg_price': x['total_price'].mean(),
                'total_searches': len(x)
            })
        ).round(2)

        # Calculate popularity scores
        max_weighted = weighted_stats['weighted_searches'].max()
        weighted_stats['time_weighted_popularity'] = (
            weighted_stats['weighted_searches'] / max_weighted * 100
        ).round(1)

        # Recent activity score
        max_recent = weighted_stats['recent_searches'].max()
        if max_recent > 0:
            weighted_stats['recent_activity_score'] = (
                weighted_stats['recent_searches'] / max_recent * 100
            ).round(1)
        else:
            weighted_stats['recent_activity_score'] = 0

        return weighted_stats.sort_values('time_weighted_popularity', ascending=False)

    def calculate_comparative_popularity(self, df, selections_df=None):
        """Calculate each route's popularity relative to routes sharing its origin.

        Records are grouped by ``origin``. For every origin that serves at
        least two *distinct* routes, a route's score is its search count
        expressed as a percentage of the most-searched route from that origin,
        rounded to one decimal place.

        Args:
            df: Search records; passed to ``self.extract_route_info``, whose
                result must provide ``origin`` and ``route`` columns.
            selections_df: Accepted for signature parity with the other
                popularity calculators; not used by this method.

        Returns:
            DataFrame indexed by route with columns ``origin``,
            ``relative_popularity_score``, ``searches`` and
            ``competing_routes`` (the number of *other* distinct routes from
            the same origin). Empty when no origin has two or more routes.
        """
        routes_df = self.extract_route_info(df)

        popularity_scores = {}

        for origin, origin_routes in routes_df.groupby('origin'):
            # One entry per distinct route, no matter how often it was searched.
            searches = origin_routes.groupby('route').size()

            # Counting distinct routes (not search rows) is what makes the
            # "at least 2 routes to compare" rule and `competing_routes` correct.
            distinct_routes = len(searches)
            if distinct_routes < 2:
                continue

            relative_popularity = (searches / searches.max() * 100).round(1)

            for route, search_count in searches.items():
                popularity_scores[route] = {
                    'origin': origin,
                    'relative_popularity_score': relative_popularity[route],
                    'searches': search_count,
                    'competing_routes': distinct_routes - 1,
                }

        # Build from records so each column keeps its natural dtype
        # (transposing a dict-of-dicts frame would make every column object).
        return pd.DataFrame(
            list(popularity_scores.values()),
            index=list(popularity_scores.keys()),
        )

    def generate_popularity_features(self, df, selections_df=None):
        """Generate per-record popularity features for an ML model.

        Runs every popularity calculation on this object, then maps the
        route-level scores back onto the individual records of ``df``.

        Args:
            df: Search records. ``self.extract_route_info(df)`` must provide a
                ``route`` column and a datetime ``departure_date`` column.
            selections_df: Optional selections data forwarded to the booking,
                seasonal, time-based and comparative calculations.

        Returns:
            DataFrame indexed like ``df`` with the columns
            ``search_popularity_score``, ``booking_popularity_score``,
            ``time_weighted_popularity_score``, ``relative_popularity_score``,
            ``seasonal_popularity_score`` and ``composite_popularity_score``.
            Routes missing from a score table get 0, except the relative
            score, which falls back to 50.
        """
        print("Calculating route popularity metrics...")

        # Calculate all popularity metrics
        search_popularity = self.calculate_search_volume_popularity(df)
        booking_popularity = self.calculate_booking_conversion_popularity(df, selections_df)
        seasonal_popularity = self.calculate_seasonal_popularity(df, selections_df)
        time_weighted_popularity = self.calculate_time_based_popularity(df, selections_df)
        comparative_popularity = self.calculate_comparative_popularity(df, selections_df)

        # Create feature dataframe
        routes_df = self.extract_route_info(df)
        route_per_record = routes_df['route']
        features = pd.DataFrame(index=df.index)

        # Map popularity scores back to individual records
        route_to_search_pop = search_popularity['combined_popularity_score'].to_dict()
        features['search_popularity_score'] = route_per_record.map(route_to_search_pop).fillna(0)

        if not booking_popularity.empty:
            route_to_booking_pop = booking_popularity['booking_popularity_score'].to_dict()
            features['booking_popularity_score'] = route_per_record.map(route_to_booking_pop).fillna(0)
        else:
            features['booking_popularity_score'] = 0

        route_to_time_pop = time_weighted_popularity['time_weighted_popularity'].to_dict()
        features['time_weighted_popularity_score'] = route_per_record.map(route_to_time_pop).fillna(0)

        if not comparative_popularity.empty:
            route_to_comp_pop = comparative_popularity['relative_popularity_score'].to_dict()
            features['relative_popularity_score'] = route_per_record.map(route_to_comp_pop).fillna(50)  # 50 = average
        else:
            features['relative_popularity_score'] = 50

        # Seasonal preference: score of each record's route in the season of
        # its departure date (calendar quarter -> season name).
        current_season_map = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
        season_per_record = routes_df['departure_date'].dt.quarter.map(current_season_map)

        # {route: {season: score}} lets every record be resolved with two dict
        # lookups instead of a row-wise iterrows()/.loc assignment, and the
        # column is created as float so no int -> float upcast happens later.
        seasonal_lookup = seasonal_popularity.to_dict(orient='index')
        no_scores = {}
        seasonal_scores = pd.Series(
            [
                seasonal_lookup.get(route, no_scores).get(season, 0.0)
                for route, season in zip(route_per_record, season_per_record)
            ],
            index=routes_df.index,
            dtype='float64',
        )
        features['seasonal_popularity_score'] = seasonal_scores
        # Records without a usable (route, season) score count as 0, in line
        # with the other score columns.
        features['seasonal_popularity_score'] = features['seasonal_popularity_score'].fillna(0.0)

        # Create composite popularity score
        features['composite_popularity_score'] = (
            features['search_popularity_score'] * 0.3 +
            features['booking_popularity_score'] * 0.3 +
            features['time_weighted_popularity_score'] * 0.2 +
            features['seasonal_popularity_score'] * 0.1 +
            features['relative_popularity_score'] * 0.1
        ).round(1)

        print(f"Generated popularity features for {len(features)} records")
        return features

# Example usage and analysis
def analyze_route_popularity(df, selections_df=None):
    """Print a route popularity report and return the generated ML features.

    Prints, in order: the first ten rows of the search-volume table, the first
    ten rows of the booking-conversion table (skipped when that table is
    empty), the head of the seasonal popularity table, and a ``describe()``
    summary of the generated features.

    Args:
        df: Search records handed to every ``RoutePopularityCalculator`` call.
        selections_df: Optional selections data forwarded to the booking,
            seasonal and feature-generation steps.

    Returns:
        The DataFrame produced by ``generate_popularity_features``.
    """
    popularity_calc = RoutePopularityCalculator()

    print("=== ROUTE POPULARITY ANALYSIS ===\n")

    # Section 1: raw search volume
    print("1. Most searched routes:")
    search_ranking = popularity_calc.calculate_search_volume_popularity(df)
    search_columns = ['search_count', 'combined_popularity_score']
    print(search_ranking.head(10)[search_columns])

    # Section 2: conversion; the table is only shown when there is data for it
    print("\n2. Best converting routes:")
    conversion_ranking = popularity_calc.calculate_booking_conversion_popularity(
        df, selections_df
    )
    if not conversion_ranking.empty:
        conversion_columns = ['conversion_rate', 'booking_popularity_score']
        print(conversion_ranking.head(10)[conversion_columns])

    # Section 3: seasonality
    print("\n3. Seasonal popularity patterns (top 5 routes):")
    seasonal_table = popularity_calc.calculate_seasonal_popularity(df, selections_df)
    print(seasonal_table.head())

    # Section 4: per-record features for the model
    print("\n4. Generating ML features...")
    feature_frame = popularity_calc.generate_popularity_features(df, selections_df)

    print("\nPopularity feature summary:")
    feature_summary = feature_frame.describe().round(2)
    print(feature_summary)

    return feature_frame

# Example usage
if __name__ == "__main__":
    # Running this module directly loads no data; it only prints a readiness
    # message. To run the analysis, call analyze_route_popularity yourself:
    # df = pd.read_csv('your_flight_data.csv')
    # selections_df = pd.read_csv('your_selections_data.csv')  # Optional
    # popularity_features = analyze_route_popularity(df, selections_df)
    print("Route popularity calculator ready. Use analyze_route_popularity(df) with your data.")