# In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class RelativeCostFeatureEngineer:
    """
    Engineer relative cost features for flight selection prediction.
    Captures price positioning within each search session.

    A search session is identified by whichever of ``profileId``,
    ``ranker_id`` and ``requestDate`` are present in the input frame; when
    none is present the whole frame is treated as a single session.

    None of the methods modify the DataFrame they are given, and every
    result is written by row position, so the input may carry any index
    (including a non-default or duplicated one).
    """

    # Columns that together identify one search session.
    _SEARCH_COLS = ('profileId', 'ranker_id', 'requestDate')
    # Value tier names, ordered from worst to best.
    _VALUE_TIER_LABELS = ('Low Value', 'Medium Value', 'High Value')

    def __init__(self):
        self.search_stats = {}

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _search_session_keys(self, df, verbose=False):
        """Return one search-session key per row as an array; ``df`` is not modified.

        With several identifying columns the key is their string values
        joined by ``'_'``; with exactly one, the raw column values are used.
        ``verbose`` prints a notice when identifying columns are missing.
        """
        available = [col for col in self._SEARCH_COLS if col in df.columns]
        missing_cols = [col for col in self._SEARCH_COLS if col not in df.columns]
        if verbose and missing_cols:
            print(f"Warning: Missing columns {missing_cols}. Using available columns for search grouping.")
            if not available:
                print("No search session columns found. Using index as search session.")

        if not available or len(df) == 0:
            # Single search session (or nothing to group at all)
            return np.zeros(len(df), dtype=np.int64)
        if len(available) == 1:
            return df[available[0]].to_numpy()
        return df[available].astype(str).agg('_'.join, axis=1).to_numpy()

    @staticmethod
    def _iter_group_positions(keys):
        """Yield, for each session, the integer row positions that belong to it."""
        codes, _ = pd.factorize(keys)
        order = np.argsort(codes, kind='stable')
        starts = np.flatnonzero(np.diff(codes[order])) + 1
        for positions in np.split(order, starts):
            # factorize marks a missing key with -1; those rows are skipped
            # and end up with the fill value 0 in every feature.
            if positions.size and codes[positions[0]] >= 0:
                yield positions

    @staticmethod
    def _price_array(df):
        """Return ``totalPrice`` as a float array (missing values become NaN)."""
        return df['totalPrice'].astype(float).to_numpy()

    @staticmethod
    def _session_score(values, lower_is_better=False):
        """Min-max scale ``values`` to [0, 1] within one session.

        Returns all ones when the session has fewer than two known values or
        no spread, regardless of ``lower_is_better``. NaN inputs stay NaN.
        """
        values = np.asarray(values, dtype=float)
        known = values[~np.isnan(values)]
        if known.size < 2 or known.max() <= known.min():
            return np.ones(values.size)
        scaled = (values - known.min()) / (known.max() - known.min())
        return 1 - scaled if lower_is_better else scaled

    def _value_tier_labels(self, scores):
        """Label each score Low/Medium/High Value using the session's tercile cuts.

        A score at or below the 1/3 quantile is Low, above the 2/3 quantile
        is High, otherwise Medium. When all scores are equal every flight is
        Medium. NaN scores get no label. Tied scores can leave a tier empty
        but never raise.
        """
        labels = np.full(scores.size, np.nan, dtype=object)
        valid = ~np.isnan(scores)
        known = scores[valid]
        if known.size == 0:
            return labels
        if known.max() <= known.min():
            labels[valid] = 'Medium Value'
            return labels
        cuts = pd.Series(known).quantile(np.linspace(0, 1, 4)[1:3]).to_numpy()
        codes = np.searchsorted(cuts, known, side='left')
        labels[valid] = np.asarray(self._VALUE_TIER_LABELS, dtype=object)[codes]
        return labels

    # ------------------------------------------------------------------
    # Feature groups
    # ------------------------------------------------------------------
    def calculate_search_level_price_features(self, df):
        """
        Calculate relative price features within each search session.

        Parameters:
            df: DataFrame with a ``totalPrice`` column and, optionally, the
                search-session columns. It is not modified.

        Returns:
            DataFrame indexed like ``df`` that always has the same 14 float
            columns. Undefined values (and rows whose session key is
            missing) are 0.

        Notes:
            * ``price_zscore``, ``price_std`` and ``price_cv`` use the
              population standard deviation (NumPy's default).
            * ``price_gap_to_next_cheapest`` is the distance up to the next
              price in ascending order; it is 0 for the most expensive
              flight and for any price that is tied with another flight.
            * A session with a single flight gets neutral values (rank 1,
              percentile 50, ratios 1, both extreme flags set).
        """
        keys = self._search_session_keys(df, verbose=True)
        prices = self._price_array(df)

        columns = [
            'price_rank', 'price_percentile', 'price_zscore',
            'price_ratio_to_min', 'price_ratio_to_max', 'price_ratio_to_median',
            'is_cheapest', 'is_most_expensive', 'is_below_median',
            'price_gap_to_next_cheapest',
            'options_count', 'price_range', 'price_std', 'price_cv',
        ]
        out = {name: np.full(len(df), np.nan) for name in columns}

        for pos in self._iter_group_positions(keys):
            p = prices[pos]
            n = p.size

            if n <= 1:
                # Single flight in search - set neutral values
                out['price_rank'][pos] = 1
                out['price_percentile'][pos] = 50.0
                out['price_zscore'][pos] = 0.0
                out['price_ratio_to_min'][pos] = 1.0
                out['price_ratio_to_max'][pos] = 1.0
                out['price_ratio_to_median'][pos] = 1.0
                out['is_cheapest'][pos] = 1
                out['is_most_expensive'][pos] = 1
                out['price_gap_to_next_cheapest'][pos] = 0.0
                out['options_count'][pos] = 1
                continue

            # Ranking: ties share the lowest rank; percentile is 0-100, lower = cheaper
            ranks = stats.rankdata(p, method='min')
            out['price_rank'][pos] = ranks
            out['price_percentile'][pos] = (ranks - 1) / (n - 1) * 100

            mean_price = p.mean()
            std_price = p.std()
            min_price = p.min()
            max_price = p.max()
            median_price = np.median(p)

            out['price_zscore'][pos] = (p - mean_price) / std_price if std_price > 0 else 0.0

            # Zero prices produce inf/NaN here exactly as plain division would;
            # errstate only silences NumPy's runtime warning.
            with np.errstate(divide='ignore', invalid='ignore'):
                out['price_ratio_to_min'][pos] = p / min_price
                out['price_ratio_to_max'][pos] = p / max_price
                out['price_ratio_to_median'][pos] = p / median_price

            out['is_cheapest'][pos] = p == min_price
            out['is_most_expensive'][pos] = p == max_price
            out['is_below_median'][pos] = p < median_price

            # Gap to the element that follows the first occurrence of each
            # price in sorted order (vectorised; tolerant of NaN prices).
            ordered = np.sort(p)
            first = np.searchsorted(ordered, p, side='left')
            following = np.minimum(first + 1, n - 1)
            out['price_gap_to_next_cheapest'][pos] = np.where(
                first < n - 1, ordered[following] - p, 0.0
            )

            # Search context features
            out['options_count'][pos] = n
            out['price_range'][pos] = max_price - min_price
            out['price_std'][pos] = std_price
            out['price_cv'][pos] = std_price / mean_price if mean_price > 0 else 0

        return pd.DataFrame(out, index=df.index).fillna(0)

    def calculate_price_tiers(self, df, n_tiers=5):
        """
        Create price tier features within each search.

        Parameters:
            df: DataFrame with a ``totalPrice`` column; not modified.
            n_tiers: number of tiers (>= 1). Tier 1 is the cheapest band.

        Returns:
            DataFrame indexed like ``df`` with ``price_tier`` and one
            ``is_tier_<k>`` indicator per tier (floats; 0 where undefined).

        Raises:
            ValueError: if ``n_tiers`` is smaller than 1.

        Sessions with at least ``n_tiers`` distinct prices are split at the
        session's price percentiles (a price equal to a boundary falls in the
        lower tier). Sessions with fewer distinct prices spread the price
        ranks over the tiers. Repeated percentile boundaries leave some tiers
        empty instead of raising.
        """
        if n_tiers < 1:
            raise ValueError("n_tiers must be at least 1")

        keys = self._search_session_keys(df)
        prices = self._price_array(df)
        tiers = np.full(len(df), np.nan)
        percentile_points = [i * 100 / n_tiers for i in range(n_tiers + 1)]

        for pos in self._iter_group_positions(keys):
            p = prices[pos]
            if p.size <= 1:
                tiers[pos] = 1
            elif np.unique(p).size >= n_tiers:
                # Same bins as pd.cut(right=True, include_lowest=True) over the
                # percentile edges, but duplicate edges are harmless here.
                edges = np.percentile(p, percentile_points)
                tiers[pos] = np.searchsorted(edges[1:-1], p, side='left') + 1
            else:
                # Use rank-based tiers for limited unique prices
                ranks = stats.rankdata(p, method='min')
                tier_size = ranks.max() / n_tiers
                tiers[pos] = np.clip(np.ceil(ranks / tier_size), 1, n_tiers)

        features = pd.DataFrame({'price_tier': tiers}, index=df.index)
        for tier in range(1, n_tiers + 1):
            features[f'is_tier_{tier}'] = (tiers == tier).astype(float)
        return features.fillna(0)

    def calculate_value_score(self, df):
        """
        Calculate a value score that considers both price and quality indicators.
        Lower prices with better features = higher value score.

        Quality indicators are used only when their columns exist:
        ``legs0_duration`` (+ ``legs1_duration``), columns containing both
        ``segments1_`` and ``aircraft_code``, columns containing
        ``seatsAvailable`` and columns containing ``baggageAllowance_quantity``.
        Each is min-max scaled within the session and averaged; without any
        of them the quality score is a neutral 0.5.

        Parameters:
            df: DataFrame with a ``totalPrice`` column; not modified.

        Returns:
            DataFrame indexed like ``df`` with ``value_score``
            (0-100 = 60% price score + 40% quality score), ``value_tier``
            (string label) and ``is_high_value``. Undefined cells are 0.
        """
        keys = self._search_session_keys(df)
        prices = self._price_array(df)
        n_rows = len(df)

        # (values for every row, lower_is_better) per quality indicator
        indicators = []

        # Duration efficiency (shorter is better)
        if 'legs0_duration' in df.columns:
            minutes = pd.to_timedelta(df['legs0_duration']).dt.total_seconds().to_numpy() / 60
            if 'legs1_duration' in df.columns:
                minutes = minutes + pd.to_timedelta(df['legs1_duration']).dt.total_seconds().to_numpy() / 60
            indicators.append((minutes, True))

        # Direct flights preference (fewer segments is better)
        segment_cols = [col for col in df.columns
                        if isinstance(col, str) and 'segments1_' in col and 'aircraft_code' in col]
        if segment_cols:
            # +2 for the base segments (legs0_segments0 and legs1_segments0)
            segments = df[segment_cols].notna().sum(axis=1).to_numpy(dtype=float) + 2
            indicators.append((segments, True))

        # Seat availability (more available seats = better)
        seat_cols = [col for col in df.columns if isinstance(col, str) and 'seatsAvailable' in col]
        if seat_cols:
            indicators.append((df[seat_cols].sum(axis=1).to_numpy(dtype=float), False))

        # Baggage allowance (more is better)
        baggage_cols = [col for col in df.columns
                        if isinstance(col, str) and 'baggageAllowance_quantity' in col]
        if baggage_cols:
            indicators.append((df[baggage_cols].sum(axis=1).to_numpy(dtype=float), False))

        value_score = np.full(n_rows, np.nan)
        value_tier = np.full(n_rows, np.nan, dtype=object)
        is_high_value = np.full(n_rows, np.nan)

        for pos in self._iter_group_positions(keys):
            if indicators:
                parts = [self._session_score(values[pos], lower_is_better)
                         for values, lower_is_better in indicators]
                # Row-wise mean that ignores indicators missing for a flight
                quality = pd.DataFrame(np.column_stack(parts)).mean(axis=1).to_numpy()
            else:
                quality = np.full(pos.size, 0.5)  # Neutral score

            # Price score (lower price = higher score)
            price_score = self._session_score(prices[pos], lower_is_better=True)

            # Combined value score (weighted combination)
            scores = (price_score * 0.6 + quality * 0.4) * 100
            value_score[pos] = scores

            if pos.size > 1:
                labels = self._value_tier_labels(scores)
                value_tier[pos] = labels
                is_high_value[pos] = labels == 'High Value'
            else:
                value_tier[pos] = 'Medium Value'
                is_high_value[pos] = 0

        features = pd.DataFrame(
            {'value_score': value_score, 'value_tier': value_tier, 'is_high_value': is_high_value},
            index=df.index,
        )
        return features.fillna(0)

    def calculate_competitive_price_position(self, df):
        """
        Calculate price position relative to the other prices on the same route.

        Route statistics (mean, median, sample standard deviation of
        ``totalPrice``) are computed over the rows of ``df`` that share a
        ``searchRoute`` value.

        Parameters:
            df: DataFrame with ``searchRoute`` and ``totalPrice`` columns;
                not modified.

        Returns:
            DataFrame indexed like ``df`` with percentage differences to the
            route mean/median, a route z-score, ``price_surprise_score``
            (absolute difference to the median) and a categorical
            ``price_category``. Undefined numeric values are 0, and the
            category is derived from the filled value, so a row without
            route statistics is labelled 'Fair'.

        Raises:
            KeyError: if ``searchRoute`` or ``totalPrice`` is missing.
        """
        price = self._price_array(df)

        route_stats = df.groupby('searchRoute')['totalPrice'].agg(['mean', 'median', 'std'])
        # map() keeps the row order of df, so the results below are positional
        # and do not depend on df's index.
        route = df['searchRoute']
        route_avg = route.map(route_stats['mean']).astype(float).to_numpy()
        route_median = route.map(route_stats['median']).astype(float).to_numpy()
        route_std = route.map(route_stats['std']).astype(float).to_numpy()

        with np.errstate(divide='ignore', invalid='ignore'):
            vs_avg = (price / route_avg - 1) * 100
            vs_median = (price / route_median - 1) * 100
            # Z-score relative to route prices; 0 when the spread is unknown or zero
            zscore = np.where(route_std > 0, (price - route_avg) / route_std, 0)

        features = pd.DataFrame(
            {
                'price_vs_route_avg': vs_avg,
                'price_vs_route_median': vs_median,
                'price_zscore_vs_route': zscore,
                # Price surprise (how different from expected)
                'price_surprise_score': np.abs(vs_median),
            },
            index=df.index,
        ).fillna(0)

        # Added after fillna: filling a categorical column with a value that
        # is not one of its categories is rejected by pandas.
        features['price_category'] = pd.cut(
            features['price_vs_route_median'].to_numpy(),
            bins=[-float('inf'), -20, -10, 10, 20, float('inf')],
            labels=['Very Cheap', 'Cheap', 'Fair', 'Expensive', 'Very Expensive']
        )

        return features

    def generate_all_relative_cost_features(self, df):
        """Generate every relative cost feature group and concatenate them column-wise.

        Parameters:
            df: DataFrame with ``totalPrice`` and ``searchRoute`` columns
                plus any optional columns used by the individual feature
                groups. It is not modified.

        Returns:
            DataFrame indexed like ``df`` containing the columns of all four
            feature groups.
        """
        print("Generating relative cost features...")

        all_features = pd.concat([
            self.calculate_search_level_price_features(df),
            self.calculate_price_tiers(df, n_tiers=5),
            self.calculate_value_score(df),
            self.calculate_competitive_price_position(df),
        ], axis=1)

        print(f"Generated {len(all_features.columns)} relative cost features")
        return all_features

# Example usage and testing
def demonstrate_relative_cost_features():
    """Demonstrate the relative cost feature engineering.

    Builds a seven-row sample frame containing two search sessions, runs
    ``RelativeCostFeatureEngineer.generate_all_relative_cost_features`` on
    it and prints a few key features per session, followed by the full list
    of generated feature names. Returns nothing.
    """
    print("=== RELATIVE COST FEATURE ENGINEERING DEMO ===\n")

    # The sample data below is fixed; the seed is kept so callers that relied
    # on this function seeding NumPy's global RNG see no change.
    np.random.seed(42)
    sample_df = pd.DataFrame({
        'profileId': [1, 1, 1, 1, 2, 2, 2],
        'ranker_id': ['a', 'a', 'a', 'a', 'b', 'b', 'b'],
        'requestDate': pd.to_datetime(['2024-06-01'] * 4 + ['2024-06-02'] * 3),
        'totalPrice': [15000, 18000, 22000, 25000, 12000, 16000, 20000],
        'searchRoute': ['TLKKJA/KJATLK'] * 7,
        'legs0_duration': ['02:40:00'] * 7,
        'legs1_duration': ['02:35:00'] * 7,
    })

    # Generate features
    feature_engineer = RelativeCostFeatureEngineer()
    cost_features = feature_engineer.generate_all_relative_cost_features(sample_df)

    print("Sample relative cost features:")
    print("=" * 50)

    key_features = [
        'price_rank', 'price_percentile', 'price_ratio_to_min',
        'is_cheapest', 'price_tier', 'value_score'
    ]

    # (heading, row slice) for each search session in the sample
    sessions = [
        ("Search Session 1 (4 flights):", slice(None, 4)),
        ("\nSearch Session 2 (3 flights):", slice(4, None)),
    ]
    for heading, rows in sessions:
        print(heading)
        # assign() returns a new frame, avoiding a column write on a slice copy
        session_view = cost_features.iloc[rows][key_features].assign(
            totalPrice=sample_df.iloc[rows]['totalPrice']
        )
        print(session_view.round(2))

    print(f"\nTotal features generated: {len(cost_features.columns)}")
    print("Feature names:", list(cost_features.columns))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demonstrate_relative_cost_features()