# Feature Engineering & Hypothesis Testing Notebook

This notebook focuses on generating new features from the cleaned dataset and testing hypotheses based on domain knowledge and EDA insights.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the cleaned training and test data.
# Forward slashes are used on purpose: Python resolves them on Windows, Linux
# and macOS, whereas a backslash is only a path separator on Windows.
train = pd.read_csv('../data/cleaned_data/train_cleaned.csv')
test = pd.read_csv('../data/cleaned_data/test_cleaned.csv')

# Quick check: both frames loaded; preview the first training rows
print(f'Train shape: {train.shape}, Test shape: {test.shape}')
train.head()

## 1. Feature Engineering
We'll use domain knowledge to create new features intended to improve predictive power.

In [ ]:
# --- 1. Delay Features ---
# Bin edges (in minutes) and labels shared by both delay columns.
# pd.cut builds right-closed intervals by default, so the categories are
# (-1, 0] -> On-time, (0, 15] -> Minor, (15, 60] -> Moderate, (60, inf] -> Severe.
# Values at or below -1, and missing values, fall outside every bin and become NaN.
DELAY_BINS = [-1, 0, 15, 60, np.inf]
DELAY_LABELS = ['On-time', 'Minor', 'Moderate', 'Severe']

# Source delay column -> name of the derived category column
DELAY_CATEGORY_COLS = {
    'Departure_Delay_in_Minutes': 'Departure_Delay_Category',
    'Arrival_Delay_in_Minutes': 'Arrival_Delay_Category',
}

# --- 2. Service Ratings Aggregate ---
service_cols = ['Inflight_wifi_service', 'Online_Boarding', 'Seat_Comfort', 'Cleanliness',
                'Inflight_Entertainment', 'Checkin_Service', 'Food_and_Drink', 'Inflight_Service',
                'Ease_of_Online_booking', 'Gate_Location', 'Leg_Room_Service', 'Baggage_Handling']

# Minimum average rating for a row to be flagged as "high service"
HIGH_SERVICE_THRESHOLD = 4


def add_engineered_features(df):
    """Add the delay-category and service-score features to ``df``.

    The same transformation is applied to the train and test frames, so it
    lives in one place instead of being copy-pasted per frame.

    Columns added (in this order):
        Departure_Delay_Category, Arrival_Delay_Category: binned delay columns.
        Total_Service_Score: row-wise sum of ``service_cols``.
        Average_Service_Score: row-wise mean of ``service_cols``.
        High_Service_Flag: 1 if the average score is >= HIGH_SERVICE_THRESHOLD, else 0.

    Args:
        df: DataFrame containing the delay columns and every column in ``service_cols``.

    Returns:
        The same DataFrame object, modified in place, for convenient reassignment.
    """
    for delay_col, category_col in DELAY_CATEGORY_COLS.items():
        df[category_col] = pd.cut(df[delay_col], bins=DELAY_BINS, labels=DELAY_LABELS)

    ratings = df[service_cols]
    df['Total_Service_Score'] = ratings.sum(axis=1)
    df['Average_Service_Score'] = ratings.mean(axis=1)
    df['High_Service_Flag'] = np.where(
        df['Average_Service_Score'] >= HIGH_SERVICE_THRESHOLD, 1, 0
    )
    return df


# Assign the result so the cell does not echo a DataFrame as its output
train = add_engineered_features(train)
test = add_engineered_features(test)

## 2. Hypothesis Testing
We'll test some hypotheses from EDA using statistical tests.

In [ ]:
# Hypothesis 1: Business travelers are more satisfied than Personal (leisure) travelers
# H0: mean satisfaction is the same in both groups (two-sided test).
# NOTE(review): assumes `satisfaction` is numerically encoded (e.g. 0/1) — confirm.
business_satisfaction = train.loc[train['Type_of_Travel'] == 'Business', 'satisfaction']
leisure_satisfaction = train.loc[train['Type_of_Travel'] == 'Personal', 'satisfaction']

# An empty group means the label literals above do not match the data. The test
# would then return NaN and fall through to "Fail to reject", which is misleading.
if business_satisfaction.empty or leisure_satisfaction.empty:
    raise ValueError(
        "No rows found for 'Business' and/or 'Personal' in Type_of_Travel; "
        "check the category labels in the cleaned data."
    )

# Welch's t-test (equal_var=False): does not assume both groups share one variance
t_stat, p_val = stats.ttest_ind(business_satisfaction, leisure_satisfaction, equal_var=False)
print(f'T-Test: t={t_stat:.3f}, p={p_val:.4f}')
if p_val < 0.05:
    print('Reject null hypothesis: Business travelers satisfaction differs significantly from Personal travelers')
else:
    print('Fail to reject null hypothesis')

In [ ]:
# Hypothesis 2: Loyalty effect — is Customer_Type associated with satisfaction?
# Cross-tabulate the two columns, then run a chi-square test on the counts.
contingency_table = pd.crosstab(
    index=train['Customer_Type'],
    columns=train['satisfaction'],
)
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f'Chi-square Test: chi2={chi2:.3f}, p={p:.4f}')
print(
    'Reject null hypothesis: Customer type is associated with satisfaction'
    if p < 0.05
    else 'Fail to reject null hypothesis'
)

In [ ]:
# Hypothesis 3: High service score correlates with satisfaction
# H0: mean satisfaction is the same for High_Service_Flag == 1 and == 0 (two-sided test).
# NOTE(review): assumes `satisfaction` is numerically encoded (e.g. 0/1) — confirm.
high_service = train.loc[train['High_Service_Flag'] == 1, 'satisfaction']
low_service = train.loc[train['High_Service_Flag'] == 0, 'satisfaction']

# With an empty group the test returns NaN and would silently print
# "Fail to reject"; stop with an explicit error instead.
if high_service.empty or low_service.empty:
    raise ValueError(
        'High_Service_Flag has no rows in one of its groups; '
        'the high-vs-low comparison cannot be computed.'
    )

# Welch's t-test (equal_var=False): does not assume both groups share one variance
t_stat, p_val = stats.ttest_ind(high_service, low_service, equal_var=False)
print(f'T-Test (High vs Low Service): t={t_stat:.3f}, p={p_val:.4f}')
if p_val < 0.05:
    print('Reject null hypothesis: High service rating impacts satisfaction')
else:
    print('Fail to reject null hypothesis')

## 3. Visualizing Feature Effects
We'll visualize some of the engineered features vs satisfaction.

In [ ]:
# Box plot: spread of Total_Service_Score within each satisfaction class
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='satisfaction', y='Total_Service_Score', ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Neutral/Dissatisfied', 'Satisfied'])
ax.set_title('Total Service Score vs Satisfaction')
plt.show()

# Histogram with KDE overlay: Average_Service_Score, coloured by satisfaction
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=train,
    x='Average_Service_Score',
    hue='satisfaction',
    bins=20,
    kde=True,
    palette=['#44a779', '#3c6682'],
    ax=ax,
)
ax.set_title('Average Service Score Distribution by Satisfaction')
plt.show()

## 4. Saving Engineered Features
Save the datasets with new features for modeling.

In [ ]:
# Write the engineered frames to the cleaned-data folder.
# Forward slashes resolve on Windows, Linux and macOS. With backslashes, a POSIX
# system would treat the whole string as a single file name and write it into
# the current working directory instead of ../data/cleaned_data/.
train.to_csv('../data/cleaned_data/train_engineered.csv', index=False)
test.to_csv('../data/cleaned_data/test_engineered.csv', index=False)
print('Engineered datasets saved successfully.')