# TripX - Travel Recommendation System
## 02. Exploratory Data Analysis (EDA)

**Goal**: Deep dive into feature relationships and patterns to inform our ML recommendation logic.

**Key Questions**:
- How do cost, popularity, and safety relate?
- What patterns exist by trip type and region?
- Which features will be most important for recommendations?
- Are there any feature correlations we should consider?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance: matplotlib's default style, then seaborn's "husl" palette.
# NOTE(review): keep this order — applying the style after the palette would
# presumably reset the colour cycle; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

# Read the raw destinations table and report how many rows were loaded
df = pd.read_csv('../data/raw/dest.csv')
destination_count = len(df)
print(f"Loaded {destination_count} destinations for analysis")

## 1. Cost Analysis - Key for Budget Matching

In [None]:
# Cost distribution and patterns.
# Draws a 2x2 figure (cost histogram, cost by region, cost by trip type,
# cost vs popularity) and then prints which region / trip type has the
# highest and lowest mean daily cost.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
cost_label = 'Average Cost per Day ($)'

# Cost distribution
axes[0,0].hist(df['avg_cost_per_day'], bins=10, alpha=0.7, color='skyblue')
axes[0,0].set_title('Cost Distribution')
axes[0,0].set_xlabel(cost_label)
axes[0,0].set_ylabel('Number of Destinations')

# Cost by region
df.boxplot(column='avg_cost_per_day', by='region', ax=axes[0,1])
axes[0,1].set_title('Cost by Region')
axes[0,1].set_xlabel('Region')
axes[0,1].set_ylabel(cost_label)
axes[0,1].tick_params(axis='x', rotation=45)

# Cost by trip type
df.boxplot(column='avg_cost_per_day', by='trip_type', ax=axes[1,0])
axes[1,0].set_title('Cost by Trip Type')
axes[1,0].set_xlabel('Trip Type')
axes[1,0].set_ylabel(cost_label)
axes[1,0].tick_params(axis='x', rotation=45)

# Cost vs Popularity
axes[1,1].scatter(df['avg_cost_per_day'], df['popularity_score'], alpha=0.7)
axes[1,1].set_title('Cost vs Popularity')
axes[1,1].set_xlabel(cost_label)
axes[1,1].set_ylabel('Popularity Score')

# DataFrame.boxplot(by=...) adds its own figure-level "Boxplot grouped by ..."
# title. On this mixed 2x2 figure it would only name the last grouping and
# crowd the panel titles, so clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

# Cost insights: compute each group mean once and reuse it for both extremes
region_cost = df.groupby('region')['avg_cost_per_day'].mean()
trip_type_cost = df.groupby('trip_type')['avg_cost_per_day'].mean()

print("=== COST INSIGHTS ===")
print(f"Most expensive region: {region_cost.idxmax()}")
print(f"Cheapest region: {region_cost.idxmin()}")
print(f"Most expensive trip type: {trip_type_cost.idxmax()}")
print(f"Budget-friendly trip type: {trip_type_cost.idxmin()}")

## 2. Quality Scores Analysis - Popularity vs Safety

In [None]:
# Quality scores analysis: popularity and safety distributions, how the two
# scores relate, and their averages per trip type
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(popularity_ax, safety_ax), (scatter_ax, by_type_ax) = axes

# Score distributions (top row): one histogram per score column
histogram_specs = [
    (popularity_ax, 'popularity_score', 'lightgreen',
     'Popularity Score Distribution', 'Popularity Score'),
    (safety_ax, 'safety_score', 'lightcoral',
     'Safety Score Distribution', 'Safety Score'),
]
for panel, score_column, bar_color, panel_title, x_label in histogram_specs:
    panel.hist(df[score_column], bins=10, alpha=0.7, color=bar_color)
    panel.set(title=panel_title, xlabel=x_label, ylabel='Number of Destinations')

# Popularity vs Safety scatter (bottom left)
scatter_ax.scatter(df['popularity_score'], df['safety_score'], alpha=0.7, s=60)
scatter_ax.set(
    title='Popularity vs Safety',
    xlabel='Popularity Score',
    ylabel='Safety Score',
)

# Mean scores per trip type as grouped bars (bottom right)
trip_quality = df.groupby('trip_type')[['popularity_score', 'safety_score']].mean()
trip_quality.plot(kind='bar', ax=by_type_ax)
by_type_ax.set(title='Average Quality Scores by Trip Type', xlabel='Trip Type')
by_type_ax.tick_params(axis='x', rotation=45)
by_type_ax.legend(['Popularity', 'Safety'])

plt.tight_layout()
plt.show()

# Quality insights: score correlation plus the rows holding each maximum
correlation = df['popularity_score'].corr(df['safety_score'])
most_popular_row = df['popularity_score'].idxmax()
safest_row = df['safety_score'].idxmax()

print("=== QUALITY INSIGHTS ===")
print(f"Popularity-Safety correlation: {correlation:.3f}")
print(f"Highest rated destination: {df.loc[most_popular_row, 'destination']}")
print(f"Safest destination: {df.loc[safest_row, 'destination']}")

## 3. Trip Duration Patterns

In [None]:
# Duration analysis.
# Derives `duration_range` (max_days - min_days) as a new column on df, plots
# it and `min_days` per trip type, plots stay length against daily cost, and
# prints summary statistics.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Duration range by trip type
# NOTE: this adds a column to the shared df; the correlation cell reads it.
df['duration_range'] = df['max_days'] - df['min_days']
df.boxplot(column='duration_range', by='trip_type', ax=axes[0])
axes[0].set_title('Duration Flexibility by Trip Type')
axes[0].set_xlabel('Trip Type')
axes[0].set_ylabel('Duration Range (days)')
axes[0].tick_params(axis='x', rotation=45)

# Min days by trip type
df.boxplot(column='min_days', by='trip_type', ax=axes[1])
axes[1].set_title('Minimum Days by Trip Type')
axes[1].set_xlabel('Trip Type')
axes[1].set_ylabel('Minimum Days')
axes[1].tick_params(axis='x', rotation=45)

# Duration vs Cost: min and max stay plotted as two series on the same axes
axes[2].scatter(df['min_days'], df['avg_cost_per_day'], alpha=0.7, label='Min Days')
axes[2].scatter(df['max_days'], df['avg_cost_per_day'], alpha=0.7, label='Max Days')
axes[2].set_title('Duration vs Cost')
axes[2].set_xlabel('Days')
axes[2].set_ylabel('Cost per Day ($)')
axes[2].legend()

# DataFrame.boxplot(by=...) adds its own figure-level "Boxplot grouped by ..."
# title, which does not describe the third (scatter) panel and crowds the
# panel titles, so clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

# Group once by trip type and reuse the grouping for both per-type means
by_trip_type = df.groupby('trip_type')

print("=== DURATION INSIGHTS ===")
print(f"Most flexible trip type: {by_trip_type['duration_range'].mean().idxmax()}")
print(f"Longest minimum stay: {by_trip_type['min_days'].mean().idxmax()}")
print(f"Average duration range: {df['duration_range'].mean():.1f} days")

## 4. Seasonal and Climate Patterns

In [None]:
# Seasonal and climate analysis.
# Top row: share of destinations per best season and per climate (pie charts).
# Bottom row: daily cost by best season and popularity by climate (boxplots).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Season distribution
season_counts = df['season_best'].value_counts()
axes[0,0].pie(season_counts.values, labels=season_counts.index, autopct='%1.1f%%')
axes[0,0].set_title('Best Season Distribution')

# Climate distribution
climate_counts = df['climate'].value_counts()
axes[0,1].pie(climate_counts.values, labels=climate_counts.index, autopct='%1.1f%%')
axes[0,1].set_title('Climate Distribution')

# Cost by season
df.boxplot(column='avg_cost_per_day', by='season_best', ax=axes[1,0])
axes[1,0].set_title('Cost by Best Season')
axes[1,0].set_xlabel('Best Season')
axes[1,0].set_ylabel('Average Cost per Day ($)')
axes[1,0].tick_params(axis='x', rotation=45)

# Popularity by climate
df.boxplot(column='popularity_score', by='climate', ax=axes[1,1])
axes[1,1].set_title('Popularity by Climate')
axes[1,1].set_xlabel('Climate')
axes[1,1].set_ylabel('Popularity Score')
axes[1,1].tick_params(axis='x', rotation=45)

# DataFrame.boxplot(by=...) adds its own figure-level "Boxplot grouped by ..."
# title. It would only name the last grouping ("climate") and does not
# describe the pie charts, so clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

print("=== SEASONAL INSIGHTS ===")
# mode() returns every tied value; [0] takes the first one it lists
print(f"Most popular season: {df['season_best'].mode()[0]}")
print(f"Most common climate: {df['climate'].mode()[0]}")
print(f"Most expensive season: {df.groupby('season_best')['avg_cost_per_day'].mean().idxmax()}")

## 5. Feature Correlation Analysis

In [None]:
# Correlation matrix for numerical features.
# Draws an annotated heatmap of pairwise correlations, then lists every
# feature pair whose absolute correlation exceeds 0.5.
numerical_features = ['avg_cost_per_day', 'min_days', 'max_days', 'popularity_score', 'safety_score', 'duration_range']

# 'duration_range' is normally derived in the duration-analysis cell; derive it
# here when missing so this cell does not raise KeyError if run on its own.
if 'duration_range' not in df.columns:
    df['duration_range'] = df['max_days'] - df['min_days']

correlation_matrix = df[numerical_features].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.3f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

print("=== CORRELATION INSIGHTS ===")
print("Strong correlations (>0.5 or <-0.5):")

# Visit each unordered feature pair once (upper triangle, diagonal excluded)
feature_names = correlation_matrix.columns
found_strong_pair = False
for position, first_feature in enumerate(feature_names):
    for second_feature in feature_names[position + 1:]:
        corr_val = correlation_matrix.loc[first_feature, second_feature]
        # NaN correlations compare False here and are skipped
        if abs(corr_val) > 0.5:
            print(f"  {first_feature} vs {second_feature}: {corr_val:.3f}")
            found_strong_pair = True

# Say so explicitly instead of leaving an empty list under the header
if not found_strong_pair:
    print("  (none)")

## 6. Key Findings for ML Model Design

### Feature Importance Ranking:
1. **Budget matching** (`avg_cost_per_day`) - Wide range, clear patterns by region/type
2. **Trip type preference** - Strong differentiator for user preferences
3. **Quality scores** - Popularity and safety are important but not highly correlated
4. **Duration compatibility** - Important for trip planning
5. **Seasonal preference** - Secondary but useful for filtering

### ML Model Strategy:

In [None]:
# Planned recommendation strategy, printed as a heading followed by numbered
# sections. Each entry pairs a section heading with the lines printed below it.
strategy_sections = [
    ("\n1. SCORING APPROACH:", [
        "   - Budget compatibility score (0-1)",
        "   - Trip type match (binary)",
        "   - Duration fit score (0-1)",
        "   - Quality weighted score (popularity + safety)",
        "   - Season bonus (if matches)",
    ]),
    ("\n2. FEATURE ENGINEERING NEEDS:", [
        "   - Normalize cost ranges",
        "   - Encode categorical variables",
        "   - Create duration compatibility function",
        "   - Weight quality scores",
    ]),
    ("\n3. EXPLAINABILITY:", [
        "   - Show budget match percentage",
        "   - Highlight trip type alignment",
        "   - Display quality scores",
        "   - Mention seasonal fit",
    ]),
]

print("=== ML MODEL STRATEGY ===")
for section_heading, section_lines in strategy_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)

# Sample user profiles for testing: budget, trip length in days, preferred
# trip type and preferred season for three example travellers
print("\n=== SAMPLE USER PROFILES FOR TESTING ===")
profiles = {
    'budget_traveler': {'budget': 60, 'days': 7, 'type': 'culture', 'season': 'spring'},
    'luxury_traveler': {'budget': 200, 'days': 5, 'type': 'luxury', 'season': 'winter'},
    'beach_lover': {'budget': 100, 'days': 10, 'type': 'beach', 'season': 'summer'},
}

for profile_name in profiles:
    print(f"  {profile_name}: {profiles[profile_name]}")

## 7. Next Steps

**Day 4 - Feature Preprocessing (`prep.py`)**:
1. Create budget compatibility scoring function
2. Encode categorical variables (trip_type, season, climate)
3. Normalize numerical features
4. Build duration matching logic
5. Create feature vectors for similarity calculation

**Key insights to implement**:
- Daily costs vary widely across destinations, so budget matching needs a graded compatibility score (0-1) rather than a simple cutoff
- Trip type is a strong differentiator
- Popularity and safety scores should be combined into a single weighted quality score
- Duration flexibility varies by destination type