# 01 - Data Exploration and Preparation

This notebook explores and prepares the Sri Lankan tourist attraction data for optimization.

## Import Libraries

In [None]:
import os
import sys
sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from haversine import haversine, Unit

from data_utils import (
    load_attractions_data,
    calculate_distance_matrix,
    validate_attraction_data,
    prepare_data_for_optimization
)

# Select the 'seaborn-v0_8-darkgrid' matplotlib style sheet for the figures in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases (3.6+, I believe) - confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Create Sample Data

Since we don't have actual data yet, let's create a small sample dataset of Sri Lankan tourist attractions and save it as a CSV file for the rest of the notebook to load.

In [None]:
# Sample Sri Lankan tourist attractions.
# Each record holds the attraction name, its latitude/longitude, a satisfaction
# score, and the visit duration (plotted in hours later in this notebook).
attractions_data = [
    {'name': 'Sigiriya Rock Fortress', 'latitude': 7.9570, 'longitude': 80.7603, 'score': 9.5, 'visit_duration': 3.0},
    {'name': 'Temple of the Tooth', 'latitude': 7.2936, 'longitude': 80.6410, 'score': 9.0, 'visit_duration': 2.0},
    {'name': 'Galle Fort', 'latitude': 6.0260, 'longitude': 80.2170, 'score': 8.5, 'visit_duration': 2.5},
    {'name': 'Yala National Park', 'latitude': 6.3719, 'longitude': 81.5158, 'score': 9.2, 'visit_duration': 4.0},
    {'name': 'Dambulla Cave Temple', 'latitude': 7.8567, 'longitude': 80.6489, 'score': 8.8, 'visit_duration': 2.0},
    {'name': 'Ella Rock', 'latitude': 6.8667, 'longitude': 81.0467, 'score': 8.3, 'visit_duration': 3.5},
    {'name': 'Mirissa Beach', 'latitude': 5.9467, 'longitude': 80.4500, 'score': 8.0, 'visit_duration': 3.0},
    {'name': 'Pinnawala Elephant Orphanage', 'latitude': 7.2972, 'longitude': 80.3889, 'score': 7.8, 'visit_duration': 2.5},
    {'name': 'Nine Arch Bridge', 'latitude': 6.8800, 'longitude': 81.0586, 'score': 7.5, 'visit_duration': 1.5},
    {'name': 'Adam\'s Peak', 'latitude': 6.8094, 'longitude': 80.4997, 'score': 9.0, 'visit_duration': 5.0}
]

# DataFrame.to_csv does not create missing parent directories, so make sure
# ../data exists first; otherwise this cell fails on a fresh checkout.
sample_csv_path = '../data/sri_lanka_attractions.csv'
os.makedirs(os.path.dirname(sample_csv_path), exist_ok=True)

# Write the sample records to disk (overwrites any existing file at this path).
df = pd.DataFrame(attractions_data)
df.to_csv(sample_csv_path, index=False)
print(f"Sample data created and saved to {sample_csv_path}")

## Load and Explore Data

In [None]:
# Read the attractions CSV through the project loader and report how many rows came back.
attractions = load_attractions_data('../data/sri_lanka_attractions.csv')
n_loaded = len(attractions)
print(f"Loaded {n_loaded} attractions")
# Last expression: show up to the first ten rows as the cell output.
attractions.head(10)

In [None]:
# Display basic statistics
print("\nData Statistics:")
# Leave the summary frame as the cell's last expression so Jupyter renders it
# as a formatted table rather than the plain-text dump produced by print().
attractions.describe()

In [None]:
# Run the project validator and report its verdict as Passed / Failed.
is_valid = validate_attraction_data(attractions)
validation_status = 'Passed' if is_valid else 'Failed'
print(f"\nData validation: {validation_status}")

## Visualize Attraction Scores and Visit Durations

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One horizontal bar chart per panel: (column, bar colour, x-axis label, title).
panel_specs = [
    ('score', 'steelblue', 'Satisfaction Score', 'Tourist Satisfaction Scores'),
    ('visit_duration', 'coral', 'Visit Duration (hours)', 'Required Visit Durations'),
]

for panel_ax, (column, bar_color, x_label, panel_title) in zip(axes, panel_specs):
    panel_ax.barh(attractions['name'], attractions[column], color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Calculate Distance Matrix

In [None]:
# Build the pairwise distance matrix for all attractions via the project helper,
# then show its shape and the top-left 5x5 corner as a quick sanity check.
distance_matrix = calculate_distance_matrix(attractions)
matrix_shape = distance_matrix.shape
top_left_block = distance_matrix[:5, :5]
print(f"Distance matrix shape: {matrix_shape}")
print("\nSample distances (in km):")
print(top_left_block)

In [None]:
# Heatmap of the distance matrix, with attraction names on both axes.
attraction_labels = attractions['name']
heatmap_options = {
    'xticklabels': attraction_labels,
    'yticklabels': attraction_labels,
    'cmap': 'YlOrRd',
    'annot': False,
    'fmt': '.1f',
    'cbar_kws': {'label': 'Distance (km)'},
}

plt.figure(figsize=(10, 8))
sns.heatmap(distance_matrix, **heatmap_options)
plt.title('Distance Matrix Between Attractions')
# Slant the x tick labels; keep the y tick labels horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Visualize Attraction Locations

In [None]:
# Scatter map of the attractions: x = longitude, y = latitude,
# marker colour = score, marker area = visit duration * 100.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    attractions['longitude'],
    attractions['latitude'],
    c=attractions['score'],
    s=attractions['visit_duration'] * 100,
    cmap='viridis',
    alpha=0.6,
    edgecolors='black',
    linewidth=1,
)

# Label every point with the attraction name, nudged 5pt up and to the right.
label_box_style = dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.3)
label_points = zip(attractions['name'], attractions['longitude'], attractions['latitude'])
for label, lon, lat in label_points:
    plt.annotate(
        label,
        (lon, lat),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=8,
        bbox=dict(label_box_style),  # fresh dict per annotation, as in a literal
    )

plt.colorbar(scatter, label='Satisfaction Score')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Sri Lankan Tourist Attractions\n(Size = Visit Duration, Color = Score)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Prepare Data for Optimization

In [None]:
# Prepare data for optimization algorithms
prepared_data = prepare_data_for_optimization(attractions)

if prepared_data:
    print("Data prepared successfully!")
    print(f"Number of attractions: {prepared_data['n_attractions']}")
    print(f"Score range: {prepared_data['scores'].min():.2f} - {prepared_data['scores'].max():.2f}")
    print(f"Duration range: {prepared_data['visit_durations'].min():.2f} - {prepared_data['visit_durations'].max():.2f} hours")

    # Save prepared data
    np.save('../data/distance_matrix.npy', prepared_data['distance_matrix'])
    print("\nDistance matrix saved to ../data/distance_matrix.npy")
else:
    # A falsy result used to leave this cell completely silent, so a failed
    # preparation was indistinguishable from a cell that had not been run.
    print("Data preparation failed: no prepared data was returned; distance matrix was NOT saved.")

## Summary Statistics

In [None]:
# Summary report of the explored dataset.
banner = "=" * 60

# Select every off-diagonal entry by position. Filtering with "> 0" would also
# drop genuine zero-distance pairs (co-located attractions) and skew the mean.
off_diagonal = ~np.eye(distance_matrix.shape[0], dtype=bool)
pairwise_distances = distance_matrix[off_diagonal]

print(banner)
print("DATA EXPLORATION SUMMARY")
print(banner)
print(f"Total Attractions: {len(attractions)}")
print(f"Average Score: {attractions['score'].mean():.2f}")
print(f"Average Visit Duration: {attractions['visit_duration'].mean():.2f} hours")
print(f"Total Visit Time (all attractions): {attractions['visit_duration'].sum():.2f} hours")
if pairwise_distances.size:
    print(f"Average Distance Between Attractions: {pairwise_distances.mean():.2f} km")
else:
    # With fewer than two attractions there are no pairs; avoid mean-of-empty (nan + warning).
    print("Average Distance Between Attractions: n/a (fewer than two attractions)")
print(f"Max Distance Between Any Two Attractions: {distance_matrix.max():.2f} km")
print(banner)

## Next Steps

The data is now ready for optimization. In the next notebook, we will implement a Genetic Algorithm to solve the Tourist Trip Design Problem.