# West Nile Virus Data Exploration

This notebook explores the West Nile Virus prediction dataset to understand patterns, distributions, and relationships in the data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning category so cell output stays uncluttered
warnings.simplefilter('ignore')

# Plot appearance: matplotlib defaults, seaborn 'husl' palette, 12x8 figures
plt.style.use('default')
sns.set_palette('husl')
plt.rc('figure', figsize=(12, 8))

## 1. Data Loading

In [None]:
# Read the four CSV inputs (paths are relative to the notebook's working directory)
train_df, test_df, weather_df, spray_df = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/weather.csv', 'data/spray.csv'),
)

# Report (rows, columns) for each frame
print("Dataset shapes:")
for label, frame in (
    ("Training data", train_df),
    ("Test data", test_df),
    ("Weather data", weather_df),
    ("Spray data", spray_df),
):
    print(f"{label}: {frame.shape}")

## 2. Training Data Overview

In [None]:
# Basic info about training data
print("Training Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so call it directly; wrapping it in print() emits a stray "None" line.
train_df.info()
print("\nFirst few rows:")
train_df.head()

In [None]:
# Summary statistics of the training frame (DataFrame.describe defaults)
print("Training Data Summary:")
train_summary = train_df.describe()
train_summary

In [None]:
# Per-column count of missing entries; only columns with at least one are shown
print("Missing values in training data:")
missing_values = train_df.isna().sum()
missing_values.loc[missing_values.gt(0)]

## 3. Target Variable Analysis (WnvPresent)

In [None]:
# Target variable distribution
CLASS_LABELS = ['Absent', 'Present']          # display names for WnvPresent 0 / 1
CLASS_COLORS = ['skyblue', 'lightcoral']      # colours for WnvPresent 0 / 1

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# value_counts() orders by frequency, not by class value. Pin the order to
# [0, 1] so the fixed labels/colours above always line up with the right class
# (and a class with no rows still shows up as 0 instead of disappearing).
wnv_counts = train_df['WnvPresent'].value_counts().reindex([0, 1], fill_value=0)

# Count plot
axes[0].bar(wnv_counts.index, wnv_counts.values, color=CLASS_COLORS)
axes[0].set_title('West Nile Virus Presence Distribution')
axes[0].set_xlabel('WnvPresent')
axes[0].set_ylabel('Count')
axes[0].set_xticks([0, 1])
axes[0].set_xticklabels(CLASS_LABELS)

# Add count labels above each bar, positioned at the bar's own x coordinate
for class_value, count in zip(wnv_counts.index, wnv_counts.values):
    axes[0].text(class_value, count + 100, str(count), ha='center', va='bottom')

# Pie chart
axes[1].pie(wnv_counts.values, labels=CLASS_LABELS, autopct='%1.1f%%',
            colors=CLASS_COLORS)
axes[1].set_title('WNV Presence Proportion')

plt.tight_layout()
plt.show()

# Print statistics (these names are reused by the summary cell at the end)
total_samples = len(train_df)
positive_samples = train_df['WnvPresent'].sum()
negative_samples = total_samples - positive_samples
# Guard against a frame without any positive rows instead of dividing by zero
imbalance_ratio = negative_samples / positive_samples if positive_samples else float('inf')

print(f"Total samples: {total_samples:,}")
print(f"Positive samples (WNV Present): {positive_samples:,} ({positive_samples/total_samples*100:.2f}%)")
print(f"Negative samples (WNV Absent): {negative_samples:,} ({negative_samples/total_samples*100:.2f}%)")
print(f"Class imbalance ratio: {imbalance_ratio:.1f}:1")

## 4. Species Analysis

In [None]:
# Species distribution: sample counts (top panel) and WNV-positive rate (bottom panel)
fig, axes = plt.subplots(2, 1, figsize=(15, 12))
count_ax, rate_ax = axes

# Top panel: number of rows per species
species_counts = train_df['Species'].value_counts()
count_positions = range(len(species_counts))
count_ax.bar(count_positions, species_counts.values)
count_ax.set(title='Mosquito Species Distribution', xlabel='Species', ylabel='Count')
count_ax.set_xticks(count_positions)
count_ax.set_xticklabels(species_counts.index, rotation=45, ha='right')

# Bottom panel: per-species totals, positives and positive rate, highest rate first
species_wnv = (
    train_df.groupby('Species')['WnvPresent']
    .agg(Total_Samples='count', WNV_Positive='sum', WNV_Rate='mean')
    .reset_index()
    .sort_values('WNV_Rate', ascending=False)
)

rate_positions = range(len(species_wnv))
bars = rate_ax.bar(rate_positions, species_wnv['WNV_Rate'])
rate_ax.set(title='West Nile Virus Rate by Species', xlabel='Species',
            ylabel='WNV Positive Rate')
rate_ax.set_xticks(rate_positions)
rate_ax.set_xticklabels(species_wnv['Species'], rotation=45, ha='right')

# Write each bar's rate just above its top edge
for bar in bars:
    rate = bar.get_height()
    rate_ax.text(bar.get_x() + bar.get_width() / 2., rate + 0.001,
                 f'{rate:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\nWest Nile Virus by Species:")
print(species_wnv.to_string(index=False))

## 5. Temporal Analysis

In [None]:
def wnv_rate_by(frame, key):
    """Summarise WnvPresent per value of `key`.

    Returns a frame with columns [key, Total_Samples, WNV_Positive, WNV_Rate]:
    the row count, the sum of WnvPresent and the mean of WnvPresent per group.
    """
    return (
        frame.groupby(key)['WnvPresent']
        .agg(Total_Samples='count', WNV_Positive='sum', WNV_Rate='mean')
        .reset_index()
    )


# Convert Date column to datetime and derive calendar features
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['DayOfYear'] = train_df['Date'].dt.dayofyear
# isocalendar() returns nullable UInt32; cast to a plain integer so Week has
# the same kind of dtype as the other date parts in plots, correlations and
# printed summaries. (The cast requires that no Date is missing.)
train_df['Week'] = train_df['Date'].dt.isocalendar().week.astype(int)

fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# WNV by Year
yearly_wnv = wnv_rate_by(train_df, 'Year')

axes[0,0].bar(yearly_wnv['Year'], yearly_wnv['WNV_Rate'])
axes[0,0].set_title('WNV Rate by Year')
axes[0,0].set_xlabel('Year')
axes[0,0].set_ylabel('WNV Positive Rate')
# Tick only the years that actually occur, not interpolated axis values
axes[0,0].set_xticks(yearly_wnv['Year'])

# WNV by Month
monthly_wnv = wnv_rate_by(train_df, 'Month')

axes[0,1].bar(monthly_wnv['Month'], monthly_wnv['WNV_Rate'])
axes[0,1].set_title('WNV Rate by Month')
axes[0,1].set_xlabel('Month')
axes[0,1].set_ylabel('WNV Positive Rate')
axes[0,1].set_xticks(range(1, 13))

# WNV by Week
weekly_wnv = wnv_rate_by(train_df, 'Week')

axes[1,0].plot(weekly_wnv['Week'], weekly_wnv['WNV_Rate'], marker='o')
axes[1,0].set_title('WNV Rate by Week of Year')
axes[1,0].set_xlabel('Week')
axes[1,0].set_ylabel('WNV Positive Rate')
axes[1,0].grid(True, alpha=0.3)

# Sample count over time (number of rows recorded on each date)
date_counts = train_df.groupby('Date').size().reset_index(name='Count')
axes[1,1].plot(date_counts['Date'], date_counts['Count'])
axes[1,1].set_title('Sampling Activity Over Time')
axes[1,1].set_xlabel('Date')
axes[1,1].set_ylabel('Number of Samples')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nWNV Rate by Year:")
print(yearly_wnv.to_string(index=False))
print("\nWNV Rate by Month:")
print(monthly_wnv.to_string(index=False))

## 6. Geographic Analysis

In [None]:
# Geographic distribution
MIN_TRAP_SAMPLES = 10  # locations with fewer rows are excluded from rate statistics

fig, axes = plt.subplots(1, 3, figsize=(20, 6))

# All samples scatter plot
axes[0].scatter(train_df['Longitude'], train_df['Latitude'], alpha=0.5, s=1)
axes[0].set_title('All Trap Locations')
axes[0].set_xlabel('Longitude')
axes[0].set_ylabel('Latitude')

# WNV positive vs negative samples
wnv_positive = train_df[train_df['WnvPresent'] == 1]
wnv_negative = train_df[train_df['WnvPresent'] == 0]

axes[1].scatter(wnv_negative['Longitude'], wnv_negative['Latitude'],
                alpha=0.3, s=1, color='blue', label='WNV Negative')
axes[1].scatter(wnv_positive['Longitude'], wnv_positive['Latitude'],
                alpha=0.8, s=5, color='red', label='WNV Positive')
axes[1].set_title('WNV Distribution by Location')
axes[1].set_xlabel('Longitude')
axes[1].set_ylabel('Latitude')
axes[1].legend()

# Per-location totals, positives and positive rate. Locations are identified
# by their (Latitude, Longitude) pair.
trap_wnv_all = (
    train_df.groupby(['Latitude', 'Longitude'])['WnvPresent']
    .agg(Total_Samples='count', WNV_Positive='sum', WNV_Rate='mean')
    .reset_index()
)
# Keep the unfiltered frame so the total location count below is not
# understated; `trap_wnv` holds only locations with enough rows for a
# meaningful rate.
trap_wnv = trap_wnv_all[trap_wnv_all['Total_Samples'] >= MIN_TRAP_SAMPLES]

scatter = axes[2].scatter(trap_wnv['Longitude'], trap_wnv['Latitude'],
                          c=trap_wnv['WNV_Rate'], s=trap_wnv['Total_Samples']/5,
                          alpha=0.7, cmap='Reds')
axes[2].set_title('WNV Rate by Trap Location\n(Size = Sample Count)')
axes[2].set_xlabel('Longitude')
axes[2].set_ylabel('Latitude')
plt.colorbar(scatter, ax=axes[2], label='WNV Rate')

plt.tight_layout()
plt.show()

print(f"\nTotal unique trap locations: {len(trap_wnv_all)}")
print(f"Trap locations with >= {MIN_TRAP_SAMPLES} samples (used for the statistics below): {len(trap_wnv)}")
print(f"Traps with WNV positive samples: {len(trap_wnv[trap_wnv['WNV_Positive'] > 0])}")
print(f"Highest WNV rate: {trap_wnv['WNV_Rate'].max():.3f}")
print(f"Average WNV rate across traps: {trap_wnv['WNV_Rate'].mean():.3f}")

## 7. NumMosquitos Analysis

In [None]:
# NumMosquitos analysis
MIN_COUNT_SAMPLES = 10  # mosquito-count values seen in fewer rows are not plotted

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Distribution of NumMosquitos
axes[0,0].hist(train_df['NumMosquitos'], bins=50, alpha=0.7, edgecolor='black')
axes[0,0].set_title('Distribution of Number of Mosquitos')
axes[0,0].set_xlabel('Number of Mosquitos')
axes[0,0].set_ylabel('Frequency')

# Same distribution after a log(1 + x) transform
axes[0,1].hist(np.log1p(train_df['NumMosquitos']), bins=50, alpha=0.7, edgecolor='black')
axes[0,1].set_title('Distribution of log(NumMosquitos + 1)')
axes[0,1].set_xlabel('log(Number of Mosquitos + 1)')
axes[0,1].set_ylabel('Frequency')

# NumMosquitos vs WNV: positive rate for each distinct mosquito count
mosquito_wnv = (
    train_df.groupby('NumMosquitos')['WnvPresent']
    .agg(Total_Samples='count', WNV_Positive='sum', WNV_Rate='mean')
    .reset_index()
)
mosquito_wnv = mosquito_wnv[mosquito_wnv['Total_Samples'] >= MIN_COUNT_SAMPLES]

axes[1,0].scatter(mosquito_wnv['NumMosquitos'], mosquito_wnv['WNV_Rate'],
                  s=mosquito_wnv['Total_Samples']/5, alpha=0.7)
axes[1,0].set_title('WNV Rate vs Number of Mosquitos\n(Size = Sample Count)')
axes[1,0].set_xlabel('Number of Mosquitos')
axes[1,0].set_ylabel('WNV Rate')

# Box plot of mosquito counts split by WNV status
wnv_neg_mosquitos = train_df[train_df['WnvPresent'] == 0]['NumMosquitos']
wnv_pos_mosquitos = train_df[train_df['WnvPresent'] == 1]['NumMosquitos']

# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`). Setting the tick labels afterwards works on old and new
# versions alike.
axes[1,1].boxplot([wnv_neg_mosquitos, wnv_pos_mosquitos])
axes[1,1].set_xticklabels(['WNV Negative', 'WNV Positive'])
axes[1,1].set_title('NumMosquitos Distribution by WNV Status')
axes[1,1].set_ylabel('Number of Mosquitos')

plt.tight_layout()
plt.show()

print("\nNumMosquitos Statistics:")
print(f"Mean: {train_df['NumMosquitos'].mean():.2f}")
print(f"Median: {train_df['NumMosquitos'].median():.2f}")
print(f"Max: {train_df['NumMosquitos'].max()}")
print(f"Min: {train_df['NumMosquitos'].min()}")

print("\nNumMosquitos by WNV Status:")
print(f"WNV Negative - Mean: {wnv_neg_mosquitos.mean():.2f}, Median: {wnv_neg_mosquitos.median():.2f}")
print(f"WNV Positive - Mean: {wnv_pos_mosquitos.mean():.2f}, Median: {wnv_pos_mosquitos.median():.2f}")

## 8. Weather Data Analysis

In [None]:
# Weather data overview
print("Weather Data Info:")
# DataFrame.info() prints its own report and returns None; calling it inside
# print() would add a stray "None" line to the output.
weather_df.info()
print("\nWeather Data Summary:")
weather_df.describe()

In [None]:
# Convert weather date and clean data
weather_df['Date'] = pd.to_datetime(weather_df['Date'])

# Columns that are coerced to numeric below (also reused by later cells)
numeric_columns = ['Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'PrecipTotal',
                   'StnPressure', 'SeaLevel', 'ResultSpeed', 'AvgSpeed']

TRACE_PRECIP = '0.005'  # small stand-in amount for 'T' (trace) precipitation entries

for col in numeric_columns:
    if col not in weather_df.columns:
        print(f"Warning: Column {col} not found in weather data")
        continue

    values = weather_df[col]
    # PrecipTotal may carry 'T' for trace amounts, with varying amounts of
    # surrounding whitespace. Strip before matching so every padded variant is
    # replaced, not only the exact strings 'T' and '  T'. Skipped when the
    # column is already numeric (e.g. when this cell is re-run).
    if col == 'PrecipTotal' and not pd.api.types.is_numeric_dtype(values):
        values = values.astype(str).str.strip().replace('T', TRACE_PRECIP)

    # Anything that still is not a number becomes NaN
    weather_df[col] = pd.to_numeric(values, errors='coerce')

# Check missing values in weather data
print("\nMissing values in weather data:")
missing_weather = weather_df.isnull().sum()
print(missing_weather[missing_weather > 0])

In [None]:
# Weather trends: temperature, precipitation, dew point and wind speed over time.
# Each series is averaged per (Date, Station) first and then across stations per Date.
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
temp_ax, precip_ax = axes[0]
dew_ax, wind_ax = axes[1]

# Temperature trends
temp_cols = ['Tmax', 'Tmin', 'Tavg']
weather_stations = weather_df.groupby(['Date', 'Station'])[temp_cols].mean().reset_index()
weather_avg = weather_stations.groupby('Date')[temp_cols].mean().reset_index()

for column, legend_text in (('Tmax', 'Max Temp'), ('Tavg', 'Avg Temp'), ('Tmin', 'Min Temp')):
    temp_ax.plot(weather_avg['Date'], weather_avg[column], label=legend_text, alpha=0.8)
temp_ax.set(title='Temperature Trends Over Time', xlabel='Date', ylabel='Temperature (°F)')
temp_ax.legend()
temp_ax.tick_params(axis='x', rotation=45)

# Precipitation - drawn only when the column exists and holds at least one value
has_precip = 'PrecipTotal' in weather_df.columns and weather_df['PrecipTotal'].notna().any()
if not has_precip:
    precip_ax.text(0.5, 0.5, 'PrecipTotal data not available', ha='center', va='center',
                   transform=precip_ax.transAxes)
    precip_ax.set_title('Precipitation Over Time - Data Not Available')
else:
    precip_stations = weather_df.groupby(['Date', 'Station'])['PrecipTotal'].mean().reset_index()
    precip_avg = precip_stations.groupby('Date')['PrecipTotal'].mean().reset_index()
    precip_ax.plot(precip_avg['Date'], precip_avg['PrecipTotal'])
    precip_ax.set(title='Precipitation Over Time', xlabel='Date',
                  ylabel='Precipitation (inches)')
    precip_ax.tick_params(axis='x', rotation=45)

# DewPoint
dewpoint_stations = weather_df.groupby(['Date', 'Station'])['DewPoint'].mean().reset_index()
dewpoint_avg = dewpoint_stations.groupby('Date')['DewPoint'].mean().reset_index()
dew_ax.plot(dewpoint_avg['Date'], dewpoint_avg['DewPoint'])
dew_ax.set(title='Dew Point Over Time', xlabel='Date', ylabel='Dew Point (°F)')
dew_ax.tick_params(axis='x', rotation=45)

# Wind Speed
wind_stations = weather_df.groupby(['Date', 'Station'])['AvgSpeed'].mean().reset_index()
wind_avg = wind_stations.groupby('Date')['AvgSpeed'].mean().reset_index()
wind_ax.plot(wind_avg['Date'], wind_avg['AvgSpeed'])
wind_ax.set(title='Average Wind Speed Over Time', xlabel='Date', ylabel='Wind Speed (mph)')
wind_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 9. Spray Data Analysis

In [None]:
# Spray data overview
print("Spray Data Info:")
# DataFrame.info() prints its own report and returns None, so it is not wrapped in print()
spray_df.info()
print("\nSpray Data Summary:")
# Only the last expression of a cell is displayed automatically; this one sits
# mid-cell, so it has to be printed explicitly or its result is silently dropped.
print(spray_df.describe())

spray_df['Date'] = pd.to_datetime(spray_df['Date'])
print(f"\nSpray date range: {spray_df['Date'].min()} to {spray_df['Date'].max()}")
print(f"Total spray locations: {len(spray_df)}")
print(f"Unique spray dates: {spray_df['Date'].nunique()}")

In [None]:
# Spray records: where they were logged (left) and how many per date (right)
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
map_ax, timeline_ax = axes

# Left: one point per spray record
map_ax.scatter(spray_df['Longitude'], spray_df['Latitude'], alpha=0.6, s=1, color='green')
map_ax.set(title='Spray Locations', xlabel='Longitude', ylabel='Latitude')

# Right: number of spray records on each date
spray_daily = spray_df.groupby('Date').size().reset_index(name='Count')
timeline_ax.plot(spray_daily['Date'], spray_daily['Count'], marker='o')
timeline_ax.set(title='Daily Spray Activity', xlabel='Date',
                ylabel='Number of Spray Locations')
timeline_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 10. Correlation Analysis

In [None]:
# Create a merged dataset for correlation analysis: training rows joined with
# the per-date mean of the weather measurements.
# Only average weather columns that actually exist; indexing the groupby with
# a missing name would raise KeyError, although the cleaning step tolerates
# absent columns.
weather_cols_present = [col for col in numeric_columns if col in weather_df.columns]
weather_avg_daily = weather_df.groupby('Date')[weather_cols_present].mean().reset_index()
train_with_weather = train_df.merge(weather_avg_daily, on='Date', how='left')
# The right side has one row per Date, so a left merge must not add or drop rows
assert len(train_with_weather) == len(train_df), "weather merge changed the number of training rows"

# Select numeric columns for correlation
numeric_train_cols = ['Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent', 'Year', 'Month', 'DayOfYear', 'Week']
correlation_cols = numeric_train_cols + weather_cols_present

# Calculate correlation matrix
correlation_data = train_with_weather[correlation_cols].corr()

# Plot correlation heatmap; the mask hides the diagonal and upper triangle,
# which duplicate the lower triangle
plt.figure(figsize=(15, 12))
mask = np.triu(np.ones_like(correlation_data, dtype=bool))
sns.heatmap(correlation_data, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
plt.title('Correlation Matrix of Numeric Variables')
plt.tight_layout()
plt.show()

# Show correlations with WnvPresent, ranked by magnitude (sign is discarded)
wnv_correlations = correlation_data['WnvPresent'].abs().sort_values(ascending=False)
print("\nCorrelations with WnvPresent (absolute values):")
print(wnv_correlations[wnv_correlations.index != 'WnvPresent'])

## 11. Key Insights Summary

In [None]:
# Text recap of the figures computed in the earlier sections. Relies on
# variables those sections define, so the notebook must be run top to bottom.
print("=" * 60)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print("=" * 60)

print("\n1. CLASS IMBALANCE:")
print(f"   - Only {positive_samples/total_samples*100:.2f}% of samples are WNV positive")
print(f"   - Imbalance ratio: {imbalance_ratio:.1f}:1 (negative:positive)")

print("\n2. SPECIES ANALYSIS:")
species_wnv_top = species_wnv.head(3)
print(f"   - {len(species_counts)} different mosquito species")
print(f"   - Top 3 species by WNV rate:")
for _, row in species_wnv_top.iterrows():
    print(f"     * {row['Species']}: {row['WNV_Rate']:.3f} ({row['WNV_Positive']}/{row['Total_Samples']} samples)")

print("\n3. TEMPORAL PATTERNS:")
peak_month = monthly_wnv.loc[monthly_wnv['WNV_Rate'].idxmax()]
peak_week = weekly_wnv.loc[weekly_wnv['WNV_Rate'].idxmax()]
# Selecting a single row mixes integer and float columns into one Series, so
# the month/week come back as floats (e.g. 8.0); cast to int for display.
print(f"   - Peak month: {int(peak_month['Month'])} (WNV rate: {peak_month['WNV_Rate']:.3f})")
print(f"   - Peak week: {int(peak_week['Week'])} (WNV rate: {peak_week['WNV_Rate']:.3f})")
print(f"   - Data spans {train_df['Year'].min()} to {train_df['Year'].max()}")

print("\n4. GEOGRAPHIC DISTRIBUTION:")
print(f"   - {len(trap_wnv)} unique trap locations with sufficient data")
print(f"   - {len(trap_wnv[trap_wnv['WNV_Positive'] > 0])} traps have recorded WNV positive samples")
print(f"   - Highest WNV rate at a single trap: {trap_wnv['WNV_Rate'].max():.3f}")

print("\n5. MOSQUITO COUNT PATTERNS:")
neg_mean_mosquitos = wnv_neg_mosquitos.mean()
pos_mean_mosquitos = wnv_pos_mosquitos.mean()
# Derive the comparison from the data instead of asserting it unconditionally
if pos_mean_mosquitos > neg_mean_mosquitos:
    comparison = "higher"
elif pos_mean_mosquitos < neg_mean_mosquitos:
    comparison = "lower"
else:
    comparison = "similar"
print(f"   - Average mosquitos per sample: {train_df['NumMosquitos'].mean():.2f}")
print(f"   - WNV positive samples have {comparison} mosquito counts on average")
print(f"   - WNV Negative: {neg_mean_mosquitos:.2f} mosquitos")
print(f"   - WNV Positive: {pos_mean_mosquitos:.2f} mosquitos")

print("\n6. STRONGEST CORRELATIONS WITH WNV:")
# Drop the target's correlation with itself, then keep the five strongest
top_correlations = wnv_correlations.drop('WnvPresent', errors='ignore').head(5)
for var, corr in top_correlations.items():
    print(f"   - {var}: {corr:.3f}")

print("\n" + "=" * 60)