# 🌡️ Weather Data Exploration - Varna, Bulgaria

This notebook explores the historical weather data for Varna and performs exploratory data analysis (EDA).

## Contents:
1. Data Loading
2. Statistical Analysis
3. Visualization
4. Seasonality Analysis
5. Correlation Analysis
6. Data Quality Assessment

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import sys
import os
from datetime import datetime, timedelta

# Add src to path
# `src` is looked up next to the current working directory's parent, i.e. the
# kernel is expected to run from a folder that is a sibling of `src`.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
if src_dir not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(src_dir)
import config
from data_loader import WeatherDataLoader

# Settings
plt.style.use('seaborn-v0_8-darkgrid')  # this style name requires matplotlib >= 3.6
sns.set_palette("husl")
pd.set_option('display.max_columns', None)  # never truncate columns when a frame is displayed

print("✓ Libraries loaded successfully")

## 1. Data Loading

Load historical weather data for Varna from the Open-Meteo API.

In [None]:
# Build the loader and obtain the weather frame used by every later cell.
loader = WeatherDataLoader()
df = loader.load_or_fetch_data()

# Summarise the frame: size, covered period and available columns.
first_ts = df['datetime'].min()
last_ts = df['datetime'].max()
span_days = (last_ts - first_ts).days

print(f"Dataset shape: {df.shape}")
print(f"\nDate range: {first_ts} to {last_ts}")
print(f"Duration: {span_days} days")
print(f"\nColumns: {list(df.columns)}")

# Preview the first ten rows (rendered as the cell's output).
df.head(10)

## 2. Statistical Analysis

In [None]:
# Summary statistics, followed by the number of missing values per column.
banner = "=" * 60
null_counts = df.isnull().sum()  # one count per column

print(banner)
print("DESCRIPTIVE STATISTICS")
print(banner)
print(df.describe())

print(f"\n{banner}")
print("MISSING VALUES")
print(banner)
print(null_counts)
print(f"\nTotal missing: {null_counts.sum()}")

## 3. Visualization

### 3.1 Temperature Trends Over Time

In [None]:
# Temperature over time
# The year span shown in the title is read from the data instead of being
# hard-coded, so the title stays correct whatever period the loader returned.
first_year = df['datetime'].min().year
last_year = df['datetime'].max().year
period = f"{first_year}" if first_year == last_year else f"{first_year}-{last_year}"

fig = go.Figure()

# Single line trace of the raw temperature readings.
fig.add_trace(go.Scatter(
    x=df['datetime'],
    y=df['temperature_2m'],
    mode='lines',
    name='Temperature',
    line=dict(color='#FF6B6B', width=1)
))

fig.update_layout(
    title=f'Temperature Trend - Varna ({period})',
    xaxis_title='Date',
    yaxis_title='Temperature (°C)',
    hovermode='x unified',
    height=500
)

fig.show()

### 3.2 Distribution of Weather Parameters

In [None]:
# Distribution plots: three histograms plus a per-month temperature box plot
# arranged on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per histogram panel:
# (column, grid position, bar colour, mean-line colour, x label, panel title, unit suffix)
histogram_specs = [
    ('temperature_2m', (0, 0), '#FF6B6B', 'red',
     'Temperature (°C)', 'Temperature Distribution', '°C'),
    ('relative_humidity_2m', (0, 1), '#4ECDC4', 'blue',
     'Relative Humidity (%)', 'Humidity Distribution', '%'),
    ('surface_pressure', (1, 0), '#45B7D1', 'green',
     'Surface Pressure (hPa)', 'Pressure Distribution', ' hPa'),
]

for column, position, bar_color, mean_color, x_label, panel_title, unit in histogram_specs:
    panel = axes[position]
    mean_value = df[column].mean()
    # Missing readings carry no information for a histogram, so they are dropped.
    panel.hist(df[column].dropna(), bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    # Dashed vertical line marking the column mean, labelled in the legend.
    panel.axvline(mean_value, color=mean_color, linestyle='--', linewidth=2,
                  label=f'Mean: {mean_value:.2f}{unit}')
    panel.legend()

# Box plot - Temperature by month
# NOTE: this adds a `month` column to `df`; later cells read it.
df['month'] = df['datetime'].dt.month
df.boxplot(column='temperature_2m', by='month', ax=axes[1, 1])
axes[1, 1].set_xlabel('Month')
axes[1, 1].set_ylabel('Temperature (°C)')
axes[1, 1].set_title('Temperature by Month')

# pandas' grouped boxplot replaces the figure-level title with its own default
# ("Boxplot grouped by ..."). Set the intended title only now, after that call,
# otherwise it is lost.
fig.suptitle('Distribution of Weather Parameters', fontsize=16)

plt.tight_layout()
plt.show()

## 4. Seasonality Analysis

In [None]:
# Monthly averages
# Calendar columns are derived here, including `month`, so this cell does not
# silently depend on a column created as a side effect of an earlier plotting cell.
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['month_name'] = df['datetime'].dt.strftime('%B')

# One entry per subplot row: (column, trace name, bar colour, subplot title, y-axis title)
panels = [
    ('temperature_2m', 'Temperature', '#FF6B6B',
     'Average Temperature by Month', 'Temperature (°C)'),
    ('relative_humidity_2m', 'Humidity', '#4ECDC4',
     'Average Humidity by Month', 'Humidity (%)'),
    ('surface_pressure', 'Pressure', '#45B7D1',
     'Average Pressure by Month', 'Pressure (hPa)'),
]

# Mean of each weather parameter per calendar month (all years pooled).
monthly_avg = (
    df.groupby('month')[[column for column, *_ in panels]]
    .mean()
    .reset_index()
)

fig = make_subplots(
    rows=len(panels), cols=1,
    subplot_titles=tuple(subplot_title for _, _, _, subplot_title, _ in panels)
)

for row, (column, trace_name, color, _, y_title) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(x=monthly_avg['month'], y=monthly_avg[column], name=trace_name, marker_color=color),
        row=row, col=1
    )
    fig.update_yaxes(title_text=y_title, row=row, col=1)

# Only the bottom panel carries the shared x-axis label.
fig.update_xaxes(title_text="Month", row=len(panels), col=1)

fig.update_layout(height=900, showlegend=False, title_text="Seasonal Patterns in Weather Data")
fig.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between the three weather parameters, shown as an
# annotated heatmap and then listed once per unordered pair.
weather_cols = ['temperature_2m', 'relative_humidity_2m', 'surface_pressure']
corr_matrix = df[weather_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the diverging colour map on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    fmt='.3f',
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix of Weather Parameters', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nCorrelation Insights:")
print("=" * 60)
# Walk the upper triangle so every pair is printed exactly once.
labels = list(corr_matrix.columns)
for position, left in enumerate(labels):
    for right in labels[position + 1:]:
        print(f"{left} vs {right}: {corr_matrix.loc[left, right]:.3f}")

## 6. Data Quality Assessment

In [None]:
# Data quality checks
banner = "=" * 60
print(banner)
print("DATA QUALITY REPORT")
print(banner)

# `month`, `year` and `month_name` were added to `df` by earlier cells purely
# for plotting. They are derived from `datetime`, so counting them would pad the
# missing-value listing and inflate the completeness figure. Assess only the
# remaining columns.
derived_cols = ['month', 'year', 'month_name']
quality_df = df.drop(columns=derived_cols, errors='ignore')
n_rows = len(quality_df)
n_cells = quality_df.size  # rows * columns


def _percent(part, whole):
    """Return `part` as a percentage of `whole`; NaN when `whole` is zero (empty data)."""
    return (part / whole) * 100 if whole else float('nan')


# Check for duplicates
duplicates = quality_df.duplicated().sum()
print(f"\n1. Duplicate rows: {duplicates}")

# Check for missing values
print(f"\n2. Missing values:")
missing_counts = quality_df.isnull().sum()
for col, missing in missing_counts.items():
    print(f"   {col}: {missing} ({_percent(missing, n_rows):.2f}%)")

# Check for outliers (using IQR method)
print(f"\n3. Outlier detection (IQR method):")
for col in ['temperature_2m', 'relative_humidity_2m', 'surface_pressure']:
    values = quality_df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # A reading is an outlier when it lies more than 1.5 * IQR outside the quartiles.
    n_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
    print(f"   {col}: {n_outliers} outliers ({_percent(n_outliers, n_rows):.2f}%)")

# Data completeness: share of non-missing cells across the assessed columns.
total_missing = missing_counts.sum()
completeness = (1 - total_missing / n_cells) * 100 if n_cells else float('nan')
print(f"\n4. Overall data completeness: {completeness:.2f}%")

print("\n" + banner)
print("✓ Data quality assessment complete!")
print(banner)

## Summary

### Key Findings:
1. **Data Coverage**: 5+ years of weather data for Varna
2. **Completeness**: High data quality with minimal missing values
3. **Seasonality**: Clear seasonal patterns in temperature
4. **Correlations**: Notable relationships between weather parameters
5. **Ready for Modeling**: Data is suitable for time series forecasting

### Next Steps:
- Preprocess data for ML models
- Create sequences for LSTM/GRU training
- Train forecasting models
- Evaluate model performance