# Weather anomalies data pipeline
The following data pipeline extracts weather data from a simulated weather API, detects anomalies in that data, and saves the anomalies it finds to a SQLite database.

In [1]:
import random
import pandas as pd
from datetime import datetime, timedelta
from sklearn.neighbors import LocalOutlierFactor
import sqlite3

In [2]:
def simulate_weather_api(start_date, end_date):
    """Build a DataFrame of random daily weather readings.

    Args:
        start_date: First day of the range, as a 'YYYY-MM-DD' string.
        end_date: Last day of the range (inclusive), as a 'YYYY-MM-DD' string.

    Returns:
        A DataFrame with one row per day and the columns 'Date',
        'Temperature (°F)', 'Humidity (%)' and 'Wind Speed (mph)'.

    Raises:
        ValueError: If either date string does not match 'YYYY-MM-DD'.
    """
    date_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    days = pd.date_range(first_day, last_day)

    # One (temperature, humidity, wind speed) triple per day. The three draws
    # are made day by day, in this order, so a seeded generator yields the
    # same sequence of readings every time.
    readings = [
        (
            round(random.uniform(14, 95), 1),    # temperature, °F
            random.randint(20, 100),             # relative humidity, %
            round(random.uniform(0, 12.4), 1),   # wind speed, mph
        )
        for _ in days
    ]

    return pd.DataFrame({
        'Date': days,
        'Temperature (°F)': [reading[0] for reading in readings],
        'Humidity (%)': [reading[1] for reading in readings],
        'Wind Speed (mph)': [reading[2] for reading in readings],
    })

In [3]:
def convert_units(df):
    """Add metric temperature and wind-speed columns to a weather DataFrame.

    Reads 'Temperature (°F)' and 'Wind Speed (mph)' and writes
    'Temperature (°C)' and 'Wind Speed (km/h)'. The columns are added to
    ``df`` itself, and that same object is returned.
    """
    # °C = (°F - 32) * 5 / 9
    fahrenheit = df['Temperature (°F)']
    df['Temperature (°C)'] = fahrenheit.sub(32).mul(5.0).div(9.0)

    # 1 mile = 1.60934 km
    miles_per_hour = df['Wind Speed (mph)']
    df['Wind Speed (km/h)'] = miles_per_hour.mul(1.60934)

    return df

In [4]:
def find_anomalies(df, n_neighbors=20, contamination=0.1):
    """Return the rows of ``df`` flagged as outliers by Local Outlier Factor.

    The detector is fitted on the 'Temperature (°C)', 'Humidity (%)' and
    'Wind Speed (km/h)' columns. Its labels are stored in a new 'Anomaly'
    column on ``df`` itself (-1 for outliers, 1 for inliers), so the caller's
    frame is modified as a side effect.

    Args:
        df: Weather DataFrame containing the three feature columns.
        n_neighbors: Neighbourhood size passed to LocalOutlierFactor.
        contamination: Expected share of outliers passed to
            LocalOutlierFactor.

    Returns:
        The subset of ``df`` whose 'Anomaly' label is -1. The 'Anomaly'
        column is always present in the result, even when no row is flagged.

    Raises:
        KeyError: If any of the three feature columns is missing.
    """
    features = df[['Temperature (°C)', 'Humidity (%)', 'Wind Speed (km/h)']]

    if len(df) < 2:
        # LOF scores each point against its neighbours, so it cannot be
        # fitted on zero or one observation. Label everything as an inlier
        # instead of letting the estimator raise.
        df['Anomaly'] = 1
    else:
        lof = LocalOutlierFactor(n_neighbors=n_neighbors,
                                 contamination=contamination)
        df['Anomaly'] = lof.fit_predict(features)

    # Keep only the rows the detector labelled as outliers.
    anomalies = df[df['Anomaly'] == -1]

    return anomalies

In [5]:
def select_columns(df):
    """Return a copy of ``df`` without the label and imperial-unit columns.

    Drops 'Anomaly', 'Temperature (°F)' and 'Wind Speed (mph)'; every other
    column is kept. The input frame is left unchanged.
    """
    unwanted = ['Anomaly', 'Temperature (°F)', 'Wind Speed (mph)']
    return df.drop(labels=unwanted, axis=1)

In [6]:
def load_to_sqlite(df, db_name='weather_data.db'):
    """Append ``df`` to the 'weather_anomalies' table of a SQLite database.

    Rows are appended to any rows already in the table, so calling this
    repeatedly with the same data stores it repeatedly. The DataFrame index
    is not written.

    Args:
        df: DataFrame to store.
        db_name: Path of the SQLite database file to connect to.
    """
    conn = sqlite3.connect(db_name)
    try:
        df.to_sql('weather_anomalies', conn, if_exists='append', index=False)
    finally:
        # Release the connection even when the write fails.
        conn.close()

In [7]:
# Example usage: run the whole pipeline over a ten-year date range.
start_date, end_date = '2014-09-01', '2024-09-01'

# Extract the simulated readings and add the metric columns.
weather_data = convert_units(simulate_weather_api(start_date, end_date))

# Keep only the anomalous rows, trimmed to the columns that get stored.
anomalies = select_columns(find_anomalies(weather_data))

# Persist the result to the default database file.
load_to_sqlite(anomalies)

In [8]:
# Read the stored anomalies back from the SQLite database into a DataFrame.
conn = sqlite3.connect('weather_data.db')
try:
    df = pd.read_sql('SELECT * FROM weather_anomalies', conn)
finally:
    # Release the connection even when the query fails (e.g. missing table).
    conn.close()

# Show the first rows as the cell output.
df.head()

Unnamed: 0,Date,Humidity (%),Temperature (°C),Wind Speed (km/h)
0,2014-09-19 00:00:00,20,-8.944444,17.863674
1,2014-09-27 00:00:00,81,-9.555556,0.160934
2,2014-09-28 00:00:00,50,19.0,18.829278
3,2014-10-11 00:00:00,37,5.722222,19.955816
4,2014-11-01 00:00:00,72,16.611111,19.794882
