# ENEN90032 - Environmental Analysis Tools
## Assignment #1 - Individual Assignment

---
This notebook contains the solutions to Questions 1–6 of this assignment.


# ENEN90032 — Assignment #1 (Part 1)
**Instructions:** All the data files are in the `data/` folder. Make sure you run the notebook top-to-bottom. You may also run cell by cell.

**Notes:** Each question restricts libraries; I have used only the allowed libraries per question in the corresponding cells.


In [5]:
# Configuration shared by all questions: input file paths and general parameters.
# Q1: daily rainfall CSVs (wet-day analysis). The file name itself does not matter;
# the loader locates the date and rainfall columns by their header text.
PERTH_RAIN_CSV = "data/perth_IDCJAC0009_091021_2024_Data.csv"
BRISBANE_RAIN_CSV = "data/brisbane_IDCJAC0009_040224_2024_Data.csv"
MELBOURNE_RAIN_CSV = "data/melbourne_IDCJAC0009_086304_2024_Data.csv"

# Q2: Newcomb light data (per the original note: time in seconds for 7442 m)
NEWCOMB_TXT = "data/NewcombLight.txt"

# Q3: daily maximum temperature (Tmax) for Avalon and Moorabbin, 2024
AVALON_TMAX_CSV = "data/AVALON_087113_2024.csv"
MOORABBIN_TMAX_CSV = "data/MOORABBIN_086077_2024.csv"

# Q4: O-ring data (xls or csv)
ORING_DATA = "data/O_Ring_Data.xls"

# Q5: cloud-seeding case study (xls or csv)
CLOUDSEED_DATA = "data/Cloud_Seeding_Case_Study.xls"

# Q6: Q-TKN data (csv)
QTKN_DATA = "data/Q_TKN_data.csv"

# General parameters
YEAR = 2024  # calendar year kept by the rainfall loader (default `year` argument)
DETECTION_LIMIT_MM = 0.25  # wet-day threshold in mm: rainfall >= this value is kept
RANGE_RANDOM_SEED = 2025  # presumably a random-number seed; its use is not visible here (TODO confirm)


# ENEN90032 — Question #1 - Exploratory Data Analysis - Meteorological Datasets

**Allowed (and used) libraries:** numpy, pandas, matplotlib, scipy.stats, math, statsmodels

In [6]:
# imports
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# warnings.filterwarnings('ignore')

from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.distributions.empirical_distribution import ECDF

# customize the default appearance of plots generated by Matplotlib
plt.rcParams['figure.dpi'] = 140
plt.rcParams['figure.figsize'] = (7,4.2)

In [None]:
# constants
# NOTE: these self-assignments do not change either value. They only succeed
# when DETECTION_LIMIT_MM and YEAR are already defined (by the configuration
# cell); otherwise they raise NameError.
DETECTION_LIMIT_MM = DETECTION_LIMIT_MM
YEAR = YEAR

def _find_date_col(df):
    candidates = [c for c in df.columns if 'date' in c.lower()]
    if candidates:
        return candidates[0]
    return df.columns[0]

def _find_rain_col(df):
    for c in df.columns:
        if 'rain' in c.lower():
            return c
    raise ValueError("No rain column found. Make sure filename has a column with 'rain' in header.")

def load_rain_csv(path, detection_limit=DETECTION_LIMIT_MM, year=YEAR):
    """Load a daily rainfall CSV and keep only the wet days of one year.

    The date normally comes from the column picked by `_find_date_col`.
    If no header contains 'date' but the file has separate 'Year', 'Month'
    and 'Day' columns (matched case-insensitively), the date is assembled
    from those three columns. Without this, the first column would be parsed
    as a date, and if it is not one, every row becomes NaT and is dropped.

    Args:
        path: CSV file, passed to `pandas.read_csv`.
        detection_limit: rows with rainfall below this value are dropped.
        year: only rows whose parsed date falls in this year are kept.

    Returns:
        (df, missing): `df` is sorted by date with a fresh index and has the
        date and rainfall available as 'date' and 'rain_mm' (other columns
        are kept). `missing` is the number of rows in `year` whose rainfall
        was NaN after numeric coercion.

    Raises:
        ValueError: from `_find_rain_col` when no header contains 'rain'.
    """
    df = pd.read_csv(path)
    rain_col = _find_rain_col(df)

    # Map normalised header text -> original column label.
    headers = {str(col).strip().lower(): col for col in df.columns}
    has_date_header = any('date' in key for key in headers)
    ymd_cols = [headers.get(part) for part in ('year', 'month', 'day')]

    if not has_date_header and all(col is not None for col in ymd_cols):
        # NOTE(review): the IDCJAC0009 file names suggest BOM exports, which
        # appear to use this Year/Month/Day layout - confirm against the data.
        parts = df[ymd_cols].copy()
        parts.columns = ['year', 'month', 'day']  # names pd.to_datetime expects
        df = df.rename(columns={rain_col: 'rain_mm'})
        df['date'] = pd.to_datetime(parts, errors='coerce')
    else:
        date_col = _find_date_col(df)
        df = df.rename(columns={date_col: 'date', rain_col: 'rain_mm'})
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Unparseable dates became NaT above; drop them before filtering by year.
    df = df.dropna(subset=['date'])
    df = df[df['date'].dt.year == year].copy()
    df['rain_mm'] = pd.to_numeric(df['rain_mm'], errors='coerce')
    missing = df['rain_mm'].isna().sum()
    # Drop missing and apply wet-day filter
    df = df.dropna(subset=['rain_mm'])
    df = df[df['rain_mm'] >= detection_limit].sort_values('date').reset_index(drop=True)
    return df, missing

def trimean(x):
    """Return the trimean of `x`: (Q1 + 2*median + Q3) / 4."""
    quartiles = np.percentile(x, [25, 50, 75])
    lower, middle, upper = quartiles
    return (lower + 2 * middle + upper) / 4.0

def iqr(x):
    """Return the interquartile range of `x` (75th minus 25th percentile)."""
    upper, lower = np.percentile(x, [75, 25])
    return upper - lower

def mad(x):
    """Return the median absolute deviation of `x` about its median."""
    centre = np.median(x)
    abs_deviations = np.abs(x - centre)
    return np.median(abs_deviations)

def yule_kendall(x):
    """Return the Yule-Kendall index of `x`: (Q3 + Q1 - 2*median) / (Q3 - Q1).

    Returns NaN when the interquartile range is zero.
    """
    lower, middle, upper = np.percentile(x, [25, 50, 75])
    spread = upper - lower
    # A zero spread would make the ratio undefined.
    return np.nan if spread == 0 else (upper + lower - 2 * middle) / spread

def summarize(x):
    """Return a dict of location, spread and symmetry statistics for `x`.

    Keys: n_wetdays, mean, median, trimean, std (sample, ddof=1), IQR, MAD,
    skewness (bias-corrected) and yule_kendall.

    Small samples are handled explicitly:
    - empty input returns n_wetdays=0 and NaN for every statistic, rather
      than passing an empty array to the NumPy reductions;
    - std is NaN unless there is more than one value;
    - skewness is NaN unless there are more than three values.

    Args:
        x: array-like of numeric values (converted to a float array).

    Returns:
        dict mapping statistic name to a Python float (n_wetdays is the
        element count).
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # Nothing to summarise (e.g. a station/year with no wet days).
        return dict(
            n_wetdays = x.size,
            mean = np.nan,
            median = np.nan,
            trimean = np.nan,
            std = np.nan,
            IQR = np.nan,
            MAD = np.nan,
            skewness = np.nan,
            yule_kendall = np.nan
        )
    return dict(
        n_wetdays = x.size,
        mean = float(np.mean(x)),
        median = float(np.median(x)),
        trimean = float(trimean(x)),
        std = float(np.std(x, ddof=1)) if x.size > 1 else np.nan,
        IQR = float(iqr(x)),
        MAD = float(mad(x)),
        skewness = float(stats.skew(x, bias=False)) if x.size > 3 else np.nan,
        yule_kendall = float(yule_kendall(x))
    )






# # --- Constants ---
# DETECTION_LIMIT_MM = DETECTION_LIMIT_MM
# YEAR = YEAR

# # --- Utility Functions ---
# def _find_date_col(df):
#     """Identifies the date column based on common keywords."""
#     for col in df.columns:
#         if 'date' in col.lower() or 'observation' in col.lower():
#             return col
#     raise ValueError("No suitable date column found in the dataset.")

# def _find_rain_col(df):
#     """Identifies the rainfall column based on common keywords."""
#     for col in df.columns:
#         if 'rain' in col.lower() and ('mm' in col.lower() or 'amount' in col.lower()):
#             return col
#     raise ValueError("No suitable rainfall column found in the dataset.")

# def load_rain_csv(path, detection_limit=DETECTION_LIMIT_MM, year=YEAR):
#     """
#     Loads and preprocesses rainfall data from a CSV file.
    
#     Args:
#         path (str): The path to the CSV file.
#         detection_limit (float): The minimum rainfall amount to consider a "wet day".
#         year (int): The year to filter the data for.

#     Returns:
#         tuple: A tuple containing the preprocessed DataFrame and the count of missing values.
#     """
#     try:
#         df = pd.read_csv(path, comment='#')
#         df.columns = df.columns.str.strip().str.replace(' ', '_')
        
#         date_col = _find_date_col(df)
#         rain_col = _find_rain_col(df)
        
#         df = df.rename(columns={date_col: 'date', rain_col: 'rain_mm'})
        
#         df['date'] = pd.to_datetime(df['date'], errors='coerce')
#         df = df.dropna(subset=['date'])
        
#         df['rain_mm'] = pd.to_numeric(df['rain_mm'], errors='coerce')
#         missing_values = df['rain_mm'].isna().sum()
        
#         # Filter for the specified year and wet days
#         df = df[(df['date'].dt.year == year) & (df['rain_mm'].notna())].copy()
#         wet_days = df[df['rain_mm'] >= detection_limit].reset_index(drop=True)
        
#         return wet_days, missing_values
#     except (FileNotFoundError, ValueError) as e:
#         print(f"Error processing {path}: {e}")
#         return pd.DataFrame(), 0

# def trimean(x):
#     """Calculates the trimean for a given array."""
#     q1, med, q3 = np.percentile(x, [25, 50, 75])
#     return (q1 + 2 * med + q3) / 4.0

# def iqr(x):
#     """Calculates the Interquartile Range (IQR)."""
#     q1, q3 = np.percentile(x, [25, 75])
#     return q3 - q1

# def mad(x):
#     """Calculates the Median Absolute Deviation (MAD)."""
#     med = np.median(x)
#     return np.median(np.abs(x - med))

# def yule_kendall(x):
#     """Calculates the Yule-Kendall index."""
#     q1, med, q3 = np.percentile(x, [25, 50, 75])
#     denom = (q3 - q1)
#     if denom == 0:
#         return np.nan
#     return (q3 + q1 - 2 * med) / denom

# def summarize(x):
#     """
#     Calculates and returns a dictionary of summary statistics.
    
#     Args:
#         x (pd.Series or np.ndarray): The data to summarize.
        
#     Returns:
#         dict: A dictionary of summary statistics.
#     """
#     x = np.asarray(x, dtype=float)
#     if x.size < 4:
#         # Avoid errors with small datasets for skewness
#         skewness_val = np.nan
#     else:
#         skewness_val = float(stats.skew(x, bias=False))

#     return {
#         'n_wetdays': x.size,
#         'mean': float(np.mean(x)) if x.size > 0 else np.nan,
#         'median': float(np.median(x)) if x.size > 0 else np.nan,
#         'trimean': float(trimean(x)) if x.size > 0 else np.nan,
#         'std': float(np.std(x, ddof=1)) if x.size > 1 else np.nan,
#         'IQR': float(iqr(x)) if x.size > 0 else np.nan,
#         'MAD': float(mad(x)) if x.size > 0 else np.nan,
#         'skewness': skewness_val,
#         'yule_kendall': float(yule_kendall(x)) if x.size > 0 else np.nan
#     }

# # --- Example Usage (outside the function) ---
# # Extract wet day rainfall data for each city using the correct columns and detection limit
# perth_data = perth.loc[
#     (perth["Rainfall amount (millimetres)"].notna()) &
#     (perth["Rainfall amount (millimetres)"] >= DETECTION_LIMIT_MM),
#     "Rainfall amount (millimetres)"
# ]
# brisbane_data = brisbane.loc[
#     (brisbane["Rainfall amount (millimetres)"].notna()) &
#     (brisbane["Rainfall amount (millimetres)"] >= DETECTION_LIMIT_MM),
#     "Rainfall amount (millimetres)"
# ]
# melbourne_data = melbourne.loc[
#     (melbourne["Rainfall amount (millimetres)"].notna()) &
#     (melbourne["Rainfall amount (millimetres)"] >= DETECTION_LIMIT_MM),
#     "Rainfall amount (millimetres)"
# ]

# summary_results = {
#     'Perth': summarize(perth_data),
#     'Brisbane': summarize(brisbane_data),
#     'Melbourne': summarize(melbourne_data)
# }

# print("--- Summary Results ---")
# for city, stats_dict in summary_results.items():
#     print(f"\nCity: {city}")
#     for stat, value in stats_dict.items():
#         print(f"  {stat}: {value:.3f}" if isinstance(value, float) and not math.isnan(value) else f"  {stat}: {value}")
# print("---------------------")

--- Summary Results ---

City: Perth
  n_wetdays: 78
  mean: 7.504
  median: 4.450
  trimean: 5.113
  std: 8.791
  IQR: 7.550
  MAD: 3.000
  skewness: 2.351
  yule_kendall: 0.351

City: Brisbane
  n_wetdays: 133
  mean: 11.341
  median: 4.400
  trimean: 5.800
  std: 20.373
  IQR: 10.800
  MAD: 3.600
  skewness: 5.526
  yule_kendall: 0.519

City: Melbourne
  n_wetdays: 69
  mean: 7.949
  median: 5.200
  trimean: 5.575
  std: 8.508
  IQR: 6.700
  MAD: 2.800
  skewness: 2.332
  yule_kendall: 0.224
---------------------


In [16]:
# Fit models: Gaussian, Gamma, Weibull (and Lognormal as 4th)
def fit_distributions(x):
    """Fit four candidate distributions to `x` with scipy's `fit`.

    Gaussian is fitted with free location and scale. Gamma, Weibull
    (`weibull_min`) and Lognormal are fitted with the location fixed at
    zero (`floc=0`).

    Returns:
        dict mapping 'Gaussian', 'Gamma', 'Weibull' and 'Lognormal' to
        {'dist': <scipy.stats distribution>, 'params': tuple}. For the three
        zero-location families the params are (shape, 0, scale).
    """
    sample = np.asarray(x, dtype=float)

    fitted = {
        'Gaussian': {'dist': stats.norm, 'params': tuple(stats.norm.fit(sample))},
    }

    # Families fitted with the location pinned at zero; order is preserved.
    zero_loc_families = (
        ('Gamma', stats.gamma),
        ('Weibull', stats.weibull_min),
        ('Lognormal', stats.lognorm),
    )
    for label, family in zero_loc_families:
        shape, _, scale = family.fit(sample, floc=0)
        fitted[label] = {'dist': family, 'params': (shape, 0, scale)}
    return fitted

def log_likelihood(x, model):
    """Return the summed log-density of `x` under a fitted model.

    Args:
        x: observations passed to the distribution's `pdf`.
        model: dict with 'dist' (object exposing `pdf`) and 'params'
            (positional arguments passed to `pdf` after `x`).

    Returns:
        float: sum of log(pdf), with each density floored at 1e-300 so that
        zero densities contribute a finite value instead of -inf.
    """
    density = model['dist'].pdf(x, *model['params'])
    floored = np.clip(density, 1e-300, None)
    return float(np.log(floored).sum())
