# ENEN90032 - Environmental Analysis Tools
## Major Assignment #1 - Individual Assignment
Prepared by: Michael Kimani Gathogo (CS-CNS03-23111)

---
This notebook contains all solutions (Q1–Q6) for the assignment.


## Question 1: Exploratory Data Analysis - Meteorological Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import math
import statsmodels.api as sm

# Load the daily rainfall CSV for each city (replace with actual file paths).
# Paths are relative to the notebook's working directory. Each frame must
# provide a 'Rainfall amount (millimetres)' column, which is the only column
# preprocess_rainfall reads.
# NOTE(review): the file names look like per-station yearly exports — confirm
# the station/year encoded in each name matches the assignment brief.
perth = pd.read_csv('data/perth_IDCJAC0009_091021_2024_Data.csv')
brisbane = pd.read_csv('data/brisbane_IDCJAC0009_040224_2024_Data.csv')
melbourne = pd.read_csv('data/melbourne_IDCJAC0009_086304_2024_Data.csv')

def preprocess_rainfall(df, threshold=0.25, col='Rainfall amount (millimetres)'):
    """Return the rainfall readings that are strictly greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the rainfall column named by ``col``.
    threshold : float, optional
        Readings at or below this amount are discarded. Defaults to 0.25,
        the cut-off previously hard-coded in this function.
    col : str, optional
        Name of the rainfall column. Defaults to the column name previously
        hard-coded in this function.

    Returns
    -------
    pandas.Series
        The values of ``col`` above ``threshold`` with missing readings
        removed; the original row index is kept.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    # Select rows and column in one .loc step instead of chained indexing.
    above_threshold = df[col] > threshold
    return df.loc[above_threshold, col].dropna()

# One filtered rainfall series per city, as returned by preprocess_rainfall.
datasets = dict(
    Perth=preprocess_rainfall(perth),
    Brisbane=preprocess_rainfall(brisbane),
    Melbourne=preprocess_rainfall(melbourne),
)

# Row labels of the summary table, in the order _summarise_rainfall returns them.
STAT_LABELS = ['Mean', 'Median', 'Trimean', 'Std Dev', 'IQR', 'MAD', 'Skewness', 'Yule-Kendall']


def _summarise_rainfall(rain):
    """Return location, spread and symmetry statistics for one rainfall series.

    The returned list is ordered as STAT_LABELS: mean, median, trimean,
    sample standard deviation (ddof=1), inter-quartile range, median absolute
    deviation, skewness and the Yule-Kendall index.
    """
    lower_q, upper_q = np.percentile(rain, [25, 75])
    centre = np.median(rain)
    return [
        np.mean(rain),
        centre,
        (lower_q + 2*centre + upper_q) / 4,                       # trimean
        np.std(rain, ddof=1),
        stats.iqr(rain),
        stats.median_abs_deviation(rain),
        stats.skew(rain),
        (upper_q + lower_q - 2*centre) / (upper_q - lower_q),     # Yule-Kendall
    ]


# Summary statistics
summary_stats = {city: _summarise_rainfall(rain) for city, rain in datasets.items()}

summary_df = pd.DataFrame(summary_stats, index=STAT_LABELS)
print(summary_df)


                 Perth   Brisbane  Melbourne
Mean          7.503846  11.341353   7.949275
Median        4.450000   4.400000   5.200000
Trimean       5.112500   5.800000   5.575000
Std Dev       8.790586  20.373137   8.507654
IQR           7.550000  10.800000   6.700000
MAD           3.000000   3.600000   2.800000
Skewness      2.305587   5.463579   2.280701
Yule-Kendall  0.350993   0.518519   0.223881


## Question 2: Hypothesis Test - Newcomb-Michelson Velocity of Light

In [2]:
# Load the Newcomb measurements and convert them to velocities.
data = np.loadtxt('data/NewcombLight.txt')
distance = 7442  # meters
# NOTE(review): distance / data is only in m/s (and so comparable with the
# reference value below) if `data` holds travel times in seconds — confirm
# the units/coding of the file before relying on these results.
velocities = distance / data
n_obs = len(velocities)

# t-test of the sample mean against the reference speed of light
reference_speed = 299792458
t_stat, p_val = stats.ttest_1samp(velocities, reference_speed)
print('T-test statistic:', t_stat, 'p-value:', p_val)

# 95% CI (t-test): Student-t interval centred on the sample mean
mean = np.mean(velocities)
se = stats.sem(velocities)
ci = stats.t.interval(0.95, n_obs - 1, loc=mean, scale=se)
print('95% CI (t-test):', ci)

# Bootstrap: percentile interval of the mean over 10000 resamples with replacement
np.random.seed(42)
boot_means = []
for _draw in range(10000):
    resample = np.random.choice(velocities, size=n_obs, replace=True)
    boot_means.append(np.mean(resample))
ci_boot = np.percentile(boot_means, [2.5, 97.5])
print('95% CI (bootstrap):', ci_boot)


FileNotFoundError: data/NewcombLight.txt not found.

## Question 3: Daily Max Temperature at Avalon and Moorabbin Airports

In [None]:
# Daily maximum temperature tables for the two airports; each must provide a
# 'temp' column.
avalon = pd.read_csv('data/avalon_temp.csv')
moorabbin = pd.read_csv('data/moorabbin_temp.csv')

# Drop missing days. Only a missing 'temp' disqualifies a row: a bare dropna()
# would also discard days whose temperature is valid but which have a blank
# in some other column, silently shrinking the samples.
avalon = avalon.dropna(subset=['temp'])
moorabbin = moorabbin.dropna(subset=['temp'])

# Two-sample t-test; equal_var=False selects Welch's test, which does not
# assume the two stations share a variance.
# NOTE(review): the samples are treated as independent. If both files cover
# the same dates, a paired comparison may be what the question intends — confirm.
t_stat, p_val = stats.ttest_ind(avalon['temp'], moorabbin['temp'], equal_var=False)
print('t-statistic:', t_stat, 'p-value:', p_val)


## Question 4: Hypothesis Test - Space Shuttle O-Ring Failures

In [None]:
# Incident counts split by launch condition.
oring = pd.read_excel('data/O_Ring_Data.xls')
cool = oring[oring['Condition']=='COOL']['Incidents']
warm = oring[oring['Condition']=='WARM']['Incidents']

# Observed difference in mean incidents (cool minus warm)
obs_diff = np.mean(cool) - np.mean(warm)

# Permutation test: reshuffle the pooled counts and re-split them into groups
# of the original sizes to build the distribution of the difference under the
# null hypothesis that the condition labels are exchangeable.
n_cool = len(cool)
combined = np.concatenate([cool, warm])
np.random.seed(42)
perm_diffs = []
for _ in range(10000):
    np.random.shuffle(combined)  # in-place shuffle of the pooled array
    new_cool = combined[:n_cool]
    new_warm = combined[n_cool:]
    perm_diffs.append(np.mean(new_cool) - np.mean(new_warm))

# Central 99% of the permutation (null) distribution. This is the acceptance
# region of a two-sided test at the 1% level, not a confidence interval for
# the difference, so it is labelled as a null range.
ci = np.percentile(perm_diffs, [0.5, 99.5])
# Two-sided p-value: share of permuted differences at least as extreme as the
# observed one. A value of 0 means none of the 10000 permutations reached it.
perm_p_val = np.mean(np.abs(perm_diffs) >= abs(obs_diff))
print('Observed diff:', obs_diff, 'Permutation null 99% range:', ci)
print('Two-sided permutation p-value:', perm_p_val)

plt.hist(perm_diffs, bins=30, alpha=0.7)
plt.axvline(obs_diff, color='red', linestyle='dashed', label='Observed diff')
plt.xlabel('Difference in mean incidents (cool - warm)')
plt.ylabel('Number of permutations')
plt.title('Permutation null distribution')
plt.legend()
plt.show()


## Question 5: Cloud Seeding Experiment

In [None]:
# Rainfall split by whether the cloud was seeded (Seeded == 1) or not (== 0).
cloud = pd.read_excel('data/Cloud_Seeding_Case_Study.xls')
seeded = cloud[cloud['Seeded']==1]['Rainfall']
unseeded = cloud[cloud['Seeded']==0]['Rainfall']

# Parametric test: Welch two-sample t-test (equal_var=False)
t_stat, p_val = stats.ttest_ind(seeded, unseeded, equal_var=False)
print('t-statistic:', t_stat, 'p-value:', p_val)

# Permutation test: reshuffle the pooled rainfall and re-split it into groups
# of the original sizes to build the null distribution of the mean difference.
n_seeded = len(seeded)
combined = np.concatenate([seeded, unseeded])
obs_diff = np.mean(seeded) - np.mean(unseeded)
# Seed here as the other resampling cells do, so this cell gives the same
# result whether it is run alone or as part of Restart & Run All.
np.random.seed(42)
perm_diffs = []
for _ in range(10000):
    np.random.shuffle(combined)  # in-place shuffle of the pooled array
    new_seeded = combined[:n_seeded]
    new_unseeded = combined[n_seeded:]
    perm_diffs.append(np.mean(new_seeded) - np.mean(new_unseeded))

# Central 95% of the permutation (null) distribution. This is the acceptance
# region of a two-sided 5% test, not a confidence interval for the difference.
ci_perm = np.percentile(perm_diffs, [2.5, 97.5])
# Two-sided p-value, comparable with the two-sided Welch p-value above.
perm_p_val = np.mean(np.abs(perm_diffs) >= abs(obs_diff))
print('Observed diff:', obs_diff, 'Permutation null 95% range:', ci_perm)
print('Two-sided permutation p-value:', perm_p_val)

# Log transform: repeat the Welch test on natural-log rainfall.
# NOTE(review): np.log gives -inf for a zero rainfall value — confirm the
# data set contains strictly positive amounts.
seeded_log = np.log(seeded)
unseeded_log = np.log(unseeded)
t_stat_log, p_val_log = stats.ttest_ind(seeded_log, unseeded_log, equal_var=False)
print('Log-transformed t-test:', t_stat_log, 'p-value:', p_val_log)


## Question 6: Exploratory Data Analysis and Linear Regression

In [None]:
# Paired Q and TKN observations.
data_qtkn = pd.read_csv('data/Q_TKN_data.csv')
Q = data_qtkn['Q']
TKN = data_qtkn['TKN']

# Correlations on the raw scale (linear and rank-based)
pearson = stats.pearsonr(Q, TKN)
spearman = stats.spearmanr(Q, TKN)
print('Pearson:', pearson, 'Spearman:', spearman)

# Log-transform both variables and repeat the correlations.
# NOTE(review): np.log gives -inf for zero values — confirm Q and TKN are
# strictly positive in the data file.
logQ = np.log(Q)
logTKN = np.log(TKN)
pearson_log = stats.pearsonr(logQ, logTKN)
spearman_log = stats.spearmanr(logQ, logTKN)
print('Log Pearson:', pearson_log, 'Log Spearman:', spearman_log)

# Linear regression of log(TKN) on log(Q). statsmodels is already imported as
# `sm` in the notebook's first code cell, so it is not re-imported here.
X = sm.add_constant(logQ)  # prepends the intercept column
model = sm.OLS(logTKN, X).fit()
print(model.summary())

# Predict TKN when Q=2 mm/d. The exog is one explicit row laid out like X:
# [intercept term, log(Q)].
pred_log = model.predict([[1.0, np.log(2)]])
# Back-transform from the log scale.
# NOTE(review): plain exp() applies no retransformation bias correction —
# confirm that is what the question expects.
pred = np.exp(pred_log)
print('Predicted TKN at Q=2:', pred)

# Residuals against the predictor, to check for leftover structure
residuals = model.resid
plt.scatter(logQ, residuals)
plt.axhline(0, color='red')
plt.xlabel('log(Q)')
plt.ylabel('Residual of log(TKN)')
plt.title('Residuals vs logQ')
plt.show()
