In [1]:
import pandas as pd
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.sankey import Sankey
from statsmodels.graphics.gofplots import qqplot
from sklearn.preprocessing import StandardScaler
import scipy
import numpy as np
import random

In [2]:
df = pd.read_csv('../../../examples/EFIplus_medit.zip',compression='zip', sep=";")

In [3]:
# clean up the dataset to remove unnecessary columns (eg. REG) 
df.drop(df.iloc[:,5:15], axis=1, inplace=True)

# let's rename some columns so that they make sense
df.rename(columns={'Sum of Run1_number_all':'Total_fish_individuals'}, inplace=True) # inplace="True" means that df will be updated

# for sake of consistency, let's also make all column labels of type string
df.columns = list(map(str, df.columns))

df.head()

Unnamed: 0,Site_code,Latitude,Longitude,Country,Catchment_name,Water_source_type,Flow_regime,Altitude,Geological_typology,Actual_river_slope,...,Squalius malacitanus,Squalius pyrenaicus,Squalius torgalensis,Thymallus thymallus,Tinca tinca,Zingel asper,Squalius sp,Barbatula sp,Phoxinus sp,Iberochondrostoma_sp
0,ES_01_0002,38.102003,-4.09607,Spain,Guadalquivir,Pluvial,Permanent,199,Siliceous,0.001,...,0,0,0,0,0,0,0,0,0,0
1,ES_02_0001,40.530188,-1.887796,Spain,Tejo,Groundwater,Permanent,1253,Calcareous,13.406,...,0,0,0,0,0,0,0,0,0,0
2,ES_02_0002,40.595432,-1.928079,Spain,Tejo,Groundwater,Permanent,1155,Calcareous,9.398,...,0,0,0,0,0,0,0,0,0,0
3,ES_02_0003,40.656184,-1.989831,Spain,Tejo,Groundwater,Permanent,1074,Calcareous,8.186,...,0,0,0,0,0,0,0,0,0,0
4,ES_02_0004,40.676402,-2.036274,Spain,Tejo,Groundwater,Permanent,1018,Calcareous,11.736,...,0,0,0,0,0,0,0,0,0,0


### 1. Using the EFIplus_medit.zip dataset Test if the frequency of sites with presence and absence of Salmo trutta fario (Brown Trout) are independent from the country. Please state which is/are the null hypothesis of your test(s)

In [4]:
# H0: The presence of Brown Trout is independent from Country

cont_table = pd.crosstab(df['Salmo trutta fario'], df['Country'])

stat, p, degrees_freedom, expected_freq = scipy.stats.chi2_contingency(cont_table)
print('Statistics=%.3f, p=%.6f' % (stat, p))
alpha=0.05
if p > alpha:
 print('fail to reject H0. Rejecting H0 has an error probability <0.05')
else:
 print('reject H0 with an error probability <0.05)')

Statistics=496.372, p=0.000000
reject H0 with an error probability <0.05)


### 2. Run the non-parametric equivalent of the test you used in exercise 5.3 and compare with the ANOVA test (5.2: Test whether there are diferences in the mean elevation in the upstream catchment (Elevation_mean_catch) among the eight most sampled catchments. For which pairs of catchments are these diferences significant? Please state which is/are the null hypothesis of your test(s)).

In [5]:
catch_count = df['Catchment_name'].value_counts()
top_catches = catch_count.index[:8]
df_topcatch = df[df['Catchment_name'].isin(top_catches)]

df_topcatch_nonull = df_topcatch.dropna(subset=['Elevation_mean_catch'])

catch_groups = [(name, group['Elevation_mean_catch']) for name, group in df_topcatch_nonull.groupby('Catchment_name')]
catch_elevation_only = [group for _, group in catch_groups]

# Kruskal-Wallis Test
stat, p_value = scipy.stats.kruskal(*catch_elevation_only)

print("Kruskal-Wallis Test: p_value = ", p_value)
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis, means are significantly different")
else:
    print("Fail to reject null hypothesis, no significant difference in means")

# 5.3 One-way ANOVA Test
# H0: The means of the elevations in each catchment are equal
stat, p_value = scipy.stats.f_oneway(*catch_elevation_only)

print("One-way ANOVA Test: p_value = ", p_value)
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis, means are significantly different")
else:
    print("Fail to reject null hypothesis, no significant difference in means")

Kruskal-Wallis Test: p_value =  3.7056116510329714e-284
Reject null hypothesis, means are significantly different
One-way ANOVA Test: p_value =  1.369526482034513e-285
Reject null hypothesis, means are significantly different


### 3. Using the winequality_red.csv file in the examples folder of the github repository, test which wine parameters discriminate the best between wine quality scores categorized into two classes using value 5 as the threshold value (quality>5=“good” and quality<5=“bad”).

In [6]:
df_wine = pd.read_csv('../../../examples/winequality_red.csv', sep=",")

In [7]:
df_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
df_wine["quality_category"] = df_wine["quality"].apply(lambda q: "good" if q > 5 else "bad")

# Separate features and target variable
features = df_wine.drop("quality_category", axis=1).columns
results = {}

# Mann-Whitney U Test
# H0 - Distributions are identical between "good" and "bad" wines
for feature in features:
    good_quality = df_wine[df_wine["quality_category"] == "good"][feature]
    bad_quality = df_wine[df_wine["quality_category"] == "bad"][feature]
    statistic, p_value = scipy.stats.mannwhitneyu(good_quality, bad_quality)
    results[feature] = (statistic, p_value)

# Sort results by p-value (ascending)
sorted_results = dict(sorted(results.items(), key=lambda x: x[1][1]))

# Print results
print("Features ranked by p-value (Mann-Whitney U test):")
for name, (statistic, pval) in sorted_results.items():
    alpha = 0.05
    print(f"{name}: Statistic={statistic:.2f}, p-value={pval:.4f}")
    if pval < alpha:
        print(f"Reject null hypothesis, {name} has different distributions between categories")
    else:
        print(f"Fail to reject null hypothesis, {name} has similar distributions between categories ")
    print()


Features ranked by p-value (Mann-Whitney U test):
quality: Statistic=636120.00, p-value=0.0000
Reject null hypothesis, quality has different distributions between categories

alcohol: Statistic=481313.00, p-value=0.0000
Reject null hypothesis, alcohol has different distributions between categories

sulphates: Statistic=440968.00, p-value=0.0000
Reject null hypothesis, sulphates has different distributions between categories

volatile acidity: Statistic=197208.00, p-value=0.0000
Reject null hypothesis, volatile acidity has different distributions between categories

total sulfur dioxide: Statistic=245006.00, p-value=0.0000
Reject null hypothesis, total sulfur dioxide has different distributions between categories

chlorides: Statistic=254091.00, p-value=0.0000
Reject null hypothesis, chlorides has different distributions between categories

density: Statistic=257552.00, p-value=0.0000
Reject null hypothesis, density has different distributions between categories

citric acid: Statistic=