In [31]:
#ex06
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import scipy.stats as sts
import statsmodels.stats as stm
import scikit_posthocs as sp
import seaborn as sns

#load dataset
df = pd.read_csv('EFIplus_medit.zip',compression='zip', sep=";")
# clean up the dataset to remove unnecessary columns (eg. REG) 
df.drop(df.iloc[:,5:15], axis=1, inplace=True)
# handle missing values:
df.dropna()

# let's rename some columns so that they make sense
df.rename(columns={'Sum of Run1_number_all':'Total_fish_individuals'}, inplace=True) # inplace="True" means that df will be updated

# for sake of consistency, let's also make all column labels of type string
df.columns = list(map(str, df.columns))

In [8]:
#6.1 
contingency_table = pd.crosstab(df['Country'], df['Salmo trutta fario'])
chi2, p, dof, expected = sts.chi2_contingency(contingency_table)

#H0:The frequency of sites with presence and absence of Salmo trutta fario is independent of the country.

print(f"Chi-squared Test statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print(f"Expected frequencies: \n{expected}")


Chi-squared Test statistic: 496.3723854072799
P-value: 2.9162328651936495e-107
Degrees of freedom: 3
Expected frequencies: 
[[  28.39193774   43.60806226]
 [  72.95150669  112.04849331]
 [ 341.88625025  525.11374975]
 [1532.77030533 2354.22969467]]


In [14]:
#6.2 Run the non-parametric equivalent of the test you used in exercise 5.3 
# non-paramentric good for data that DOES NOT DOES NOT FOLLOW NORMAL DISTRIBUTION or WEIRD DATA

#Null hypothesis: There are no significant differences in mean elevation among the 8 most sampled catchments

#replace fway w krastal
from scipy.stats import kruskal
from scipy.stats import f_oneway
import pandas as pd

# Filter the data to include only the eight most sampled catchments
counts = df['Catchment_name'].value_counts()
top_8_sites = counts.nlargest(8).index
top_8_sites_df = df[df['Catchment_name'].isin(top_8_sites)].dropna()

# Create a list of elevation data for each catchment
elevation = [top_8_sites_df[top_8_sites_df['Catchment_name'] == catchment]['Elevation_mean_catch'].values 
             for catchment in top_8_sites]

# Perform the Kruskal-Wallis H-test
stat, p = kruskal(*elevation)

print('Kruskal-Wallis H-test results:')
print('H-statistic=%.3f, p=%.6f' % (stat, p))

alpha = 0.05
if p > alpha:
    print('Fail to reject H0. There are no significant differences in mean elevation among the eight most sampled catchments.')
else:
    print('Reject H0. There are significant differences in mean elevation among the eight most sampled catchments.')

# ANOVA test = Two or more groups have the same population mean
stat, p = f_oneway(*elevation)
print('ANOVA results:')
print('F-statistic=%.3f, p=%.6f' % (stat, p))

alpha = 0.05
if p > alpha:
    print('Fail to reject H0. There are no significant differences in mean elevation among the eight most sampled catchments.')
else:
    print('Reject H0. There are significant differences in mean elevation among the eight most sampled catchments.')





Kruskal-Wallis H-test results:
H-statistic=868.726, p=0.000000
Reject H0. There are significant differences in mean elevation among the eight most sampled catchments.
ANOVA results:
F-statistic=166.490, p=0.000000
Reject H0. There are significant differences in mean elevation among the eight most sampled catchments.


In [47]:
#6.3 Using the winequality_red.csv file in the examples folder of the github repository, test which wine parameters discriminate the best between wine quality scores categorized into two classes using value 5 as the threshold value (quality>5=“good” and quality<5=“bad”).


import pandas as pd
from scipy.stats import chi2_contingency

# Load the CSV file into a DataFrame
df = pd.read_csv('winequality_red.csv')

# Define a binary target variable indicating good or bad quality
df['quality_class'] = df['quality'].apply(lambda x: 'good' if x > 5 else 'bad')


# Perform Chi-squared test for 'quality_class' and each individual continuous parameter
parameters = ["fixed acidity", "volatile acidity", "residual sugar", "chlorides", "free sulfur dioxide",
              "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]

for param in parameters:
    # Create a contingency table
    contingency_table = pd.crosstab(df['quality_class'], pd.cut(df[param], bins=3))
    
    # Perform Chi-squared test
    chi2, p, _, _ = chi2_contingency(contingency_table)
    print(f"Chi-squared test for '{param}':")
    print(f"Chi-squared statistic: {chi2}, p-value: {p}")

Chi-squared test for 'fixed acidity':
Chi-squared statistic: 11.184607460907158, p-value: 0.0037264332708660647
Chi-squared test for 'volatile acidity':
Chi-squared statistic: 109.09271177835417, p-value: 2.0455901764083764e-24
Chi-squared test for 'residual sugar':
Chi-squared statistic: 1.058317045209553, p-value: 0.5891004759044617
Chi-squared test for 'chlorides':
Chi-squared statistic: 3.195137673117301, p-value: 0.20238795857226552
Chi-squared test for 'free sulfur dioxide':
Chi-squared statistic: 6.973395802930409, p-value: 0.03060175552824865
Chi-squared test for 'total sulfur dioxide':
Chi-squared statistic: 84.70389190695666, p-value: 4.0437432636901795e-19
Chi-squared test for 'density':
Chi-squared statistic: 45.95504807655456, p-value: 1.0495136765272779e-10
Chi-squared test for 'pH':
Chi-squared statistic: 1.0972961103370333, p-value: 0.5777303421937462
Chi-squared test for 'sulphates':
Chi-squared statistic: 4.494846069660519, p-value: 0.10567118495821066
Chi-squared tes

In [50]:
from scipy.stats import ttest_ind

# Extract 'pH' values for 'good' and 'bad' quality wines
good_ph = df[df['quality'] > 5]['pH']
bad_ph = df[df['quality'] <= 5]['pH']

# Perform independent samples t-test
t_statistic, p_value = ttest_ind(good_ph, bad_ph)

# Display the results
print("Independent samples t-test for pH between 'good' and 'bad' quality wines:")
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")


from scipy.stats import ttest_ind

# Extract 'residual sugar' values for 'good' and 'bad' quality wines
good_res_sugar = df[df['quality'] > 5]['residual sugar']
bad_res_sugar = df[df['quality'] <= 5]['residual sugar']

# Perform independent samples t-test
t_statistic, p_value = ttest_ind(good_res_sugar, bad_res_sugar)

# Display the results
print("Independent samples t-test for residual sugar between 'good' and 'bad' quality wines:")
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")



Independent samples t-test for pH between 'good' and 'bad' quality wines:
t-statistic: -0.13043758140804423
p-value: 0.8962366625632214
Independent samples t-test for residual sugar between 'good' and 'bad' quality wines:
t-statistic: -0.08633714492778152
p-value: 0.9312092342607453


So, the best discriminators of wine quality are pH and residual sugar. (Because they have the highest p values)

H0 = no difference in the mean 'pH' values between the two groups (good and bad). 
If the p-value is less than a chosen significance level (e.g., 0.05), you would reject the null hypothesis, indicating that there is a statistically significant difference in the mean 'pH' values between 'good' and 'bad' quality wines.