# Analysis and Visualization of Complex Agro-Environmental Data
---
### Exercise #5 - correction

In [1]:
import pandas as pd
import numpy as np
import zipfile
import seaborn as sns # For plotting
import matplotlib.pyplot as plt # For showing plots
import scipy.stats as sts
import scikit_posthocs as sp
import statsmodels.stats as stm
from statsmodels.graphics.gofplots import qqplot
import math

In [2]:
# Load the zipped, semicolon-separated CSV into a DataFrame
csv_archive = '../examples/EFIplus_medit.zip'
df = pd.read_csv(csv_archive, sep=";", compression='zip')

In [None]:
# Clean up the dataset: remove the unnecessary columns at positions 5..14 (e.g. REG).
# The columns are selected by label through df.columns instead of passing a
# DataFrame slice as the labels argument.
df = df.drop(columns=df.columns[5:15])

# Rename some columns so that they make sense
df = df.rename(columns={'Sum of Run1_number_all': 'Total_fish_individuals'})

# For the sake of consistency, make all column labels of type string
df.columns = df.columns.map(str)

In [None]:
# Visual check for missing data: every cell of the heatmap is one value of df,
# coloured according to whether it is null.
missing_mask = df.isnull()
plt.figure(figsize=(12, 4))
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing values (yellow) in the dataset');

In [None]:
# Keep only complete rows: a row is discarded as soon as one of its values is missing
df = df.dropna(axis=0, how='any')

##### Exercise 5.1

In [None]:
# Standardize the Mean Annual Temperature: subtract the mean, then divide by
# the standard deviation returned by pandas .std()
df['temp_ann_st'] = (
    df['temp_ann']
    .sub(df['temp_ann'].mean())
    .div(df['temp_ann'].std())
)
df['temp_ann_st']

In [None]:
# Simpler alternative. ddof=1 makes scipy divide by the sample standard deviation,
# so the result matches the manual standardization based on pandas .std()
# (scipy.stats.zscore defaults to ddof=0).
sts.zscore(df['temp_ann'], ddof=1)

In [None]:
# Mean and standard deviation of the standardized temperature.
# The mean is kept at full precision for the computations below: round() without
# a number of digits would turn it into an integer.
Tst_mean = df['temp_ann_st'].mean()
Tst_SD = df['temp_ann_st'].std()

# standard error of the mean
SE = Tst_SD / math.sqrt(len(df))

# Compute the 95% CI manually (1.96 approximates the 97.5th percentile of the standard normal)
lower = Tst_mean - 1.96*SE
upper = Tst_mean + 1.96*SE
print('95% CI:', (lower, upper))

# Alternative using the scipy function norm.interval
CI = sts.norm.interval(0.95, loc=Tst_mean, scale=SE)
print('95% CI:', CI)

# Assuming a t-distribution instead.
# The confidence level is passed positionally so the call does not depend on the
# keyword name, which differs between SciPy versions.
CI_t = sts.t.interval(0.95, df=len(df)-1, loc=Tst_mean, scale=SE)
print('95% CI t distribution:', CI_t)

# Rounding is applied for display only (floating-point noise around zero)
print(round(Tst_mean, 6))
print(Tst_SD)

In [None]:
# Overlay the histograms of the raw and the standardized temperature on the same axes
for column, colour in (('temp_ann', None), ('temp_ann_st', 'red')):
    sns.histplot(df[column], color=colour)
plt.show()

##### Exercise 5.2

In [None]:
# Standardized temperature for each value of the 'Salmo trutta fario' column
sns.boxplot(x='Salmo trutta fario', y='temp_ann_st', data=df)
plt.show()

In [None]:
# Run t test
# H0 : The samples are drawn from populations with equal means

# Standardized temperature split by the 0/1 value of 'Salmo trutta fario'
sample1 = df.loc[df['Salmo trutta fario'] == 0, 'temp_ann_st']
sample2 = df.loc[df['Salmo trutta fario'] == 1, 'temp_ann_st']

print('Mean of sample 1 = ', sample1.mean())
print('Mean of sample 2 = ', sample2.mean())

# t-test - tests the null hypothesis that sample 1 and 2 are derived from populations with the same mean
# (ttest_ind assumes equal population variances by default)
stat, p = sts.ttest_ind(sample1, sample2)
print('Statistics=%.3f, p=%.3f' % (stat, p)) # print outputs
alpha = 0.05
if p > alpha:
    print('fail to reject H0. Rejecting H0 has an error probability >0.05')
else:
    print('reject H0 with an error probability <0.05')

##### Exercise 5.3

In [None]:
# Number of rows of df per catchment, displayed from the largest to the smallest count
catchment_count = pd.crosstab(df['Catchment_name'], 'count')
catchment_count.sort_values('count', ascending=False)

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Subset of df restricted to the eight catchments compared in the ANOVA.
# The rows are taken from df: no DataFrame named df2 is defined in this notebook.
selected_catchments = [
    'Galiza-Norte',
    'Minho',
    'Cantabrica',
    'Douro',
    'Tejo',
    'Guadia',
    'Galiza-Sul',
    'Mondego',
]
dfsub = df[df['Catchment_name'].isin(selected_catchments)]


In [None]:
# Distribution of the response variable used in the ANOVA (mean catchment elevation)
sns.histplot(df['Elevation_mean_catch'])
plt.show()

In [None]:
# Normal Q-Q plot of the mean catchment elevation; line='s' draws the standardized reference line.
# qqplot is already imported in the first cell, and the data come from df
# (no DataFrame named df2 is defined in this notebook).
qqplot(df['Elevation_mean_catch'], line='s')
plt.show()

In [None]:
# Although the distribution of the mean elevation is right skewed and seems to depart from normality we will nevertheless try to run ANOVA.

# One-way ANOVA of mean catchment elevation with catchment as the factor
mod = ols('Elevation_mean_catch ~ Catchment_name', data=dfsub).fit()

aov_table = sm.stats.anova_lm(mod, typ=2) # typ is the type of ANOVA to perform ('I','II' or 'III' = 1,2,3)
print(aov_table) # provides the usual ANOVA table

alpha = 0.05
# p-value in the first row of the table (the Catchment_name term).
# .iloc is required because the table is indexed by term labels, not by integers.
p = aov_table['PR(>F)'].iloc[0]

if p <= alpha:
    print('reject H0 that mean elevation values are equal among catchments')
else:
    print('fail to reject H0 that mean elevation values are equal among catchments')

# compute mean elevation for each catchment
dfsub[['Elevation_mean_catch','Catchment_name']].groupby('Catchment_name').mean()


In [None]:
# Multiple comparisons: Tukey's test on mean catchment elevation,
# with the catchment name as grouping variable and a 5% significance level
tukey = stm.multicomp.pairwise_tukeyhsd(
    dfsub['Elevation_mean_catch'],
    dfsub['Catchment_name'],
    alpha=0.05,
)
# show the table of pairwise comparisons
print(tukey)

In [None]:
# Mean catchment elevation per catchment: the variable compared by the ANOVA and Tukey's test
sns.boxplot(data=dfsub, x='Catchment_name', y='Elevation_mean_catch')
plt.show()

##### Exercise 5.4

Potential problems in the data used for hypothesis testing are: 
- Departure from the normal distribution
- Categories are highly unbalanced (very different number of samples for each category - see below). 
- Lack of independence among sampling sites. For example, when we tested whether the standardized mean annual temperature (temp_ann_st) differs between sites with and without Salmo trutta fario, we did not take into account that observations within each catchment might not be totally independent.



In [None]:
# Number of observations per category for the two grouping variables used in the tests
for frame, column in ((dfsub, 'Catchment_name'), (df, 'Salmo trutta fario')):
    print(frame[column].value_counts())