### Exercise #4 - correction

In [1]:
import pandas as pd
import numpy as np
import zipfile
import seaborn as sns # For plotting
import matplotlib.pyplot as plt # For showing plots
from statsmodels.graphics.gofplots import qqplot

#### Exercise 4.1

Take 1000 random samples with replacement of increasing sample sizes (e.g. 10, 50, 100, 150, 200, 250, 300, 500 and 1000 observations), compute the mean Temp_ann of each sample and use an appropriate visualization to show how many samples will we need to have a good estimate of the population mean (this is a corrected version of the last challenge of the previous exercise).


In [2]:
import random

num = [10, 50, 100, 150, 200, 250, 300, 500, 1000] 

temp = df['temp_ann']
means = []

for i in num:
    # Generating seed so that we can get same result every time the loop is run...
    np.random.seed(24)
    # a loop that takes a random sample with i observations from temp and computes the mean, ...
    # ... and repeat the process 1000 times
    x = [np.mean( # compute mean
        temp.take( # from temp ...
        np.random.randint(0, len(temp), i)) # ... take a random sample of i observations ...
        ) for _ in range(1000)] # ... 1000 times.
    means.append(x)

print(means) # means is a list of 9 lists with 1000 observations (means) each.
type(means)

# Produce boxplots
ax = sns.boxplot(data= means)
ax.set_xticklabels(num) # change x-axis tick labels
ax.set_xlabel('Number of observations')
ax.set_ylabel('Mean Annual Temperature') 
plt.axhline(y=np.mean(df['temp_ann']), color='r', linestyle='-') # plot line of mean Temp_ann using all data

NameError: name 'df' is not defined

In [None]:
ax = sns.violinplot(data= means)
ax.set_xticklabels(num) # change x-axis tick labels
ax.set_xlabel('Number of observations')
ax.set_ylabel('Mean Annual Temperature') 
plt.axhline(y=np.mean(df['temp_ann']), color='r', linestyle='-') # plot line of mean Temp_ann using all data

In [None]:
# convert means to a DataFrame

meansdf = pd.DataFrame(means).transpose()
meansdf.columns = num
meanstack = pd.DataFrame(meansdf.stack(), columns=['Mean_Temp_ann'])
meanstack.reset_index(inplace=True)
meanstack = meanstack.rename(columns = {'level_1':'N_obs'})
meanstack = meanstack.drop('level_0', axis=1)
meanstack['N_obs'] = meanstack['N_obs'].astype(str) # change integer to string
meanstack

In [None]:
meanstack.dtypes

In [None]:
ax = sns.stripplot(data= meanstack, x='N_obs', y='Mean_Temp_ann')

In [None]:
temp = df['temp_ann']
means = []

for i in range(10, 1050, 50):
    # Generating seed so that we can get same result
    # every time the loop is run...
    x = [np.mean(
        temp.take(
        np.random.randint(0, len(temp), i))
        ) for _ in range(1000)]
    means.append(x)

print(means) # means is a list of lists
type(means)

# Produce boxplots
plt.xticks(rotation=90)
ax = sns.boxplot(data= means)
ax.set_xticklabels(range(0, 1050, 50))
ax.set_xlabel('Number of observations')
ax.set_ylabel('mean')
plt.xticks(rotation=90)


#### Exercise 4.2

Using an appropriate visualization, check the effect of Mean Annual Temperature (Temp_ann) on the presence of Salmo trutta fario (Brown Trout).

Import and clean data

In [None]:
df = pd.read_csv('EFIplus_medit.zip',compression='zip', sep=";")

In [None]:
# clean up the dataset to remove unnecessary columns (eg. REG) 
df.drop(df.iloc[:,5:15], axis=1, inplace=True)

# let's rename some columns so that they make sense
df.rename(columns={'Sum of Run1_number_all':'Total_fish_individuals'}, inplace=True) # inplace="True" means that df will be updated

# for sake of consistency, let's also make all column labels of type string
df.columns = list(map(str, df.columns))

In [None]:
df2 = df.dropna()

Some possible visualization settings

In [None]:
# Resize figure
# sns.set(rc={'figure.figsize':(8,3)}) # will change the setting for the whole graphs when applied

# Alternative
# from matplotlib import rcParams
# figure size in inches
# rcParams['figure.figsize'] = 8,3

# To restore settings run:
# import matplotlib as mpl
# mpl.rc_file_defaults()

In [None]:
# Resize figure
sns.set(rc={'figure.figsize':(4,5)}) 

sns.boxplot(data=df, x='Salmo trutta fario', y='temp_ann',
        notch=True,
        palette=['lightgray', 'skyblue'],
        width=0.4,
        medianprops=dict(color="red"),
        ).set_xticklabels(['Absence', 'Presence'])
plt.xlabel('Brown trout occurrence', fontdict={'size': 12, 'weight': 'bold'})
plt.ylabel('Mean Annual Temperature', fontdict={'size': 12, 'weight': 'bold'})

# restore settings
import matplotlib as mpl
mpl.rc_file_defaults()

In [None]:
sns.boxplot(data=df, y='Salmo trutta fario', x='temp_ann', orient='h',
        notch=True,
        palette=['lightgray', 'skyblue'],
        width=0.4,
        medianprops=dict(color="red"),
        )

sns.stripplot(data=df, y='Salmo trutta fario', x='temp_ann', orient='h',
    jitter=0.1, # the ammount of jitter (i.e. random point dispersion along the x-axis).
    linewidth=0, # no line around the poits
    color='#9e2a2b', # HEX color picked from here: https://coolors.co/palettes/trending - don't forget to start by "#"
    alpha=.3,
    s=3, # point size
    )
plt.xlabel('Mean Annual Temperature')

In [None]:
sns.histplot(data=df[df['Salmo trutta fario']==0], x='temp_ann', color='gray', alpha=0.5, edgecolor=None, label="Absence of Brown trout")
sns.histplot(data=df[df['Salmo trutta fario']==1], x='temp_ann', alpha=0.4, edgecolor=None, label="Presence of Brown trout")
plt.legend(frameon=False)
plt.xlabel('Mean Annual Temperature')


In [None]:
sns.kdeplot(data=df[df['Salmo trutta fario']==0], x='temp_ann', color='gray', fill=True, label="Absence of Brown trout")
sns.kdeplot(data=df[df['Salmo trutta fario']==1], x='temp_ann', fill=True, label="Presence of Brown trout")
plt.legend(frameon=False, loc='upper left')
plt.xlabel('Mean Annual Temperature')

In [None]:
sns.violinplot(data=df, x='Salmo trutta fario', y='temp_ann')
plt.ylim(0, 20)

In [None]:
sns.scatterplot(data=df, x='Salmo trutta fario', y='temp_ann')
plt.ylim(0, 20)

In [None]:
sns.histplot(df2['Actual_river_slope'])



In [None]:
from statsmodels.graphics.gofplots import qqplot


qqplot(pd.Series(df2['Actual_river_slope']), line='s')
plt.show()

#### Exercise 4.3
Test, using both visualization and hypothesis testing methods, if the actual_river_slope is drawn from a normal distribution.

In [None]:
# import function
from scipy.stats import shapiro

df2 = df.dropna() # drops rows when at least one element is a missing value
df2.info()

# normality test
stat, p = shapiro(pd.Series(df2['Actual_river_slope']))
print('Statistics=%.3f, p=%.3f' % (stat, p)) # print outputs
# interpret. H0: 'the sample was drawn from a Gaussian distribution'.
alpha = 0.05
if p > alpha:
 print('Sample is not significantly different from Gaussian (fail to reject H0. Rejecting H0 has an error probability >0.05)')
else:
 print('Sample is significantly different from Gaussian (reject H0 with an error probability <0.05)')

#### Exercise 4.4

Take 100 samples of 2000 observations with replacement, compute the mean for each sample and plot the resulting histogram of means. Test if these 100 mean values are drawn from a normal distribution.


In [None]:


mean = []
for i in range(0,100):
    temp = df['Actual_river_slope']
    sampler = np.random.randint(0, len(temp), 2000) # generate 2000 random integer numbers to be used as random indices
    sample = temp.take(sampler) # take 2000 random observations from temp
    mean.append(sample.mean())

sns.histplot(mean)

In [None]:
qqplot(pd.Series(mean), line='s')
plt.show()

In [None]:
# import function
from scipy.stats import shapiro

# normality test
stat, p = shapiro(mean)
print('Statistics=%.3f, p=%.3f' % (stat, p)) # print outputs
# interpret. H0: 'the sample was drawn from a Gaussian distribution'.
alpha = 0.05
if p > alpha:
 print('Sample is not significantly different from Gaussian (fail to reject H0. Rejecting H0 has an error probability >0.05)')
else:
 print('Sample is significantly different from Gaussian (reject H0 with an error probability <0.05)')