# Tutorial 5

In [11]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import median_abs_deviation
from scipy.stats import trim_mean

## Distributions

### Task 1.1

In [3]:
np.random.seed(123)

sample_means = []
trimmed_means = []

for _ in range(50):
    Z = np.random.normal(0, 1, 30)
    Y = Z * np.exp(0.3 * Z**2 / 2)

    sample_means.append(np.mean(Y))
    trimmed_means.append(trim_mean(Y, proportiontocut=0.2))

mean_sample = np.mean(sample_means)
sd_sample = np.std(sample_means, ddof=1)

mean_trimmed = np.mean(trimmed_means)
sd_trimmed = np.std(trimmed_means, ddof=1)

print(f"Mean of sample means: {mean_sample:.4f}")
print(f"SD of sample means: {sd_sample:.4f}")
print(f"Mean of trimmed means: {mean_trimmed:.4f}")
print(f"SD of trimmed means: {sd_trimmed:.4f}")

Mean of sample means: -0.0197
SD of sample means: 0.3126
Mean of trimmed means: -0.0082
SD of trimmed means: 0.2256


## Outliers

In [4]:
data = np.array([2, 2, 3, 3, 3, 4, 4, 4, 100000, 100000])

### Task 2.1

In [7]:
mean_X = np.mean(data)
std_X = np.std(data, ddof=1)

outliers_mean = [x for x in data if abs(x - mean_X) / std_X > 2.24]
outliers_mean

[]

In [6]:
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers_iqr = [x for x in data if x < lower_bound or x > upper_bound]
outliers_iqr

[np.int64(100000), np.int64(100000)]

In [8]:
median_X = np.median(data)
mad_X = median_abs_deviation(data)

outliers_mad = [x for x in data if abs(x - median_X) / (mad_X / 0.6745) > 2.24]
outliers_mad

[np.int64(100000), np.int64(100000)]

## Student Performence

In [10]:
path = "../../src/"
data = pd.read_csv(path + "data/student/student-mat.csv", sep=";")
df = pd.DataFrame(data)

### Task 3.1

In [12]:
gamma = 0.1  # 10% trimming/winsorizing
results = []

for medu_level, group in df.groupby("Medu"):
    mean = group["G3"].mean()
    trimmed_mean = stats.trim_mean(group["G3"], gamma)
    winsorized_mean = stats.mstats.winsorize(group["G3"], limits=gamma).mean()
    results.append((medu_level, mean, trimmed_mean, winsorized_mean))

results_df = pd.DataFrame(results, columns=["Medu", "Mean", "Trimmed Mean", "Winsorized Mean"])
results_df

Unnamed: 0,Medu,Mean,Trimmed Mean,Winsorized Mean
0,0,13.0,13.0,13.0
1,1,8.677966,8.938776,8.525424
2,2,9.728155,10.120482,9.514563
3,3,10.30303,10.62963,10.606061
4,4,11.763359,12.07619,11.961832


In [None]:
def mean_variance_outliers(group):
    mean_G3 = group["G3"].mean()
    std_G3 = group["G3"].std()
    return group[(abs(group["G3"] - mean_G3) / std_G3) > 2.24]

def iqr_outliers(group):
    Q1 = group["G3"].quantile(0.25)
    Q3 = group["G3"].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return group[(group["G3"] < lower_bound) | (group["G3"] > upper_bound)]

def mad_outliers(group):
    median_G3 = group["G3"].median()
    mad_G3 = median_abs_deviation(group["G3"], scale='normal')
    return group[(abs(group["G3"] - median_G3) / mad_G3) > 2.24]

Outliers using Means and Variances:
Medu     
4     135    0
      140    0
      148    0
      242    0
      296    0
      341    0
Name: G3, dtype: int64

Outliers using IQR:
Medu     
1     131    0
      150    0
      162    0
      173    0
      221    0
      310    0
      367    0
      383    0
      389    0
2     128    0
      144    0
      160    0
      168    0
      239    0
      244    0
      259    0
      264    0
      269    0
      316    0
      333    0
      334    0
      343    0
      387    0
3     130    0
      134    0
      136    0
      137    0
      146    0
      153    0
      170    0
      332    0
      337    0
4     135    0
      140    0
      148    0
      242    0
      296    0
      341    0
Name: G3, dtype: int64

Outliers using Median and MAD:
Medu     
0     127     9
1     131     0
      150     0
      162     0
      173     0
      221     0
      310     0
      367     0
      383     0
      389     0
2     128     0

  outliers_mean_variance = df.groupby("Medu").apply(mean_variance_outliers)['G3']
  outliers_iqr = df.groupby("Medu").apply(iqr_outliers)['G3']
  outliers_mad = df.groupby("Medu").apply(mad_outliers)['G3']


In [20]:
outliers_mean_variance = df.groupby("Medu").apply(mean_variance_outliers)['G3']
outliers_mean_variance

  outliers_mean_variance = df.groupby("Medu").apply(mean_variance_outliers)['G3']


Medu     
4     135    0
      140    0
      148    0
      242    0
      296    0
      341    0
Name: G3, dtype: int64

In [21]:
outliers_iqr = df.groupby("Medu").apply(iqr_outliers)['G3']
outliers_iqr

  outliers_iqr = df.groupby("Medu").apply(iqr_outliers)['G3']


Medu     
1     131    0
      150    0
      162    0
      173    0
      221    0
      310    0
      367    0
      383    0
      389    0
2     128    0
      144    0
      160    0
      168    0
      239    0
      244    0
      259    0
      264    0
      269    0
      316    0
      333    0
      334    0
      343    0
      387    0
3     130    0
      134    0
      136    0
      137    0
      146    0
      153    0
      170    0
      332    0
      337    0
4     135    0
      140    0
      148    0
      242    0
      296    0
      341    0
Name: G3, dtype: int64

In [22]:
outliers_mad = df.groupby("Medu").apply(mad_outliers)['G3']
outliers_mad

  outliers_mad = df.groupby("Medu").apply(mad_outliers)['G3']


Medu     
0     127     9
1     131     0
      150     0
      162     0
      173     0
      221     0
      310     0
      367     0
      383     0
      389     0
2     128     0
      144     0
      160     0
      168     0
      239     0
      244     0
      245    18
      259     0
      264     0
      269     0
      286    19
      316     0
      333     0
      334     0
      343     0
      387     0
3     130     0
      134     0
      136     0
      137     0
      146     0
      153     0
      170     0
      332     0
      337     0
4     135     0
      140     0
      148     0
      242     0
      296     0
      341     0
Name: G3, dtype: int64