## Chapter 1 Introduction to t-tests

### Understanding the t-distribution

In [None]:
library(ggplot2)
library(dplyr)

In [None]:
# Build an evenly spaced grid of n = 100 values covering [-4, 4].
# `length.out` is spelled out in full instead of relying on partial
# argument matching of `length`.
n <- 100
x <- seq(-4, 4, length.out = n)

# Pair every grid value with its position so ggplot2 can plot it.
# seq_len() is used instead of 1:n so the index stays valid for any n.
df <- data.frame(Index = seq_len(n), Value = x)

# Scatter plot of the grid values against their position in the vector
ggplot(df, aes(x = Index, y = Value)) +
  geom_point(color = "blue") +
  labs(
    title = "Scatter Plot of Values from -4 to 4",
    x = "Index",
    y = "Value"
  ) +
  theme_minimal()

To evaluate the probability density function (PDF) of Student's t-distribution at a value x, use the dt() function in R.

```
Syntax: dt(x, df) 

Parameters:
- x is a vector of quantiles (the points at which the density is evaluated)
- df is the degrees of freedom
```

In [None]:
# Evaluate the t density with n - 1 degrees of freedom at every value in x
y_sample <- dt(x = x, df = n - 1)

# Tabulate each density value alongside its position in the vector
df_y_sample <- data.frame(Index = seq_len(n), Value = y_sample)

# Scatter plot of the density values against their position
ggplot(data = df_y_sample, mapping = aes(x = Index, y = Value)) +
  geom_point(colour = "blue") +
  ggtitle("T-Distribution") +
  xlab("Index") +
  ylab("Value") +
  theme_minimal()

In [None]:
# Stack the densities of three t-distributions (2, 8 and 32 degrees of
# freedom) and the standard normal into one long data frame, all evaluated
# on the same grid x.
df <- data.frame(
  x = rep(x, times = 4),
  density = c(
    dt(x, df = 2),
    dt(x, df = 8),
    dt(x, df = 32),
    dnorm(x)
  ),
  # Levels are assigned by position rather than from the label strings:
  # factor() on the strings would sort them alphabetically and put
  # "df = 32" before "df = 8" in the legend.
  distribution = factor(
    rep(seq_len(4), each = length(x)),
    labels = c("df = 2", "df = 8", "df = 32", "Normal")
  )
)

# One line per distribution, coloured by distribution
ggplot(df, aes(x = x, y = density, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Comparison of t-distributions",
    x = "T value",
    y = "Density",
    color = "Distribution"
  ) +
  theme_minimal()

In [None]:
# Load the fish measurements from the CSV file into a data frame
fish_data <- read.csv(file = "assets/data/fish_data.csv")

# Show the first rows of the loaded data
head(x = fish_data)

In [None]:
# Per-lake summary of fish length: mean, standard deviation and sample size,
# followed by the quantities derived from them (standard error and degrees
# of freedom).
length_stats <- fish_data %>%
  group_by(Lake) %>%
  summarise(
    mean = mean(Length_cm),
    sd = sd(Length_cm),
    n = n()
  ) %>%
  mutate(
    se = sd / sqrt(n),
    df = n - 1
  )
length_stats

In [None]:
# Build one density curve per row of length_stats (one row per lake).
# Each curve is evaluated on 200 points spanning mean +/- 4 standard
# deviations and is a t density centred on the lake mean and scaled by the
# standard error; dividing by se rescales the density to the length axis.
# reframe() replaces the superseded do() and returns an ungrouped data frame,
# so no trailing ungroup() is needed.
t_curves <- length_stats %>%
  rowwise() %>%
  reframe(
    x = seq(mean - 4 * sd, mean + 4 * sd, length.out = 200),
    y = dt((x - mean) / se, df = df) / se,
    Lake = Lake
  )

# Density-scaled histogram of the raw lengths with the per-lake curves drawn
# on top. after_stat(density) replaces the deprecated ..density.. notation,
# and linewidth replaces the deprecated size argument for lines.
ggplot(fish_data, aes(x = Length_cm, fill = Lake)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 15,
    alpha = 0.5,
    position = "identity"
  ) +
  geom_line(
    data = t_curves,
    aes(x = x, y = y, color = Lake),
    linewidth = 1
  ) +
  labs(
    title = "Fish Length Histogram with t-Distribution Overlay",
    x = "Length (cm)",
    y = "Density"
  ) +
  theme_minimal()

In [None]:
# Per-lake summary of fish weight: mean, standard deviation and sample size,
# followed by the quantities derived from them (standard error and degrees
# of freedom).
weight_stats <- fish_data %>%
  group_by(Lake) %>%
  summarise(
    mean = mean(Weight_g),
    sd = sd(Weight_g),
    n = n()
  ) %>%
  mutate(
    se = sd / sqrt(n),
    df = n - 1
  )
weight_stats

In [None]:
# Histogram of fish length with no grouping aesthetic, so all rows of
# fish_data are binned together
ggplot(data = fish_data, mapping = aes(x = Length_cm)) +
  geom_histogram(bins = 15, alpha = 0.6, position = "identity") +
  ggtitle("Distribution of Fish Length") +
  xlab("Length (cm)") +
  ylab("Count") +
  theme_minimal()

In [None]:
# Histogram of fish length filled by lake; position = "identity" overlays
# the per-lake bars instead of stacking them, and alpha keeps both visible
ggplot(data = fish_data, mapping = aes(x = Length_cm, fill = Lake)) +
  geom_histogram(bins = 15, alpha = 0.6, position = "identity") +
  ggtitle("Distribution of Fish Length") +
  xlab("Length (cm)") +
  ylab("Count") +
  theme_minimal()

In [None]:
# Histogram of fish weight filled by lake; position = "identity" overlays
# the per-lake bars instead of stacking them, and alpha keeps both visible
ggplot(data = fish_data, mapping = aes(x = Weight_g, fill = Lake)) +
  geom_histogram(bins = 15, alpha = 0.6, position = "identity") +
  ggtitle("Distribution of Fish Weight") +
  xlab("Weight (g)") +
  ylab("Count") +
  theme_minimal()

In [None]:
# Box plot of fish length, one box per lake, filled by lake
ggplot(data = fish_data, mapping = aes(x = Lake, y = Length_cm, fill = Lake)) +
  geom_boxplot() +
  ggtitle("Fish Length by Lake") +
  xlab("Lake Type") +
  ylab("Length (cm)") +
  theme_minimal()

In [None]:
# Box plot of fish weight, one box per lake, filled by lake.
# The y-axis label fixes the "Weigth" typo and uses the same "Weight (g)"
# wording as the weight histogram.
ggplot(fish_data, aes(x = Lake, y = Weight_g, fill = Lake)) +
  geom_boxplot() +
  labs(title = "Fish Weight by Lake",
       x = "Lake Type",
       y = "Weight (g)") +
  theme_minimal()