# load the required libraries
library(tidyverse)



# report the running R version; R.version.string already begins with
# "R version ...", so no extra label is prepended, and sep = "" avoids a
# stray space before the newline
cat(R.version.string, "\n", sep = "")

# load the raw penguin measurements from disk
penguins <- read_csv(file = "data/penguins-raw.csv")

# report the shape of the table: row count first, then column count
cat("Rows:", dim(penguins)[1], ", Columns:", dim(penguins)[2], "\n")

# list every column name
cat("Columns:\n")
print(names(penguins))

# keep only the species, body mass and flipper length columns
simpler <- select(
  penguins,
  Species, `Body Mass (g)`, `Flipper Length (mm)`
)
simpler %>% head()

# show the row names (the closest thing the table has to an index)
print(row.names(penguins))

# show row 23 with all of its columns
penguins[23L, ]

# show only the "Culmen Length (mm)" cell of row 23
penguins[23L, ]["Culmen Length (mm)"]

# show "Sex" and "Date Egg" for rows 23 through 28
penguins[seq(23, 28), c("Sex", "Date Egg")]

# tally how many rows were recorded for each island
count(penguins, Island)

# tally how many rows belong to each species
count(penguins, Species)

# keep only the rows whose "Sex" is "FEMALE" and preview them
females <- filter(penguins, Sex == "FEMALE")
females %>% head()

# the same filter again, this time naming the column with a string through
# the .data pronoun instead of a bare column name
females <- filter(penguins, .data[["Sex"]] == "FEMALE")
females %>% head()

# keep rows whose flipper length (mm) exceeds one twentieth of the body
# mass (g)
filter(penguins, `Flipper Length (mm)` > `Body Mass (g)` / 20)

# per-column tally of missing values
penguins %>%
  is.na() %>%
  colSums()

# the "Comments" column is removed from the working table
penguins <- select(penguins, -Comments)

# separate copy containing only the rows with no missing value in any column
penguins_nona <- drop_na(penguins)

# separate copy in which missing "Culmen Length (mm)" values are replaced by
# the mean of the observed ones
penguins_imputed <- penguins %>%
  mutate(
    `Culmen Length (mm)` = replace_na(
      `Culmen Length (mm)`,
      mean(`Culmen Length (mm)`, na.rm = TRUE)
    )
  )

# in one pass:
#   - turn "Sex" into a factor (categorical variable)
#   - add "SimpleSpecies", the first word of "Species"
#   - add "Body Mass (kg)", the body mass in grams divided by 1000
penguins <- penguins %>%
  mutate(
    Sex = as.factor(Sex),
    SimpleSpecies = word(Species, 1),
    `Body Mass (kg)` = `Body Mass (g)` / 1000
  )

# narrow the table to three columns, then show it ordered by ascending
# "Body Mass (g)"
penguins_small <- select(penguins, Species, Island, `Body Mass (g)`)
arrange(penguins_small, `Body Mass (g)`)

# group the data by "Species"
penguins_by_species <- penguins %>% group_by(Species)

# summarize "Body Mass (g)" for each species.
# `penguins` has not had its incomplete rows removed (only `penguins_nona`
# has), so na.rm = TRUE is needed: without it a single missing body mass
# turns the whole group's mean and sd into NA. n still counts every row in
# the group, including rows without a body mass.
penguins_by_species %>%
  summarize(
    mean = mean(`Body Mass (g)`, na.rm = TRUE),
    sd = sd(`Body Mass (g)`, na.rm = TRUE),
    n = n()
  )

# group the data by "Sex" and "Species"
penguins_by_sex_and_species <- penguins %>% group_by(Sex, Species)

# summarize "Body Mass (g)" for each Sex/Species combination.
# .groups = "drop" returns an ungrouped result and silences the
# "summarise() has grouped output by ..." message.
penguins_by_sex_and_species %>%
  summarize(
    mean = mean(`Body Mass (g)`, na.rm = TRUE),
    sd = sd(`Body Mass (g)`, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

# pivot table of the mean "Body Mass (g)": one row per "Sex", one column per
# "Species" (missing body masses are ignored when averaging)
penguins_by_sex_and_species %>%
  summarize(mean = mean(`Body Mass (g)`, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = Species, values_from = mean)

# histogram of "Body Mass (g)" for female Adelie and Gentoo penguins, drawn
# as one 30-bin panel per species
penguins %>%
  filter(
    Species %in% c(
      "Adelie Penguin (Pygoscelis adeliae)",
      "Gentoo penguin (Pygoscelis papua)"
    ) & Sex == "FEMALE"
  ) %>%
  ggplot(mapping = aes(x = `Body Mass (g)`)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(Species))