# Walkthroughs and Exercises for Fundamentals of Statistics with Python

Dr. Chester Ismay

In [None]:
# Install the packages directly if needed with pip
# Check the repo's README below for more information
!pip install numpy pandas scipy matplotlib seaborn statsmodels scikit-learn jupyter

In [1]:
import pandas as pd

# Never truncate the column listing when a DataFrame is displayed
pd.options.display.max_columns = None

# Display all outputs from each cell, rather than only the value of the
# last expression in the cell.
# NOTE(review): the option is set on the InteractiveShell class itself, so it
# presumably applies to cells executed after this one — confirm in your setup.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Week 1

## Walkthrough 1.1: Getting Started

### Setting Up the Python Environment

If you haven’t already installed Python, Jupyter, and the necessary
packages, follow the setup instructions in the course repo’s
[README](https://github.com/ismayc/oreilly-fundamentals-of-statistics-with-python/blob/main/README.md).

If you aren’t able to do this on your machine, you may want to check out
[Google Colab](https://colab.research.google.com/). It’s a free service
that allows you to run Jupyter notebooks in the cloud. Alternatively,
I’ve set up some temporary notebooks on Binder
([![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ismayc/oreilly-fundamentals-of-statistics-with-python/main?urlpath=%2Fdoc%2Ftree%2Fexercises.ipynb))
that you can work with online as well.

In [2]:
# Importing libraries/modules and aliasing them as needed
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler, StandardScaler

### Exploring a dataset

In [None]:
# Load in the dataset

In [None]:
# Display information about the DataFrame

### Performing basic statistical functions using NumPy, Pandas, and SciPy.

#### Using NumPy

In [None]:
# Calculate the mean of the years_code_pro column

In [None]:
# Calculate the median of the work_exp column

In [None]:
# Calculate the standard deviation of the converted_comp_yearly column

#### Using Pandas

In [None]:
# Display summary statistics on the numeric columns of the DataFrame

#### Using SciPy

In [None]:
# Separate the data into two groups



# Perform the t-test looking for a difference in mean salary for the groups


# Print the results


## Exercise 1.1: Getting Started

### Setting Up the Python Environment

If you ran the `# Importing libraries/modules and aliasing them as needed`
code cell above, you should be good to proceed here. If not, scroll up and run it.

### Exploring a dataset

In [None]:
# Load in the coffee_quality dataset

In [None]:
# Display information about the DataFrame

### Performing basic statistical functions using NumPy, Pandas, and SciPy.

#### Using NumPy

In [None]:
# Calculate the mean of the aroma column

In [None]:
# Calculate the median of the total_cup_points column

In [None]:
# Calculate the standard deviation of the moisture_percentage column

#### Using Pandas

In [None]:
# Display summary statistics on the numeric columns of the DataFrame

#### Using SciPy

In [None]:
# Focus on only Asia and North America entries

# Separate the data into two groups


# Perform the t-test to test for difference in total_cup_points


# Print the results


------------------------------------------------------------------------

## Walkthrough 1.2: Data Summarizing

### Compute and interpret measures of central tendency

In [None]:
# Calculate center statistics for years_code_pro


# To extract just the value for mode

### Compute and interpret measures of variation

In [None]:
# Calculate spread statistics for years_code_pro


In [None]:
# Calculate the five-number summary for years_code_pro


# Print them out

## Exercise 1.2: Data Summarizing

In [None]:
# Calculate center statistics for aroma

In [None]:
# Calculate spread statistics for aroma

In [None]:
# Calculate the five-number summary for aroma

------------------------------------------------------------------------

## Walkthrough 1.3: Cleaning and Preparing Data with Pandas

In [None]:
# Histogram for years_code_pro

In [None]:
# Set the default figure size for all plots

In [None]:
# Box plot for work_exp

In [None]:
# Scatter plot for years_code_pro vs. converted_comp_yearly

## Exercise 1.3: Cleaning and Preparing Data with Pandas

In [None]:
# Histogram for acidity

In [None]:
# Box plot for body

In [None]:
# Scatter plot for body vs. acidity

------------------------------------------------------------------------

## Walkthrough 1.4: Sampling Distribution Generation

In [None]:
# Think of our data as a population to draw from

# Generate a large sample from the converted_comp_yearly column
# Parameters


# Set a seed to make code reproducible


# Simulate sampling distribution of the mean


# Plot the sampling distribution of the sample means

## Exercise 1.4: Sampling Distribution Generation

In [None]:
# Think of our aroma data as a population to draw from

# Parameters



# Set random seed


# Simulate sampling distribution of the mean



# Plot the sampling distribution of the sample means

------------------------------------------------------------------------

# Week 2

## Walkthrough 2.1: Advanced Plots

In [None]:
# Select only numeric columns

# Calculate the correlation matrix for numeric columns

# Heatmap for correlation matrix

In [None]:
# Pair plot for selected variables years_code_pro, work_exp, converted_comp_yearly

In [None]:
# Time series plot


# Count the number of surveys completed each day

# Plot the counts as a line chart

## Exercise 2.1: Advanced Plots

In [None]:
# Select only numeric columns

# clean_cup and sweetness are always 10, so remove them when they are present
numeric_columns = numeric_columns.drop(
    ['clean_cup', 'sweetness'],
    axis='columns',
    errors='ignore',
)

# Calculate the correlation matrix for numeric columns


# Heatmap for correlation matrix

In [None]:
# Pair plot for selected variables ['aroma', 'acidity', 'body']

In [None]:
# Plot the mean total_cup_points for each grading_date

# Convert to datetime

# Derive a monthly period (year + month) from each grading_date value
coffee_quality['month'] = (
    coffee_quality['grading_date']
    .dt.to_period(freq='M')
)

# Aggregate the mean total_cup_points by month

# Plot the mean total_cup_points by month as a line chart

------------------------------------------------------------------------

## Walkthrough 2.2: EDA

### Data Cleaning

In [None]:
# Check for missing values


# Remove duplicates if any

### Data Visualization

In [None]:
# Histogram for years_code_pro


# Box plot for work_exp


# Scatter plot for years_code_pro vs. converted_comp_yearly

### Summary Statistics

In [None]:
# Calculate summary statistics

## Exercise 2.2: EDA

### Data Cleaning

In [None]:
# Check for missing values


# Remove duplicates if any

### Data Visualization

In [None]:
# Histogram for aroma


# Box plot for acidity


# Scatter plot for aroma vs. total_cup_points

### Summary Statistics

In [None]:
# Calculate summary statistics

------------------------------------------------------------------------

## Walkthrough 2.3: Data Preprocessing

### Inspect the Data after Loading

In [None]:
# Display basic information about the dataset


# Display the first few rows of the dataset

### Handle Missing Values

In [None]:
# Check for missing values


# Make a copy of the dataset for imputation

# Select only numeric columns

# Fill missing values in numeric columns with the median


# Display the first few rows of the imputed dataset

### Handle Outliers

In [None]:
# Select only numeric columns

# Identify outliers using IQR


# Remove outliers

### Data Transformation

In [None]:
# Normalization on converted_comp_yearly


# Standardization on work_exp


# Encoding categorical variables (country)

### Data Visualizations on Preprocessed Data

In [None]:
# Histogram for years_code_pro


# Box plot for work_exp


# Scatter plot for years_code_pro vs. converted_comp_yearly

## Exercise 2.3: Data Preprocessing

### Inspect the Data after Loading

In [None]:
# Display basic information about the dataset


# Display the first few rows of the dataset

### Handle Missing Values

In [None]:
# Check for missing values


# Make a copy of the dataset for imputation


# Select only numeric columns


# Fill missing values in numeric columns with the median


# Display the first few rows of the imputed dataset

### Handle Outliers

In [None]:
# Select only numeric columns

# Identify outliers using IQR


# Remove outliers

### Data Transformation

In [None]:
# Normalization on total_cup_points


# Standardization on acidity


# Encoding categorical variables (country and continent of origin)

### Data Visualizations on Preprocessed Data

In [None]:
# Histogram for aroma


# Box plot for acidity


# Scatter plot for aroma vs. total_cup_points

------------------------------------------------------------------------

## Walkthrough 2.4: Correlations

### Correlation Matrix

In [None]:
# Select only numeric columns

# Calculate the correlation matrix

# Display the correlation matrix

### Visualize Correlations

In [None]:
# Heatmap for correlation matrix

### Create Scatter Plots for Meaningful Correlations

In [None]:
# Scatter plot for years_code_pro vs. years_code


# Scatter plot for work_exp vs. years_code_pro

## Exercise 2.4: Correlations

### Correlation Matrix

In [None]:
# Select only numeric columns

# Remove clean_cup and sweetness too

# Calculate the correlation matrix

# Display the correlation matrix

### Visualize Correlations

In [None]:
# Heatmap for correlation matrix

### Create Scatter Plots for Meaningful Correlations

In [None]:
# Scatter plot for flavor vs. total_cup_points


# Scatter plot for overall vs. total_cup_points

------------------------------------------------------------------------

# Week 3

## Walkthrough 3.1: Simulating Distributions

### Simulating Binomial

In [None]:
# Simulate binomial distribution

# Plot histogram

### Simulating Normal

In [None]:
# Simulate normal distribution

# Plot histogram

## Exercise 3.1: Simulating Distributions

### Simulating Poisson

In [None]:
# Simulate Poisson distribution with lambda (lam) parameter 3

# Plot histogram

### Simulating Exponential

In [None]:
# Simulate exponential distribution with scale parameter 1

# Plot histogram

------------------------------------------------------------------------

## Walkthrough 3.2: t-tests

In [None]:
# One-sample t-test checking for evidence that mu compensation > 85000

In [None]:
# Two-sample t-test comparing compensation across plans_to_use_ai groups
# Checking for a difference

## Exercise 3.2: t-tests

In [None]:
# One-sample t-test checking for evidence that mu flavor < 7.8

In [None]:
# Two-sample t-test checking for difference in Colombia and Brazil total_cup_points

------------------------------------------------------------------------

## Walkthrough 3.3: Comparative Tests

In [None]:
# Perform one-way ANOVA comparing compensation across different levels of
# remote_work

In [None]:
# Create a contingency table of employment and remote_work


# Perform chi-square test

## Exercise 3.3: Comparative Tests

In [None]:
# Perform one-way ANOVA comparing total_cup_points across country_of_origin

In [None]:
# Perform a chi-square test of independence for processing_method versus
# continent_of_origin

# Create a contingency table


# Perform chi-square test

------------------------------------------------------------------------

## Walkthrough 3.4: Non-Parametric Tests

In [None]:
# Perform Mann-Whitney U Test comparing compensation for Remote and In-person

In [None]:
# Perform Kruskal-Wallis H Test comparing compensation across countries

## Exercise 3.4: Non-Parametric Tests

In [None]:
# Perform Mann-Whitney U Test comparing total_cup_points for Guatemala
# and Honduras

In [None]:
# Perform Kruskal-Wallis H Test comparing total_cup_points across 
# continent_of_origin