# **Week 2** - Descriptive Statistics

## Loading the Data

In [1]:
import pandas as pd
import matplotlib.pyplot

# Last week's notebook read the dataset from a local copy.
# This week the data is read directly from a GitHub repository.

# Raw-file URL of the CSV hosted on GitHub
link = "https://raw.githubusercontent.com/jolineuichanco/DataAnalytics/main/demos/citibike_data.csv"
df = pd.read_csv(filepath_or_buffer=link)

In [None]:
# Preview the first five rows of the DataFrame
df.head(n=5)

In [None]:
# List the column labels of the DataFrame
df.columns

### Checking Column Data Types

Let's see the data types of the `df` DataFrame columns using `df.info()`. (`df.dtypes`, used later in this notebook, shows the same data types in a more compact form.)

In [None]:
# Print a summary of the DataFrame, including the data type of each column
df.info()

In [None]:
# Show every distinct value in the 'gender' column, one per line
gender_list = df['gender'].unique()

for gender_value in gender_list:
    print(gender_value)

In [None]:
# Collect the distinct start-station names, report how many there are,
# then print each name on its own line
start_station_list = df['start station name'].unique()

print(f"Total start stations: {len(start_station_list)}")

for station_name in start_station_list:
    print(station_name)

In [None]:
# Can we also list all unique user types?

### Converting 'gender' to a Categorical Data Type

To explicitly tell pandas that 'gender' is a categorical variable, you can use the `.astype('category')` method. This change in data type can be beneficial for several reasons:

*   **Memory Efficiency:** For columns with a limited number of unique values, converting to `category` can significantly reduce memory usage.
*   **Proper Statistical Treatment:** Many statistical functions and plotting libraries will recognize this data type and treat the column appropriately (e.g., in `groupby` operations, or when creating charts).
*   **Data Validation:** It helps ensure that only valid categories are present in the column.

Let's apply this to our `df` DataFrame and then check the `dtypes` again to confirm the change.

In [None]:
# Convert the categorical columns to the pandas 'category' dtype.
# Listing the columns once avoids repeating the same astype line per column
# and lets the printed label name exactly what was converted.
categorical_columns = ['gender', 'start station name', 'end station name', 'usertype']

for column in categorical_columns:
    df[column] = df[column].astype('category')

# Name every converted column in the label (previously it mentioned only 'gender')
converted_names = ", ".join(f"'{column}'" for column in categorical_columns)
print("Data types after converting " + converted_names + " to category:")
display(df.dtypes)

In [None]:
# what other columns do you think should be categorical data types?
# can you try converting them into categorical data types?

### Explicitly Defining an Ordered Categorical Column

Let's imagine we had a 'satisfaction' column that should be treated as ordinal. Here's how you'd convert it to an ordered `category` dtype.

In [None]:
import pandas as pd

# Build a small demo DataFrame whose 'satisfaction' column holds ordinal labels
df_example = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'satisfaction': ['Medium', 'Low', 'High', 'Medium', 'High'],
})

print("Original DataFrame and dtypes:")
display(df_example, df_example.dtypes)

# Category labels listed from lowest to highest
quality_order = ['Low', 'Medium', 'High']

# Re-type 'satisfaction' as an ordered categorical that follows quality_order
ordered_satisfaction = pd.CategoricalDtype(categories=quality_order, ordered=True)
df_example['satisfaction'] = df_example['satisfaction'].astype(ordered_satisfaction)

In [None]:
# Compare two customers' satisfaction through the integer codes pandas
# stores behind each category label

# One integer code per row of the 'satisfaction' column
satisfaction_codes = df_example['satisfaction'].cat.codes
satisfaction_code_a = satisfaction_codes.iloc[2]
satisfaction_code_b = satisfaction_codes.iloc[1]

print(f"Customer A's satisfaction ('{df_example['satisfaction'].iloc[2]}') has code: {satisfaction_code_a}")
print(f"Customer B's satisfaction ('{df_example['satisfaction'].iloc[1]}') has code: {satisfaction_code_b}")

# A's code is greater exactly when B's code is smaller
is_more_satisfied_by_code = satisfaction_code_b < satisfaction_code_a

print(f"Is Customer A's satisfaction code greater than Customer B's code: {is_more_satisfied_by_code}")

In [None]:
# Comparing the column against a label is element-wise: the result holds one
# True/False value per row rather than a single answer
print("\nAre any satisfaction values greater than 'Medium'?")
above_medium = df_example['satisfaction'] > 'Medium'
display(above_medium)

## Measures of Central Tendency

We can use `.mean()` for the mean, `.mode()` to get all modes, and `.median()` to get the median

In [None]:
# tripduration is in seconds; divide by 60 (seconds per minute) to get a
# 'duration' column in minutes
df['duration'] = df['tripduration'] / 60

# describe() reports count, mean, std, min, the quartiles (50% is the median)
# and max, so the label says "summary statistics" rather than only mean/median
print("Summary statistics of df['duration']:")
df['duration'].describe()

In [None]:
# Arithmetic mean of the 'duration' column, reported to two decimals
duration_mean = df['duration'].mean()

print(f"Mean of df['duration']: {round(duration_mean, 2)}")

Mean of df['duration']: 13.71


In [None]:
# Middle value of the 'duration' column, reported to two decimals
duration_median = df['duration'].median()

print(f"Median of df['duration']: {round(duration_median, 2)}")

In [None]:
# Computing the mode
# .mode() returns every value tied for most frequent, so the result is kept
# as a list because there can be more than one mode
duration_modes = df['duration'].mode().tolist()

for mode_value in duration_modes:
    # Label fixed: the closing quote after 'duration' was missing
    print("Mode of df['duration']: " + str(round(mode_value, 2)))

Mode of df['duration]: 7.9


## Measures of Dispersion or Spread

We can use `.std()` for the standard deviation, and `.var()` for variance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Two stacked panels: a histogram on top and a boxplot underneath.
# What the boxplot means is covered in more detail in the next class.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
hist_ax, box_ax = axes

# Top panel: histogram of trip duration using 50 bins
sns.histplot(df['duration'], bins=50, ax=hist_ax)
hist_ax.set(
    title='Histogram of Trip Duration',
    xlabel='Duration (minutes)',
    ylabel='Frequency',
)

# Bottom panel: a single boxplot of duration over all rows, duration on the x-axis
sns.boxplot(x='duration', data=df, ax=box_ax)
box_ax.set(
    title='Boxplot of Trip Duration',
    xlabel='Duration (minutes)',
    ylabel='',
)

# Keep titles and axis labels from overlapping, then render the figure
plt.tight_layout()
plt.show()

In [None]:
trip_minutes = df['duration']

# Range: distance between the largest and the smallest duration
duration_max, duration_min = trip_minutes.max(), trip_minutes.min()
duration_range = duration_max - duration_min

print(f"Range of df['duration'] (Max - Min): {duration_range:.2f} minutes")

# Interquartile range: 75th percentile minus 25th percentile
Q1, Q3 = trip_minutes.quantile([0.25, 0.75])
IQR = Q3 - Q1

print(f"Interquartile Range (IQR) of df['duration'] (Q3 - Q1): {IQR:.2f} minutes")

In [None]:
# Standard deviation of the 'duration' column, rounded to two decimals
std_dev = df['duration'].std()
print(f"Standard Deviation of df['duration']: {round(std_dev, 2)} minutes")

# Variance of the same column; its unit is the square of the column's unit
variance = df['duration'].var()
print(f"Variance of df['duration']: {round(variance, 2)} square minutes")

### Let's do a comparison of the dispersion of the duration by gender

Which gender's trip durations show the most variance?

In [None]:
# Mean trip duration for each gender.
# observed=True limits the result to categories that actually occur in the data
# and avoids the FutureWarning pandas emits when a categorical column is grouped
# without an explicit `observed` argument.
df.groupby('gender', observed=True)['duration'].mean()

In [None]:
# One box per gender so the spread of trip duration can be compared side by side.
# Title and axis labels (with units) match the other figures in this notebook.
ax = sns.boxplot(x='gender', y='duration', data=df)
ax.set_title('Boxplot of Trip Duration by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Duration (minutes)')
# Render explicitly so the cell does not echo the Axes object
plt.show()

Let's check whether your answer is correct.

In [None]:
# Standard deviation of trip duration for each gender.
# observed=True limits the result to categories that actually occur in the data
# and avoids the FutureWarning pandas emits when a categorical column is grouped
# without an explicit `observed` argument.
df.groupby('gender', observed=True)['duration'].std()

## Measures of Association

To calculate the correlation between two specific columns, such as `'duration'` and `'birth year'`, you can use the `.corr()` method on one column and pass the other column as an argument. This gives you a single value representing their linear relationship.

In [None]:
# Scatter plot of birth year (x-axis) against trip duration (y-axis)
import matplotlib.pyplot as plt

plt.scatter(x='birth year', y='duration', data=df)
plt.xlabel('Birth Year')
plt.ylabel('Duration (minutes)')
plt.title('Birth Year vs. Trip Duration')
# Render explicitly so the cell does not echo the Text object returned by plt.title
plt.show()

In [None]:
# Correlation between 'duration' and 'birth year' (Series.corr uses Pearson by default)
correlation = df['duration'].corr(df['birth year'])
print(f"Correlation between 'duration' and 'birth year': {round(correlation, 4)}")

### Creating a Correlation Heatmap

A correlation heatmap uses a color-coded matrix to show the correlation coefficients between different pairs of numerical variables. Values closer to 1 (red) indicate a strong positive correlation, values closer to -1 (blue) indicate a strong negative correlation, and values near 0 (white/light colors) indicate a weak or no linear correlation. This is particularly useful for quickly identifying relationships within your dataset.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Relies on df (with its 'duration' column) created in earlier cells.

# Numeric columns to compare: trip duration plus start/end station coordinates
corr_vars = [
    'duration',
    'start station latitude',
    'start station longitude',
    'end station latitude',
    'end station longitude',
]

# Pairwise correlation coefficients between the selected columns
correlation_matrix = df.loc[:, corr_vars].corr()

# Draw the matrix as a heatmap: values printed in each cell, 'coolwarm'
# colour map, colour scale pinned to the range -1 to 1
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=-1, vmax=1)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.show()