## Data visualization (suggestions)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import plotly.io as pio
# Make "notebook" the default Plotly renderer for every fig.show() call below
pio.renderers.default = "notebook"

#### Understanding the available datasets

#### 1. Penguins dataset

- The penguins dataset contains measurements for three species of penguins (Adelie, Chinstrap, Gentoo) observed on different islands in the Palmer Archipelago, Antarctica.
- It includes features such as bill length and depth, flipper length, body mass, sex, and species.
- This dataset is commonly used for data visualization and machine learning exercises as an alternative to the classic iris dataset.
- It provides a real-world example for exploring classification, visualization, and data cleaning techniques.

In [None]:
# Load the penguins dataset from the local CSV file
penguins_df = pd.read_csv('data/penguins.csv')

In [None]:
# Preview the first rows to check the columns that were loaded
penguins_df.head()

#### 2. Car Crashes Dataset

- The car crashes dataset contains data on traffic accidents, including the number of crashes, injuries, and fatalities by state or region.
- It typically includes features such as total crashes, alcohol-involved crashes, speeding-related crashes, and population statistics.
- This dataset is widely used for data visualization, exploratory data analysis, and statistical modeling to understand factors contributing to road accidents.
- It provides a practical example for learning about correlation, regression, and geospatial analysis in Python.

In [None]:
# Load the car crashes dataset from the local CSV file
car_crashes = pd.read_csv('data/car_crashes.csv')

# Preview the first 10 rows
car_crashes.head(10)

#### 3. Chlorophyll Concentration Analysis

The `data/chla_subset.csv` dataset contains chlorophyll-a (chla) predictions for various water bodies, such as lakes and reservoirs. Each row represents a measurement event, including the following columns:

- `gnis_name`: Name of the water body (e.g., "Pepacton Reservoir", "Lake Montauk").
- `comid`: Unique identifier for the water body.
- `centroid_longitude` and `centroid_latitude`: Geographic coordinates of the water body's centroid.
- `date_acquired`: Date when the measurement or prediction was made.
- `predictions`: Predicted chlorophyll-a concentration (likely in µg/L).

This dataset is useful for analyzing spatial and temporal patterns of chlorophyll-a, which is an important indicator of water quality and algal biomass.

In [None]:
# Read the chlorophyll-a CSV and convert the acquisition date column to
# datetimes in a single chained expression.
chla = pd.read_csv('data/chla_subset.csv').assign(
    date_acquired=lambda frame: pd.to_datetime(frame['date_acquired'])
)

# Rich preview of the first rows
display(chla.head())

#### 4. Any other dataset of your choice

Here are some Seaborn datasets for inspiration: https://github.com/mwaskom/seaborn-data

## Exercise!

#### Objective: Create some data visualizations that we can reuse for the rest of the tutorial.

### If you selected the `Penguins` dataset

#### 1. Create a bar chart showing how many penguins of each species are in the dataset.
Hint: plt.bar(), sns.countplot() or px.bar().

In [None]:
# Seaborn approach: count the rows per species on an explicitly created Axes.
# `fig` and `ax` remain defined afterwards so the chart can be reused.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='species', data=penguins_df, ax=ax)
ax.set(title='Number of Penguins by Species', xlabel='Species', ylabel='Count')
plt.show()

In [None]:
# Plotly Express approach: count penguins per species.
# The index and the count column are named explicitly so the frame always has
# the columns ('species', 'count'). A bare value_counts().reset_index() names
# them ('index', 'species') on pandas < 2.0, which breaks the x/y lookups below.
species_counts = (
    penguins_df['species']
    .value_counts()
    .rename_axis('species')
    .reset_index(name='count')
)

fig = px.bar(species_counts,
             x='species',
             y='count',
             title='Number of Penguins by Species')
fig.update_layout(xaxis_title='Species', yaxis_title='Count')
# Show the figure explicitly, like the other Plotly cells in this notebook
fig.show()

#### 2. Make a histogram of penguin body weights to see the distribution.
Hint: plt.hist(), sns.histplot() or px.histogram().

In [None]:
# Seaborn approach: histogram of body mass on an explicitly created Axes.
# `fig` and `ax` remain defined afterwards so the chart can be reused.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(x='body_mass_g', data=penguins_df, ax=ax)
ax.set(title='Distribution of Penguin Body Weights', xlabel='Body Mass (g)', ylabel='Count')
plt.show()

In [None]:
# Plotly Express approach: build the histogram and set the axis titles in one chain
fig = px.histogram(
    data_frame=penguins_df,
    x='body_mass_g',
    title='Distribution of Penguin Body Weights',
).update_layout(xaxis_title='Body Mass (g)', yaxis_title='Count')
fig.show()

#### 3. Create a scatter plot comparing bill length vs bill depth, with different colors for each species.
Hint: plt.scatter(), sns.scatterplot() or px.scatter().

In [None]:
# Seaborn approach: bill length against bill depth, one colour per species.
# `fig` and `ax` remain defined afterwards so the chart can be reused.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    data=penguins_df,
    ax=ax,
)
ax.set(
    title='Bill Length vs Bill Depth by Species',
    xlabel='Bill Length (mm)',
    ylabel='Bill Depth (mm)',
)
plt.show()

In [None]:
# Plotly Express approach: same comparison, coloured by species
fig = px.scatter(
    data_frame=penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    color='species',
    title='Bill Length vs Bill Depth by Species',
).update_layout(xaxis_title='Bill Length (mm)', yaxis_title='Bill Depth (mm)')
fig.show()

#### 4. Make a box plot showing flipper length for each penguin species.
Hint: plt.boxplot(), sns.boxplot() or px.box().

In [None]:
# Seaborn approach: one box of flipper lengths per species.
# `fig` and `ax` remain defined afterwards so the chart can be reused.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='species', y='flipper_length_mm', data=penguins_df, ax=ax)
ax.set(title='Flipper Length by Species', xlabel='Species', ylabel='Flipper Length (mm)')
plt.show()

In [None]:
# Plotly Express approach: box plot of flipper length grouped by species
fig = px.box(
    data_frame=penguins_df,
    x='species',
    y='flipper_length_mm',
    title='Flipper Length by Species',
).update_layout(xaxis_title='Species', yaxis_title='Flipper Length (mm)')
fig.show()

#### 5. Create a line plot showing the average body mass for each species across different islands.
Hint: plt.plot(), sns.lineplot() or px.line().

In [None]:
# Seaborn approach: lineplot aggregates repeated x values with the mean by
# default, so each line traces the average body mass per island for a species.
# `fig` and `ax` remain defined afterwards so the chart can be reused.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(
    data=penguins_df,
    x='island',
    y='body_mass_g',
    hue='species',
    # A species recorded on only one island yields a single point, which a
    # bare line cannot render; markers keep such species visible.
    marker='o',
    ax=ax,
)
ax.set_title('Average Body Mass by Species Across Islands')
ax.set_xlabel('Island')
ax.set_ylabel('Body Mass (g)')
plt.show()

In [None]:
# Plotly Express approach.
# px.line does not aggregate: given the raw rows it would join every individual
# penguin into a zigzag. Compute the mean body mass per species and island
# first so the chart shows the averages promised by the title.
mean_mass_by_island = (
    penguins_df
    .groupby(['species', 'island'], as_index=False)['body_mass_g']
    .mean()
)

fig = px.line(mean_mass_by_island, x='island', y='body_mass_g',
              color='species',
              title='Average Body Mass by Species Across Islands')
# Markers keep a species visible even when it has a single island (one point)
fig.update_traces(mode='lines+markers')
fig.update_layout(xaxis_title='Island', yaxis_title='Body Mass (g)')
fig.show()

### If you selected the `crashes` dataset

#### 1. Create a bar chart showing the total number of car crashes by state.
Hint: plt.bar(), sns.barplot() or px.bar().

In [None]:
# Seaborn approach: one bar per state abbreviation
plt.figure(figsize=(12, 6))
sns.barplot(x='abbrev', y='total', data=car_crashes).set(
    title='Total Car Crashes by State',
    xlabel='State',
    ylabel='Total Crashes',
)
plt.xticks(rotation=45)  # tilt the state labels
plt.show()

In [None]:
# Plotly Express approach: build the bar chart and set the axis titles in one chain
fig = px.bar(
    data_frame=car_crashes,
    x='abbrev',
    y='total',
    title='Total Car Crashes by State',
).update_layout(xaxis_title='State', yaxis_title='Total Crashes')
fig.show()


#### 2. Make a histogram of insurance premiums to see the distribution across states.
Hint: plt.hist(), sns.histplot() or px.histogram().

In [None]:
# Seaborn approach: static histogram of the `ins_premium` column
plt.figure(figsize=(8, 6))
sns.histplot(x='ins_premium', data=car_crashes).set(
    title='Distribution of Insurance Premiums Across States',
    xlabel='Insurance Premium',
    ylabel='Count',
)
plt.show()

# Plotly Express approach: interactive histogram of the same column
fig = px.histogram(
    data_frame=car_crashes,
    x='ins_premium',
    title='Distribution of Insurance Premiums Across States',
).update_layout(xaxis_title='Insurance Premium', yaxis_title='Count')
fig.show()

#### 3. Create a scatter plot comparing total crashes vs speeding-related crashes, with state abbreviations as labels.
Hint: plt.scatter(), sns.scatterplot() or px.scatter().

In [None]:
# Seaborn approach: scatter of `total` against `speeding`
plt.figure(figsize=(8, 6))
sns.scatterplot(x='total', y='speeding', data=car_crashes).set(
    title='Total Crashes vs Speeding-Related Crashes',
    xlabel='Total Crashes',
    ylabel='Speeding-Related Crashes',
)
# Write each state's abbreviation next to its point
for state, total_value, speeding_value in zip(
    car_crashes['abbrev'], car_crashes['total'], car_crashes['speeding']
):
    plt.annotate(state, (total_value, speeding_value))
plt.show()

# Plotly Express approach: the `text` argument supplies the point labels
fig = (
    px.scatter(
        data_frame=car_crashes,
        x='total',
        y='speeding',
        text='abbrev',
        title='Total Crashes vs Speeding-Related Crashes',
    )
    .update_traces(textposition='top center')
    .update_layout(xaxis_title='Total Crashes', yaxis_title='Speeding-Related Crashes')
)
fig.show()

In [None]:
# Plotly Express choropleth: colour each US state (matched through its
# abbreviation) by the `total` column, on a transparent map background.
fig = px.choropleth(
    data_frame=car_crashes,
    locations='abbrev',
    locationmode='USA-states',
    scope='usa',
    color='total',
    color_continuous_scale='Reds',
    labels={'total': 'Total Crashes'},
    title='Total Car Crashes by US State',
).update_layout(geo={'bgcolor': 'rgba(0,0,0,0)'})
fig.show()

#### 4. Make a box plot showing the distribution of alcohol-related crashes.
Hint: plt.boxplot(), sns.boxplot() or px.box().

In [None]:
# Seaborn approach: a single box summarising the `alcohol` column
plt.figure(figsize=(8, 6))
sns.boxplot(y='alcohol', data=car_crashes).set(
    title='Distribution of Alcohol-Related Crashes',
    ylabel='Alcohol-Related Crashes',
)
plt.show()

# Plotly Express approach: interactive box plot of the same column
fig = px.box(
    data_frame=car_crashes,
    y='alcohol',
    title='Distribution of Alcohol-Related Crashes',
).update_layout(yaxis_title='Alcohol-Related Crashes')
fig.show()

#### 5. Create a bar chart comparing insurance premiums vs insurance losses by state.
Hint: plt.bar(), sns.barplot() or px.bar().

In [None]:
# Reshape to long form: one row per (state, insurance measure) pair, so both
# libraries can draw premiums and losses as side-by-side bars.
car_crashes_melted = pd.melt(
    car_crashes,
    id_vars=['abbrev'],
    value_vars=['ins_premium', 'ins_losses'],
    var_name='insurance_type',
    value_name='amount',
)

# Seaborn approach: `hue` splits each state into two bars
plt.figure(figsize=(12, 6))
sns.barplot(x='abbrev', y='amount', hue='insurance_type', data=car_crashes_melted).set(
    title='Insurance Premiums vs Insurance Losses by State',
    xlabel='State',
    ylabel='Amount',
)
plt.xticks(rotation=45)  # tilt the state labels
plt.show()

# Plotly Express approach: barmode='group' places the two measures side by side
fig = px.bar(
    data_frame=car_crashes_melted,
    x='abbrev',
    y='amount',
    color='insurance_type',
    barmode='group',
    title='Insurance Premiums vs Insurance Losses by State',
).update_layout(xaxis_title='State', yaxis_title='Amount')
fig.show()

### If you selected the `chlorophyll` dataset

#### 1. Create a histogram of chlorophyll-a predictions to see the distribution of concentration levels.
Hint: plt.hist(), sns.histplot() or px.histogram().

In [None]:
# Seaborn approach: static histogram of the `predictions` column
plt.figure(figsize=(8, 6))
sns.histplot(x='predictions', data=chla).set(
    title='Distribution of Chlorophyll-a Concentration Levels',
    xlabel='Chlorophyll-a Predictions (µg/L)',
    ylabel='Count',
)
plt.show()

# Plotly Express approach: interactive histogram of the same column
fig = px.histogram(
    data_frame=chla,
    x='predictions',
    title='Distribution of Chlorophyll-a Concentration Levels',
).update_layout(xaxis_title='Chlorophyll-a Predictions (µg/L)', yaxis_title='Count')
fig.show()

#### 2. Make a scatter plot showing the geographic distribution of water bodies using longitude and latitude coordinates.
Hint: plt.scatter(), sns.scatterplot() or px.scatter().

In [None]:
# Seaborn approach: plot each row at its centroid longitude/latitude
plt.figure(figsize=(10, 6))
sns.scatterplot(x='centroid_longitude', y='centroid_latitude', data=chla).set(
    title='Geographic Distribution of Water Bodies',
    xlabel='Longitude',
    ylabel='Latitude',
)
plt.show()

# Plotly Express approach: interactive version of the same scatter
fig = px.scatter(
    data_frame=chla,
    x='centroid_longitude',
    y='centroid_latitude',
    title='Geographic Distribution of Water Bodies',
).update_layout(xaxis_title='Longitude', yaxis_title='Latitude')
fig.show()

#### 3. Create a line plot showing how chlorophyll-a predictions change over time (by date_acquired).
Hint: plt.plot(), sns.lineplot() or px.line().

In [None]:
# Seaborn approach: lineplot aggregates rows that share a date (mean by
# default), so the line follows the average prediction per acquisition date.
plt.figure(figsize=(10, 6))
sns.lineplot(data=chla.sort_values('date_acquired'), x='date_acquired', y='predictions')
plt.title('Chlorophyll-a Predictions Over Time')
plt.xlabel('Date Acquired')
plt.ylabel('Chlorophyll-a Predictions (µg/L)')
plt.xticks(rotation=45)
plt.show()

# Plotly Express approach.
# px.line does not aggregate: when several rows share a date (the dataset
# covers multiple water bodies) the raw rows would be joined into vertical
# zigzags. Average per date first so this chart matches the seaborn line.
# groupby returns the dates in sorted order, so no extra sort is needed.
chla_mean_by_date = chla.groupby('date_acquired', as_index=False)['predictions'].mean()

fig = px.line(chla_mean_by_date, x='date_acquired', y='predictions',
              title='Chlorophyll-a Predictions Over Time')
fig.update_layout(xaxis_title='Date Acquired', yaxis_title='Chlorophyll-a Predictions (µg/L)')
fig.show()

#### 4. Make a box plot comparing chlorophyll-a predictions across different water bodies (top 10 most frequent).
Hint: plt.boxplot(), sns.boxplot() or px.box().

In [None]:
# value_counts() orders names from most to least frequent, so the first ten
# index entries are the ten water bodies with the most rows.
top_10_waterbodies = chla['gnis_name'].value_counts().index[:10]
chla_top10 = chla.loc[chla['gnis_name'].isin(top_10_waterbodies)]

# Seaborn approach: one box of predictions per water body
plt.figure(figsize=(12, 6))
sns.boxplot(x='gnis_name', y='predictions', data=chla_top10).set(
    title='Chlorophyll-a Predictions by Water Body (Top 10 Most Frequent)',
    xlabel='Water Body',
    ylabel='Chlorophyll-a Predictions (µg/L)',
)
plt.xticks(rotation=45)  # tilt the water body names
plt.show()

# Plotly Express approach: same comparison with tilted tick labels
fig = (
    px.box(
        data_frame=chla_top10,
        x='gnis_name',
        y='predictions',
        title='Chlorophyll-a Predictions by Water Body (Top 10 Most Frequent)',
    )
    .update_layout(xaxis_title='Water Body', yaxis_title='Chlorophyll-a Predictions (µg/L)')
    .update_xaxes(tickangle=45)
)
fig.show()

#### 5. Create a scatter plot comparing longitude vs chlorophyll-a predictions, with different colors for different concentration ranges.
Hint: plt.scatter(), sns.scatterplot() or px.scatter().

In [None]:
# Bucket the predictions into labelled concentration ranges.
# include_lowest=True closes the first interval on the left, so a prediction of
# exactly 0 lands in 'Low (0-10)' instead of becoming NaN (pd.cut intervals are
# open on the left by default).
range_labels = ['Low (0-10)', 'Medium (10-25)', 'High (25-50)', 'Very High (50+)']
chla['concentration_range'] = pd.cut(chla['predictions'],
                                    bins=[0, 10, 25, 50, float('inf')],
                                    labels=range_labels,
                                    include_lowest=True)

# Seaborn approach: colour each point by its concentration range
plt.figure(figsize=(10, 6))
sns.scatterplot(data=chla, x='centroid_longitude', y='predictions', hue='concentration_range')
plt.title('Longitude vs Chlorophyll-a Predictions by Concentration Range')
plt.xlabel('Longitude')
plt.ylabel('Chlorophyll-a Predictions (µg/L)')
plt.show()

# Plotly Express approach: category_orders pins the legend to the
# low-to-very-high order rather than the order of appearance in the data.
fig = px.scatter(chla, x='centroid_longitude', y='predictions',
                 color='concentration_range',
                 category_orders={'concentration_range': range_labels},
                 title='Longitude vs Chlorophyll-a Predictions by Concentration Range')
fig.update_layout(xaxis_title='Longitude', yaxis_title='Chlorophyll-a Predictions (µg/L)')
fig.show()

### If you selected a different dataset

- What are the 5 most important insights we can get from this?