- Create a new notebook (don't forget the imports)
- Name the notebook **Diabetes Analysis Dashboard**
- Read in the `diabetes_for_plotly` dataset
- Group the data as needed
- Use Plotly Express or Plotly graph objects
- Create a scatter plot of any two measures.  Use a third measure to adjust the size.  Color by a categorical value. Add hover text to show the age group.
- Create a side-by-side bar chart showing the number of lab procedures and the number of non-lab procedures by gender.
- Create a line chart showing the number of medications by month.
- Create a line chart showing the number of procedures by month.
- Create a fifth chart of your choice (NOT scatter, bar or line) using the documentation.

# Import Required Libraries
Import the necessary libraries, including pandas and plotly.

# Read in the Dataset
Read the diabetes_for_plotly dataset into a pandas DataFrame.

In [None]:
# Import Required Libraries

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import io

# Read in the Dataset

# Download the CSV. Fail loudly on an HTTP error instead of parsing an error
# page as data, and bound the wait so the cell cannot hang indefinitely.
response = requests.get(
    "https://bitbucket.org/jimcody/sampledata/raw/b2aa6df015816ec35afc482b53df1b7ca7a31f80/diabetes_for_plotly.csv",
    timeout=30,
)
response.raise_for_status()
data = response.content.decode("utf-8")

# Convert the CSV data to a pandas DataFrame
df = pd.read_csv(io.StringIO(data))

# Standardize the 'gender' column values: strip surrounding whitespace, then
# capitalize (first character upper-case, the rest lower-case).
df["gender"] = df["gender"].str.strip().str.capitalize()

# Replace common variants, typos, and invalid entries with standard values.
# Keys are written in the capitalized form produced above, because that is the
# only form a value can still have at this point.
df["gender"] = df["gender"].replace(
    {
        "M": "Male",
        "F": "Female",
        "Mle": "Male",
        "?": "Unknown/Invalid",
        "Unknown/invalid": "Unknown/Invalid",
    }
)
# Display the updated DataFrame
df.head()

In [None]:
# Import Required Libraries

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import io

# Read in the Dataset

# Download the CSV. Fail loudly on an HTTP error instead of parsing an error
# page as data, and bound the wait so the cell cannot hang indefinitely.
response = requests.get(
    "https://bitbucket.org/jimcody/sampledata/raw/b2aa6df015816ec35afc482b53df1b7ca7a31f80/diabetes_for_plotly.csv",
    timeout=30,
)
response.raise_for_status()
data = response.content.decode("utf-8")

# Convert the CSV data to a pandas DataFrame
df = pd.read_csv(io.StringIO(data))

# Scrub and Normalize Gender Data

# Mapping from normalized (stripped, lower-cased) raw values to display labels.
# Every key must be lower-case because the lookup happens after `.str.lower()`.
gender_mapping = {
    "male": "Male",
    "female": "Female",
    "mle": "Male",
    "ml": "Male",
    "m": "Male",
    "f": "Female",
}

# Apply the mapping to the gender column. Values with no mapping (including
# missing ones) are labelled "Unknown/Invalid" rather than left as NaN, so the
# rows are not silently dropped by the later group-by.
df["gender"] = (
    df["gender"].str.strip().str.lower().map(gender_mapping).fillna("Unknown/Invalid")
)

# Display the first few rows of the DataFrame to verify changes
print(df.head())

# Check the column names
print(df.columns)

# Group Data as Needed

# Per-gender totals of 'num_lab_procedures' and 'num_procedures'
grouped_gender = (
    df.groupby("gender")[["num_lab_procedures", "num_procedures"]].sum().reset_index()
)

# The monthly grouping is only possible when an 'admission_date' column exists
if "admission_date" in df.columns:
    df["admission_date"] = pd.to_datetime(df["admission_date"])
    # Bucket by calendar month, stored as a month-start timestamp: Plotly cannot
    # JSON-serialize pandas Period values, so a Period column breaks the charts.
    df["month"] = df["admission_date"].dt.to_period("M").dt.to_timestamp()
    # Per-month totals of medications and procedures
    grouped_month = (
        df.groupby("month")[["num_medications", "num_procedures"]].sum().reset_index()
    )
else:
    print("Column 'admission_date' does not exist in the dataset.")
    # Empty placeholder that still carries the expected column names
    grouped_month = pd.DataFrame(
        columns=["month", "num_medications", "num_procedures"]
    )

# Display the grouped data
print(grouped_gender.head())
print(grouped_month.head())

# Create Scatter Plot

# Scatter plot of 'num_lab_procedures' (x) vs 'num_medications' (y).
# Marker size follows 'num_procedures', color follows 'gender', and the
# 'age' value is shown as the hover title.

fig = px.scatter(
    df,
    x="num_lab_procedures",
    y="num_medications",
    size="num_procedures",
    color="gender",
    hover_name="age",
    title="Scatter Plot of Lab Procedures vs Medications",
    opacity=0.6,  # Add transparency to reduce the effect of overlap
)

# Rescale the markers and outline them so overlapping points stay distinguishable.
# The sizeref formula 2 * max(size) / desired_max_px**2 is the area-based scaling,
# so it must be paired with sizemode="area"; with "diameter" the largest marker
# would be size / sizeref = 800 px wide.
fig.update_traces(
    marker=dict(
        sizemode="area",
        # Series.max() skips NaN, unlike the builtin max()
        sizeref=2.0 * df["num_procedures"].max() / (40.0**2),
        line=dict(width=2, color="DarkSlateGrey"),
    )
)

fig.show()

In [None]:
# Import Required Libraries

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import io

# Read in the Dataset

# Download the CSV. Fail loudly on an HTTP error instead of parsing an error
# page as data, and bound the wait so the cell cannot hang indefinitely.
response = requests.get(
    "https://bitbucket.org/jimcody/sampledata/raw/b2aa6df015816ec35afc482b53df1b7ca7a31f80/diabetes_for_plotly.csv",
    timeout=30,
)
response.raise_for_status()
data = response.content.decode("utf-8")

# Convert the CSV data to a pandas DataFrame
df = pd.read_csv(io.StringIO(data))

# Scrub and Normalize Gender Data

# Mapping from normalized (stripped, lower-cased) raw values to display labels.
# Every key must be lower-case because the lookup happens after `.str.lower()`.
gender_mapping = {
    "male": "Male",
    "female": "Female",
    "mle": "Male",
    "ml": "Male",
    "m": "Male",
    "f": "Female",
}

# Apply the mapping to the gender column. Values with no mapping (including
# missing ones) are labelled "Unknown/Invalid" rather than left as NaN, so the
# rows are not silently dropped by the later group-by.
df["gender"] = (
    df["gender"].str.strip().str.lower().map(gender_mapping).fillna("Unknown/Invalid")
)

# Display the first few rows of the DataFrame to verify changes
print(df.head())

# Check the column names
print(df.columns)

# Group Data as Needed

# Per-gender totals of 'num_lab_procedures' and 'num_procedures'
grouped_gender = (
    df.groupby("gender")[["num_lab_procedures", "num_procedures"]].sum().reset_index()
)

# The monthly grouping is only possible when an 'admission_date' column exists
if "admission_date" in df.columns:
    df["admission_date"] = pd.to_datetime(df["admission_date"])
    # Bucket by calendar month, stored as a month-start timestamp: Plotly cannot
    # JSON-serialize pandas Period values, so a Period column breaks the charts.
    df["month"] = df["admission_date"].dt.to_period("M").dt.to_timestamp()
    # Per-month totals of medications and procedures
    grouped_month = (
        df.groupby("month")[["num_medications", "num_procedures"]].sum().reset_index()
    )
else:
    print("Column 'admission_date' does not exist in the dataset.")
    # Empty placeholder that still carries the expected column names
    grouped_month = pd.DataFrame(
        columns=["month", "num_medications", "num_procedures"]
    )

# Display the grouped data
print(grouped_gender.head())
print(grouped_month.head())

# Create Scatter Plot

# Scatter plot of 'num_lab_procedures' (x) vs 'num_medications' (y).
# Marker size follows 'num_procedures', color follows 'gender', and the
# 'age' value is shown as the hover title.

fig = px.scatter(
    df,
    x="num_lab_procedures",
    y="num_medications",
    size="num_procedures",
    color="gender",
    hover_name="age",
    title="Scatter Plot of Lab Procedures vs Medications",
    opacity=0.6,  # Add transparency to reduce the effect of overlap
    height=800,  # Explicit figure height in pixels for a taller chart
)

# Rescale the markers and outline them so overlapping points stay distinguishable.
# The sizeref formula 2 * max(size) / desired_max_px**2 is the area-based scaling,
# so it must be paired with sizemode="area"; with "diameter" the largest marker
# would be size / sizeref = 800 px wide.
fig.update_traces(
    marker=dict(
        sizemode="area",
        # Series.max() skips NaN, unlike the builtin max()
        sizeref=2.0 * df["num_procedures"].max() / (40.0**2),
        line=dict(width=2, color="DarkSlateGrey"),
    )
)

fig.show()

# Group Data as Needed
Group the data as needed for the analysis.

In [None]:
# Import Required Libraries

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import io


# Display the first few rows of the DataFrame
# (`df` is expected to have been loaded by an earlier cell)
print(df.head())

# Check the column names
print(df.columns)

# Group Data as Needed

# Per-gender totals of 'num_lab_procedures' and 'num_procedures'
grouped_gender = (
    df.groupby("gender")[["num_lab_procedures", "num_procedures"]].sum().reset_index()
)

# The monthly grouping is only possible when an 'admission_date' column exists
if "admission_date" in df.columns:
    df["admission_date"] = pd.to_datetime(df["admission_date"])
    # Bucket by calendar month, stored as a month-start timestamp: Plotly cannot
    # JSON-serialize pandas Period values, so a Period column breaks the charts.
    df["month"] = df["admission_date"].dt.to_period("M").dt.to_timestamp()
    # Per-month totals of medications and procedures
    grouped_month = (
        df.groupby("month")[["num_medications", "num_procedures"]].sum().reset_index()
    )
else:
    print("Column 'admission_date' does not exist in the dataset.")
    # Empty placeholder that still carries the expected column names
    grouped_month = pd.DataFrame(
        columns=["month", "num_medications", "num_procedures"]
    )

# Display the grouped data
print(grouped_gender.head())
print(grouped_month.head())

# Create Scatter Plot

# Bubble-style scatter: lab procedures on x, medications on y.
# Bubble size comes from 'num_procedures', color from 'gender',
# and each point's hover title is its 'age' value.
fig = px.scatter(
    data_frame=df,
    title="Scatter Plot of Lab Procedures vs Medications",
    x="num_lab_procedures",
    y="num_medications",
    color="gender",
    size="num_procedures",
    hover_name="age",
)
fig.show()

# Create Side-by-Side Bar Chart

# One bar trace per measure, both sharing the gender categories on the x axis;
# barmode="group" places the two bars for each gender next to each other.
fig = go.Figure(
    data=[
        go.Bar(
            name="Lab Procedures",
            x=grouped_gender["gender"],
            y=grouped_gender["num_lab_procedures"],
        ),
        go.Bar(
            name="Non-Lab Procedures",
            x=grouped_gender["gender"],
            y=grouped_gender["num_procedures"],
        ),
    ]
)

fig.update_layout(barmode="group", title="Lab and Non-Lab Procedures by Gender")
fig.show()

# Create Line Chart for Number of Medications by Month

# Skipped when no monthly data could be computed (grouped_month is empty)
if not grouped_month.empty:
    fig = px.line(
        grouped_month,
        x="month",
        y="num_medications",
        title="Number of Medications by Month",
    )
    fig.show()

# Create Line Chart for Number of Procedures by Month

# Skipped when no monthly data could be computed (grouped_month is empty)
if not grouped_month.empty:
    fig = px.line(
        grouped_month,
        x="month",
        y="num_procedures",
        title="Number of Procedures by Month",
    )
    fig.show()

# Create Fifth Chart of Your Choice (e.g., Pie Chart)

# Pie chart of 'num_medications' split by 'gender'
fig = px.pie(
    df,
    names="gender",
    values="num_medications",
    title="Distribution of Medications by Gender",
)
fig.show()

# Create Scatter Plot
Create a scatter plot of any two measures, using a third measure to adjust the size, and color by a categorical value. Add hover text to show the age group.

In [None]:
# Create Scatter Plot

# Bubble-style scatter: lab procedures on x, medications on y.
# Bubble size comes from 'num_procedures', color from 'gender',
# and each point's hover title is its 'age' value.
fig = px.scatter(
    data_frame=df,
    title="Scatter Plot of Lab Procedures vs Medications",
    x="num_lab_procedures",
    y="num_medications",
    color="gender",
    size="num_procedures",
    hover_name="age",
)
fig.show()

# Create Side-by-Side Bar Chart
Create a side-by-side bar chart showing the number of lab procedures and number of non-lab procedures by gender.

In [None]:
# Create Side-by-Side Bar Chart

# Build one bar trace per (legend label, column) pair; every trace uses the
# gender categories from grouped_gender on the x axis.
fig = go.Figure(
    data=[
        go.Bar(name=label, x=grouped_gender["gender"], y=grouped_gender[column])
        for label, column in (
            ("Lab Procedures", "num_lab_procedures"),
            ("Non-Lab Procedures", "num_procedures"),
        )
    ]
)

# Grouped mode places the bars for each gender side by side
fig.update_layout(
    title="Number of Lab and Non-Lab Procedures by Gender",
    barmode="group",
)
fig.show()

# Create Line Chart for Number of Medications by Month
Create a line chart showing the number of medications by month.

In [None]:
# Create Line Chart for Number of Medications by Month

# Line chart of the number of medications by month.
# The x column is "month" (the grouping key in grouped_month), not the name of
# the DataFrame variable. grouped_month is an empty placeholder when no monthly
# data could be computed, so plotting is skipped in that case instead of raising.
if not grouped_month.empty:
    fig = px.line(
        grouped_month,
        x="month",
        y="num_medications",
        title="Number of Medications by Month",
    )

    fig.show()
else:
    print("No monthly data available; skipping the medications line chart.")

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of [] but received: grouped_month

# Create Line Chart for Number of Procedures by Month
Create a line chart showing the number of procedures by month.

In [None]:
# Create Line Chart for Number of Procedures by Month

# Line chart of the number of procedures by month.
# grouped_month is an empty placeholder when no monthly data could be computed,
# so plotting is skipped in that case instead of raising.
if not grouped_month.empty:
    fig = px.line(
        grouped_month,
        x="month",
        y="num_procedures",
        title="Number of Procedures by Month",
    )

    fig.show()
else:
    print("No monthly data available; skipping the procedures line chart.")

# Create Fifth Chart of Your Choice
Create a fifth chart of your choice using the Plotly documentation, ensuring it is not a scatter, bar, or line chart.

In [None]:
# Create Fifth Chart of Your Choice

# Pie chart with one slice per distinct 'age' value in df
fig = px.pie(
    data_frame=df,
    title="Distribution of Age Groups",
    names="age",
)
fig.show()