In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import requests
import time

In [6]:
data = pd.read_csv('ensenada_mexico_hemoglobin_full.csv')
data

Unnamed: 0,index,age,sex,hemoglobin,city_country,country
0,1,8,F,13.1,"Ensenada, MX",Mexico
1,2,4,M,12.9,"Ensenada, MX",Mexico
2,3,6,M,12.6,"Ensenada, MX",Mexico
3,4,24,F,13.0,"Ensenada, MX",Mexico
4,5,87,F,11.5,"Ensenada, MX",Mexico
...,...,...,...,...,...,...
73,74,30,M,15.9,"Ensenada, MX",Mexico
74,75,31,M,14.4,"Ensenada, MX",Mexico
75,76,22,F,13.3,"Ensenada, MX",Mexico
76,77,57,M,15.3,"Ensenada, MX",Mexico


In [7]:
data = data.drop(columns = 'index')
data

Unnamed: 0,age,sex,hemoglobin,city_country,country
0,8,F,13.1,"Ensenada, MX",Mexico
1,4,M,12.9,"Ensenada, MX",Mexico
2,6,M,12.6,"Ensenada, MX",Mexico
3,24,F,13.0,"Ensenada, MX",Mexico
4,87,F,11.5,"Ensenada, MX",Mexico
...,...,...,...,...,...
73,30,M,15.9,"Ensenada, MX",Mexico
74,31,M,14.4,"Ensenada, MX",Mexico
75,22,F,13.3,"Ensenada, MX",Mexico
76,57,M,15.3,"Ensenada, MX",Mexico


In [8]:
data.describe()

Unnamed: 0,hemoglobin
count,78.0
mean,13.869231
std,1.824103
min,5.6
25%,13.1
50%,14.15
75%,15.175
max,17.0


In [9]:
import plotly.express as px


In [19]:
data['age'] = data['age'].replace('Null', np.nan)
data['age'] = pd.to_numeric(data['age'], errors='coerce')

# Drop rows with missing ages or hemoglobin values
data = data.dropna(subset=['age', 'hemoglobin'])


In [20]:
scatter_fig = px.scatter(
    data,
    x='age',
    y='hemoglobin',
    color='sex',
    title='Hemoglobin Levels by Age and Sex',
    labels={'age': 'Age', 'hemoglobin': 'Hemoglobin (g/dL)', 'sex': 'Sex'},
)
scatter_fig.show()


In [21]:
box_fig = px.box(
    data,
    x='sex',
    y='hemoglobin',
    title='Hemoglobin Distribution by Sex',
    labels={'sex': 'Sex', 'hemoglobin': 'Hemoglobin (g/dL)'},
    points='all'  # Shows all data points over the boxplot
)
box_fig.show()


In [22]:
# Define age bins and labels
bins = [0, 12, 18, 40, 60, 100]
labels = ['Child', 'Teen', 'Young Adult', 'Middle Age', 'Senior']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels)

# Group and calculate average hemoglobin
age_group_avg = data.groupby('age_group', observed=True)['hemoglobin'].mean().reset_index()

# Create bar plot
bar_fig = px.bar(
    age_group_avg,
    x='age_group',
    y='hemoglobin',
    title='Average Hemoglobin by Age Group',
    labels={'age_group': 'Age Group', 'hemoglobin': 'Avg Hemoglobin (g/dL)'}
)
bar_fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

