In [11]:
import os
import pandas as pd
import openpyxl

In [39]:
# Dataset roots. They are assembled with os.path.join instead of a literal
# backslash: "data\Italy" contains the invalid escape sequence "\I" and only
# resolves on Windows, whereas the joined form works on every platform.
ITALY_PATH = os.path.join("data", "Italy")
INDIA_PATH = os.path.join("data", "India")
excel_path = os.path.join(ITALY_PATH, "Italy.xlsx")
excel_path_india = os.path.join(INDIA_PATH, "India.xlsx")

# Load each workbook and keep only the sample number and hemoglobin columns.
df_italy = pd.read_excel(excel_path)
df_italy = df_italy[["Number", "Hgb"]]

df_india = pd.read_excel(excel_path_india)
df_india = df_india[["Number", "Hgb"]]

# Stack both tables into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_italy, df_india], ignore_index=True)
df.head()


Unnamed: 0,Number,Hgb
0,1,9.3
1,2,10.2
2,3,10.7
3,4,11.7
4,5,11.6


In [68]:
import os
import pandas as pd

# Root folder under which build_image_path looks up one sub-folder per sample number.
ITALY_PATH = "data/Italy"

# One pass over the frame:
#   1. normalise column names (trim whitespace, lowercase);
#   2. coerce hemoglobin to numeric, turning unparseable values into NaN;
#   3. drop the rows whose hemoglobin could not be parsed;
#   4. store hemoglobin as float32.
# The real columns here are "number" and "hgb", so no rename to "id" is needed.
df = (
    df.rename(columns=lambda header: header.strip().lower())
    .assign(hgb=lambda frame: pd.to_numeric(frame["hgb"], errors="coerce"))
    .dropna(subset=["hgb"])
    .astype({"hgb": "float32"})
)


# Robust construction of image_path (detects jpg/jpeg/png automatically)
def build_image_path(row, base_path=None):
    """Return the path of the image stored in the folder named after ``row["number"]``.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a
            ``"number"`` entry; the value is truncated to an integer to form
            the folder name.
        base_path: Directory that contains the per-sample folders. When
            omitted, the module-level ``ITALY_PATH`` is used.

    Returns:
        The path of the alphabetically first ``.jpg``/``.jpeg``/``.png`` file
        (case-insensitive extension match) inside the sample folder, or
        ``None`` when the number is missing or non-numeric, the folder does
        not exist, or it holds no such file.
    """
    if base_path is None:
        base_path = ITALY_PATH

    try:
        folder = str(int(row["number"]))
    except (TypeError, ValueError, OverflowError):
        # NaN, infinite or non-numeric sample numbers cannot name a folder.
        return None

    folder_path = os.path.join(base_path, folder)

    # isdir (rather than exists) so a stray regular file with the same name
    # is not passed to os.listdir.
    if not os.path.isdir(folder_path):
        return None

    # os.listdir returns entries in arbitrary order; sorting makes the chosen
    # image reproducible when a folder contains more than one.
    image_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    if not image_files:
        return None

    return os.path.join(folder_path, image_files[0])


# Resolve an image file for every row (None when no usable image is found).
df["image_path"] = df.apply(build_image_path, axis=1)

# Drop rows without a valid image: first those with no path at all, then
# those whose path does not exist on disk.
df = df.dropna(subset=["image_path"])
df = df[df["image_path"].apply(os.path.exists)]

# Add the country column.
# NOTE(review): df was concatenated from both the Italy and the India sheets,
# yet every row is resolved under ITALY_PATH and tagged "Italy" here — confirm
# that India rows are not being paired with Italian images.
df = df.assign(country="Italy")

df.info()


<class 'pandas.DataFrame'>
Index: 202 entries, 0 to 217
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   number      202 non-null    int64   
 1   hgb         202 non-null    float32 
 2   image_path  202 non-null    str     
 3   country     202 non-null    str     
 4   hgb_bin     202 non-null    category
dtypes: category(1), float32(1), int64(1), str(2)
memory usage: 7.4 KB


In [69]:
def preprocess(image_path, label):
    """Load one image file and return it scaled by 1/255 together with its label.

    NOTE(review): a later cell defines another ``preprocess`` (EfficientNet
    preprocessing instead of 1/255 scaling); once that cell runs it shadows
    this definition.

    Args:
        image_path: Path of the image file; converted to a ``tf.string`` tensor.
        label: Value passed through unchanged.

    Returns:
        ``(image, label)`` where ``image`` is a float32 tensor with 3 channels
        resized to ``(IMG_SIZE, IMG_SIZE)``.
    """
    # ensure image_path tensor has string dtype
    image_path = tf.convert_to_tensor(image_path, dtype=tf.string)
    image = tf.io.read_file(image_path)
    # The dataset may contain .jpg/.jpeg as well as .png, so use the
    # format-agnostic decoder. expand_animations=False guarantees a rank-3
    # tensor, which tf.image.resize needs.
    image = tf.io.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    image = tf.cast(image, tf.float32) / 255.0
    return image, label


# NOTE(review): load_data and DATASET_PATH are not defined in any cell visible
# here, and tf is only imported in a later cell — confirm this cell still runs
# on a fresh kernel (Restart & Run All).
image_paths, labels = load_data(DATASET_PATH)

# convert lists to tensors with correct dtypes before creating the dataset
image_paths = tf.constant(image_paths, dtype=tf.string)
labels = tf.constant(labels, dtype=tf.float32)

In [70]:
import numpy as np

# Bin hemoglobin into (up to) 5 quantile buckets so the split can be
# stratified on a continuous target; duplicate bin edges are dropped.
df["hgb_bin"] = pd.qcut(df["hgb"], q=5, duplicates="drop")

# 80/20 stratified split with a fixed seed; the helper bin column is removed
# from both resulting frames.
# NOTE(review): train_test_split is not imported in any cell visible here —
# presumably sklearn.model_selection; confirm the import exists.
train_df, val_df = (
    part.drop(columns=["hgb_bin"])
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df["hgb_bin"],
    )
)


In [71]:
# Summary statistics of hemoglobin for the training split, then the validation split.
print(train_df["hgb"].describe(), val_df["hgb"].describe(), sep="\n")

count    161.000000
mean      12.679502
std        2.387963
min        7.000000
25%       10.700000
50%       13.000000
75%       14.800000
max       17.400000
Name: hgb, dtype: float64
count    41.000000
mean     12.635853
std       2.374454
min       7.800000
25%      11.300000
50%      13.200000
75%      14.500000
max      17.100000
Name: hgb, dtype: float64


In [72]:
import numpy as np

# Standardisation statistics come from the training split only and are then
# applied to both splits.
mean_hgb, std_hgb = train_df["hgb"].mean(), train_df["hgb"].std()

train_df = train_df.assign(hgb_norm=train_df["hgb"].sub(mean_hgb).div(std_hgb))
val_df = val_df.assign(hgb_norm=val_df["hgb"].sub(mean_hgb).div(std_hgb))

print("Mean:", mean_hgb)
print("Std:", std_hgb)


Mean: 12.6795025
Std: 2.3879633


In [None]:
import tensorflow as tf

# Side length, in pixels, that preprocess resizes every image to (square output).
IMG_SIZE = 224
# Number of examples per batch produced by make_dataset.
BATCH_SIZE = 16

def preprocess(image_path, label):
    """Load one image file and prepare it, with its label, for the model.

    Args:
        image_path: Path of the image file to read.
        label: Target value; cast to ``tf.float32``.

    Returns:
        ``(image, label)`` where ``image`` is a float32 tensor with 3 channels
        resized to ``(IMG_SIZE, IMG_SIZE)`` and passed through the EfficientNet
        ``preprocess_input`` function.
    """
    image = tf.io.read_file(image_path)
    # ``tf.image.image`` does not exist (AttributeError); the format-agnostic
    # decoder lives at tf.io.decode_image. expand_animations=False guarantees a
    # rank-3 tensor, which tf.image.resize needs inside Dataset.map.
    image = tf.io.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    image = tf.cast(image, tf.float32)

    image = tf.keras.applications.efficientnet.preprocess_input(image)

    label = tf.cast(label, tf.float32)

    return image, label


In [74]:
# Augmentation pipeline, assembled layer by layer: random horizontal flip,
# random rotation (factor 0.05) and random zoom (factor 0.1).
data_augmentation = tf.keras.Sequential()
data_augmentation.add(tf.keras.layers.RandomFlip("horizontal"))
data_augmentation.add(tf.keras.layers.RandomRotation(0.05))
data_augmentation.add(tf.keras.layers.RandomZoom(0.1))


In [75]:
def make_dataset(df, training=True):
    """Build a batched ``tf.data`` pipeline from a frame of image paths and labels.

    Args:
        df: DataFrame with an ``"image_path"`` column and an ``"hgb_norm"``
            column used as the label.
        training: When true, examples are shuffled and passed through
            ``data_augmentation``; otherwise they are only preprocessed.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of
        ``BATCH_SIZE`` with prefetching enabled.
    """
    image_paths = df["image_path"].values
    labels = df["hgb_norm"].values

    ds = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    if training:
        # Shuffle the cheap (path, label) pairs before any decoding so the
        # shuffle buffer never holds decoded images. A buffer covering the
        # whole set gives a uniform shuffle; max(..., 1) keeps the buffer size
        # valid for an empty frame.
        ds = ds.shuffle(max(len(df), 1))

    ds = ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    if training:
        ds = ds.map(lambda x, y: (data_augmentation(x, training=True), y),
                    num_parallel_calls=tf.data.AUTOTUNE)

    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    return ds


In [76]:
# Build the training pipeline (training=True) and the validation pipeline (training=False).
train_ds, val_ds = (
    make_dataset(train_df, training=True),
    make_dataset(val_df, training=False),
)

In [77]:
# Sanity check: pull a single training batch and show the image and label shapes.
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape, sep="\n")


(16, 224, 224, 3)
(16,)
