<a href="https://colab.research.google.com/github/kiril-buga/Neural-Network-Training-Project/blob/main/Deeplearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load the dataset from Hugging Face or the shared Google Drive

In [None]:
# ===== Imports =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data-source switch: True downloads the dataset snapshot from Hugging Face;
# False uses Google Drive (when running in Colab) or local paths otherwise.
USE_HF = True


In [None]:
# Fetch the dataset from the source selected by USE_HF.
if USE_HF:
    # Download a snapshot of the dataset repository; the returned value is
    # used by the following cell as the local dataset directory.
    from huggingface_hub import snapshot_download

    local_dir = snapshot_download(
        repo_id="kiril-buga/ECG-database",
        repo_type="dataset",
    )
    print("Downloaded to:", local_dir)
else:
    # google.colab is only importable inside Colab. The path-setup cell has a
    # local fallback for runs outside Colab, so a missing module must not
    # abort the notebook here.
    try:
        from google.colab import drive
    except ImportError:
        print("google.colab not available; skipping Google Drive mount.")
    else:
        drive.mount('/content/drive')

In [None]:
# ===== Resolve DATA_PATH and ARTIFACT_DIR =====
# Supported setups:
#   Case 1: running in Google Colab with the dataset placed manually in MyDrive
#   Case 2: dataset snapshot downloaded from Hugging Face (USE_HF is True)
#   Case 3: local fallback when running outside Colab
if USE_HF and local_dir:
    # Case 2. The trailing "" component makes os.path.join end the path with a
    # separator, matching the trailing "/" of the other path constants.
    DATA_PATH = os.path.join(local_dir, "data", "")
    ARTIFACT_DIR = os.path.join(local_dir, "artifacts", "")
else:
    # Detect Colab by attempting the import. The handler is deliberately
    # broad: any failure to import google.colab is treated as "not in Colab".
    try:
        from google.colab import drive  # type: ignore
    except Exception:
        drive = None
        IN_COLAB = False
    else:
        IN_COLAB = True

    if IN_COLAB:
        # Case 1
        drive.mount('/content/drive/')
        DATA_PATH = "/content/drive/MyDrive/DeepLearningECG/data/"
        ARTIFACT_DIR = "/content/drive/MyDrive/DeepLearningECG/artifacts/"
    else:
        # Case 3: relative to the current working directory
        DATA_PATH = "../data/"
        ARTIFACT_DIR = "../artifacts/"

print("DATA_PATH:", DATA_PATH)
print("ARTIFACT_DIR:", ARTIFACT_DIR)
# os.listdir returns entries in arbitrary order; sort for a stable listing.
print("Files in DATA_PATH:", sorted(os.listdir(DATA_PATH)))

### 3. Load the CSV metadata

In [None]:
# ===== Load CSV metadata =====
# Build the three CSV paths under DATA_PATH in one step.
ATTR_PATH, DISEASE_PATH, ECGCODE_PATH = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ("AttributesDictionary.csv", "DiseaseCode.csv", "ECGCode.csv")
)

# Read every table before reporting anything.
df_attr = pd.read_csv(ATTR_PATH)
df_disease = pd.read_csv(DISEASE_PATH)
df_ecgcode = pd.read_csv(ECGCODE_PATH)

# For each table: print its (rows, columns) shape, then preview the first rows.
for shape_label, loaded_table in (
    ("Attributes shape:", df_attr),
    ("DiseaseCode shape:", df_disease),
    ("ECGCode shape:", df_ecgcode),
):
    print(shape_label, loaded_table.shape)
    display(loaded_table.head())

### 4. Simple data analysis (EDA)

In [None]:
# ===== Basic statistics and structure =====
# Overview of the attribute table: shape, column dtypes, missing-value
# counts, and a describe() summary.

print("Shape:", df_attr.shape)

# sep="\n" puts the heading and the value on separate lines, as two
# consecutive print calls would.
print("\nColumn types:", df_attr.dtypes, sep="\n")
print("\nMissing values per column:", df_attr.isna().sum(), sep="\n")

# include="all" summarises every column, not only the numeric ones.
print("\n----- Numerical Summary -----")
display(df_attr.describe(include="all"))

In [None]:
# ===== Age distribution =====

# Age is stored as "###d" (days): drop the "d" and cast to integer days,
# then derive years using 365-day years.
age_in_days = df_attr["Age"].str.replace("d", "").astype(int)
df_attr["Age_days"] = age_in_days
df_attr["Age_years"] = df_attr["Age_days"] / 365

# Histogram of age in years
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df_attr["Age_years"], bins=60)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Count")
ax.set_title("Distribution of patient age")
plt.show()

# Range and mean, reported in days
age_days = df_attr["Age_days"]
print("Min age (days):", age_days.min())
print("Max age (days):", age_days.max())
print("Average age (days):", age_days.mean())

# Count the records whose age is exactly 0 days
num_age_zero = age_days.eq(0).sum()
print("Number of ECG records with age = 0 days:", num_age_zero)

In [None]:
# ===== Gender distribution =====

# Count once; the same counts feed both the bar chart and the printed table.
gender_counts = df_attr["Gender"].value_counts()

fig, ax = plt.subplots(figsize=(5, 4))
gender_counts.plot(kind="bar", ax=ax)
ax.set_title("Gender distribution")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

print(gender_counts)

In [None]:
# ===== Number of leads per ECG (9 vs 12) =====

# Frequency of each value in the "Lead" column, reused for plot and printout.
lead_counts = df_attr["Lead"].value_counts()

plt.figure(figsize=(5, 4))
lead_counts.plot.bar()
plt.title("Number of leads per ECG")
plt.xlabel("Lead count")
plt.ylabel("Number of ECG records")
plt.show()

print(lead_counts)

In [None]:
# ===== Age comparison: 9-lead vs 12-lead ECGs =====

# Recompute the age columns so this cell does not depend on the age
# distribution cell having been run first.
df_attr["Age_days"] = df_attr["Age"].str.replace("d", "").astype(int)
df_attr["Age_years"] = df_attr["Age_days"] / 365

# Split by lead type
df_9 = df_attr[df_attr["Lead"] == 9]
df_12 = df_attr[df_attr["Lead"] == 12]

print("Average age (9-lead):", df_9["Age_years"].mean())
print("Average age (12-lead):", df_12["Age_years"].mean())
print("Median age (9-lead):", df_9["Age_years"].median())
print("Median age (12-lead):", df_12["Age_years"].median())
print("Min age 9-lead:", df_9["Age_years"].min())
print("Min age 12-lead:", df_12["Age_years"].min())
print("Max age 9-lead:", df_9["Age_years"].max())
print("Max age 12-lead:", df_12["Age_years"].max())

# Use one set of bin edges for both groups. Passing bins=40 to each hist call
# separately would size the bins from each group's own age range, so bars in
# the overlay would cover different age widths and not be comparable.
ages_9_and_12 = df_attr.loc[df_attr["Lead"].isin([9, 12]), "Age_years"]
age_bin_edges = np.histogram_bin_edges(ages_9_and_12, bins=40)

# Plot both age distributions
plt.figure(figsize=(10,5))
plt.hist(df_9["Age_years"], bins=age_bin_edges, alpha=0.6, label="9-lead")
plt.hist(df_12["Age_years"], bins=age_bin_edges, alpha=0.6, label="12-lead")
plt.xlabel("Age (years)")
plt.ylabel("Count")
plt.title("Age distribution: 9-lead vs 12-lead ECGs")
plt.legend()
plt.show()

In [None]:
# ===== ECG duration distribution =====
# Assuming sampling frequency = 500 Hz
SAMPLING_RATE_HZ = 500
DURATION_BIN_WIDTH_SEC = 5

# Duration in seconds = number of samples / samples per second
df_attr["Duration_sec"] = df_attr["Sampling_point"] / SAMPLING_RATE_HZ

plt.figure(figsize=(8,5))
plt.hist(df_attr["Duration_sec"], bins=40)
plt.xlabel("Duration (seconds)")
plt.ylabel("Count")
plt.title("Distribution of ECG signal length")
plt.show()

print("Min duration:", df_attr["Duration_sec"].min(), "sec")
print("Max duration:", df_attr["Duration_sec"].max(), "sec")
print("Average duration:", df_attr["Duration_sec"].mean(), "sec")

# Distribution in a table
# Create bins of DURATION_BIN_WIDTH_SEC seconds. The intervals are half-open
# [lo, hi) (right=False), so the final edge must lie strictly above the
# longest recording. Otherwise a maximum that is an exact multiple of the bin
# width (e.g. 10.0 s) falls outside every bin and is dropped from the table.
max_duration = df_attr["Duration_sec"].max()
n_duration_bins = int(max_duration // DURATION_BIN_WIDTH_SEC) + 1
bins = [i * DURATION_BIN_WIDTH_SEC for i in range(n_duration_bins + 1)]

df_attr["Duration_bin"] = pd.cut(
    df_attr["Duration_sec"],
    bins=bins,
    right=False,   # intervals like [0–5), [5–10), ...
    labels=[f"{lo}-{hi}s" for lo, hi in zip(bins[:-1], bins[1:])],
)

# Records per duration bin, in ascending bin order
duration_table = df_attr["Duration_bin"].value_counts().sort_index()
duration_table

In [None]:
# ===== ICD-10 disease label distribution =====
# An "ICD-10 code" entry can hold several codes separated by ";".
# Each individual code is counted once per occurrence.

# Drop missing entries, remove single quotes, then split on ";"
codes = df_attr["ICD-10 code"].dropna().str.replace("'", "").str.split(";")

# Flatten the per-record lists into one list of whitespace-stripped codes
flattened = []
for record_codes in codes:
    flattened.extend(code.strip() for code in record_codes)

icd_counts = pd.Series(flattened).value_counts()
top_icd = icd_counts.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
top_icd.plot(kind="bar", ax=ax)
ax.set_title("Top 20 ICD-10 codes")
ax.set_xlabel("ICD-10 code")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Last expression of the cell: shows the 20 most frequent codes as a table
top_icd

In [None]:
# ===== AHA and CHN code distributions =====


def _flatten_code_lists(code_lists):
    """Return every code from the lists in *code_lists*, whitespace-stripped."""
    return [code.strip() for codes in code_lists for code in codes]


# Clean AHA code strings (remove single quotes)
df_attr["AHA_clean"] = df_attr["AHA_code"].str.replace("'", "")
# dropna() mirrors the ICD-10 cell: a missing value would otherwise reach the
# flattening step as a float and raise TypeError when iterated.
aha_codes = df_attr["AHA_clean"].dropna().str.split(";")
aha_flat = _flatten_code_lists(aha_codes)
aha_counts = pd.Series(aha_flat).value_counts()

plt.figure(figsize=(10,6))
aha_counts.head(20).plot(kind="bar")
plt.title("Most frequent AHA diagnostic codes")
plt.xlabel("AHA code")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# CHN codes: same cleaning, splitting, and counting as for AHA
df_attr["CHN_clean"] = df_attr["CHN_code"].str.replace("'", "")
chn_codes = df_attr["CHN_clean"].dropna().str.split(";")
chn_flat = _flatten_code_lists(chn_codes)
chn_counts = pd.Series(chn_flat).value_counts()

plt.figure(figsize=(10,6))
chn_counts.head(20).plot(kind="bar")
plt.title("Most frequent CHN diagnostic codes")
plt.xlabel("CHN code")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# ===== Number of ECGs per patient =====

# Rows per Patient_ID, i.e. how many ECG records each patient has
patient_counts = df_attr["Patient_ID"].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(patient_counts, bins=20)
ax.set_xlabel("ECG records per patient")
ax.set_ylabel("Count")
ax.set_title("Distribution of ECG count per patient")
plt.show()

print(patient_counts.describe())

# Table: ECG count → number of patients
ecg_count_table = patient_counts.value_counts().sort_index()

# Convert to a two-column DataFrame: the index becomes the first column and
# the counts become the second.
ecg_count_df = (
    ecg_count_table
    .rename_axis("ECG recordings per patient")
    .reset_index(name="Number of patients")
)

display(ecg_count_df)