In [14]:
!pip install pandas



In [9]:
# ===============================
# PHASE 1: DATA COLLECTION & UNDERSTANDING
# ===============================

import pandas as pd
import os
import sys

In [10]:
# Put the parent of the current working directory on the import search path
# (only once), so that packages living there can be imported by later cells.
path = os.path.abspath(os.pardir)
if path not in sys.path:
    sys.path.append(path)

In [11]:
from src.Read_Any_File_Type import reading_data

# 1️⃣ Load Dataset
# Path to the raw CSV, relative to the working directory (update it if the file moves).
file_path = "../data/raw/Diabetes-Missing-Data.csv"
# Hand the path to the project's reader object, then ask it for the loaded contents.
data = reading_data(file_path)
df = data.read_data()

[SUCCESS] File loaded successfully: <class 'pandas.core.frame.DataFrame'>


In [12]:
# 2️⃣ Initial Exploration
# Structure (columns, non-null counts, dtypes), a preview of the first rows,
# and descriptive statistics of the loaded frame.
print("=== DATASET INFORMATION ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print("\n=== FIRST FIVE ROWS ===")
print(df.head())
print("\n=== SUMMARY STATISTICS ===")
print(df.describe())


=== DATASET INFORMATION ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pregnant           768 non-null    int64  
 1   Glucose            763 non-null    float64
 2   Diastolic_BP       733 non-null    float64
 3   Skin_Fold          541 non-null    float64
 4   Serum_Insulin      394 non-null    float64
 5   BMI                757 non-null    float64
 6   Diabetes_Pedigree  768 non-null    float64
 7   Age                768 non-null    int64  
 8   Class              768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 KB
None

=== FIRST FIVE ROWS ===
   Pregnant  Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin   BMI  \
0         6    148.0          72.0       35.0            NaN  33.6   
1         1     85.0          66.0       29.0            NaN  26.6   
2         8    183.0          64.0        NaN    

In [13]:
# 3️⃣ Check for Missing Values
# Per-column count of NaN entries, printed under a section header.
print("\n=== MISSING VALUE COUNTS ===", df.isna().sum(), sep="\n")


=== MISSING VALUE COUNTS ===
Pregnant               0
Glucose                5
Diastolic_BP          35
Skin_Fold            227
Serum_Insulin        374
BMI                   11
Diabetes_Pedigree      0
Age                    0
Class                  0
dtype: int64


In [14]:
# 4️⃣ Check Data Types
# Show the dtype pandas assigned to each column, under a section header.
print("\n=== DATA TYPES ===", df.dtypes, sep="\n")


=== DATA TYPES ===
Pregnant               int64
Glucose              float64
Diastolic_BP         float64
Skin_Fold            float64
Serum_Insulin        float64
BMI                  float64
Diabetes_Pedigree    float64
Age                    int64
Class                  int64
dtype: object


In [15]:
# 5️⃣ Identify Biological Impossibilities (Zeros in Physiological Measurements)
#    A zero is not a plausible reading for some measurements, so tally the
#    number of exact zeros in every column and inspect the result.
zero_counts = df.eq(0).sum()
print("\n=== ZERO VALUE COUNTS ===")
print(zero_counts)


=== ZERO VALUE COUNTS ===
Pregnant             111
Glucose                0
Diastolic_BP           0
Skin_Fold              0
Serum_Insulin          0
BMI                    0
Diabetes_Pedigree      0
Age                    0
Class                500
dtype: int64


In [16]:
# 6️⃣ Detect Potential Outliers using summary stats (min/max)
# Print the smallest and largest value of each float64/int64 column so that
# implausible extremes stand out.
print("\n=== POSSIBLE OUTLIER CHECK ===")
# Iterating a DataFrame yields its column labels.
for col in df.select_dtypes(include=["float64", "int64"]):
    print(f"{col}: min = {df[col].min()}, max = {df[col].max()}")


=== POSSIBLE OUTLIER CHECK ===
Pregnant: min = 0, max = 17
Glucose: min = 44.0, max = 199.0
Diastolic_BP: min = 24.0, max = 122.0
Skin_Fold: min = 7.0, max = 99.0
Serum_Insulin: min = 14.0, max = 846.0
BMI: min = 18.2, max = 67.1
Diabetes_Pedigree: min = 0.078, max = 2.42
Age: min = 21, max = 81
Class: min = 0, max = 1


In [17]:
# 7️⃣ Short Report Summary
# Findings that can be computed are derived from `df`, so the report cannot
# silently contradict the data when the dataset or an earlier cell changes.
n_rows, n_cols = df.shape
cols_with_missing = df.columns[df.isna().any()].tolist()

# Analyst judgement based on the min/max listing; kept in a single place so the
# two outlier lines of the report always name the same columns.
suspected_outlier_cols = ['Serum_Insulin', 'Skin_Fold', 'BMI']

# Measurements for which a value of zero is not plausible. Zeros in other
# columns (e.g. Pregnant, Class) are legitimate and therefore not checked.
physiological_cols = [
    c for c in ["Glucose", "Diastolic_BP", "Skin_Fold", "Serum_Insulin", "BMI"]
    if c in df.columns
]
impossible_zero_counts = df[physiological_cols].eq(0).sum()
cols_with_impossible_zeros = impossible_zero_counts[impossible_zero_counts > 0].index.tolist()

print("\n=== INITIAL DATA ASSESSMENT SUMMARY ===")
print(f"Total Rows: {n_rows}, Total Columns: {n_cols}")
print("Columns with missing data:")
print(cols_with_missing)
print(f"\nColumns likely containing outliers: {suspected_outlier_cols}")
# Manual observation from the dtype listing; not verified programmatically here.
print("\n✅ Data types consistent")
if cols_with_missing:
    print("⚠️ Missing data detected in multiple columns")
else:
    print("✅ No missing data detected")
print(f"⚠️ Possible outliers detected in {', '.join(suspected_outlier_cols)}")
if cols_with_impossible_zeros:
    print(f"⚠️ Biologically impossible zeros found in: {cols_with_impossible_zeros}")
else:
    print("✅ No biologically impossible zeros found (except valid ones like Pregnant=0 or Class=0)")


=== INITIAL DATA ASSESSMENT SUMMARY ===
Total Rows: 768, Total Columns: 9
Columns with missing data:
['Glucose', 'Diastolic_BP', 'Skin_Fold', 'Serum_Insulin', 'BMI']

Columns likely containing outliers: ['Serum_Insulin', 'Skin_Fold', 'BMI']

✅ Data types consistent
⚠️ Missing data detected in multiple columns
⚠️ Possible outliers detected in insulin and BMI
✅ No biologically impossible zeros found (except valid ones like Pregnant=0 or Class=0)
