In [2]:
# 🧪 Diabetes Dataset Quiz - Complete Python Solutions

In [2]:
import pandas as pd

# Read the transformed dataset from a path relative to the working directory
# (the original author assumes the file has already been uploaded, e.g. in Colab).
data = pd.read_csv('data/transformed_data.csv')

# Banner, separator rule, then a one-line size summary of the loaded frame.
n_rows, n_cols = data.shape
print("🏥 DIABETES DATASET QUIZ - COMPLETE SOLUTIONS")
print("=" * 60)
print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")

🏥 DIABETES DATASET QUIZ - COMPLETE SOLUTIONS
Dataset loaded: 100,000 rows, 16 columns


## Question 1: How many "Male" patients are there in total?

In [3]:
# Count rows whose 'gender' equals 'Male': each True in the mask counts as 1.
# NOTE(review): the first label says "Total patients" although the value is the
# male-only count; the text is kept as-is so the recorded output stays valid.
is_male = data['gender'].eq('Male')
total_male_patients = int(is_male.sum())
print(f"Total patients: {total_male_patients}")
print(f"✅ ANSWER: {total_male_patients}")

Total patients: 41430
✅ ANSWER: 41430


## Question 2: Average age of all patients

In [22]:
# Mean of the 'age' column over every row of the dataset.
average_age = data.loc[:, 'age'].mean()

# The one-decimal rendering is used twice (summary line and final answer).
age_one_decimal = format(average_age, ".1f")
print("Average age: " + age_one_decimal + " years")
print("Exact value: {:.3f} years".format(average_age))
print("✅ ANSWER: " + age_one_decimal)

Average age: 41.9 years
Exact value: 41.886 years
✅ ANSWER: 41.9


## Question 3: Patients with diabetes

In [4]:
# Two independent counts of diabetic patients that should agree:
#   1. the sum of the 'diabetes' column (relies on the flag being coded 0/1),
#   2. the number of rows where the flag equals 1.
diabetes_flag = data['diabetes']
diabetic_count = diabetes_flag.sum()
diabetic_count_alt = int(diabetes_flag.eq(1).sum())

print(f"Patients with diabetes: {diabetic_count:,}")
print(f"Alternative method: {diabetic_count_alt:,}")

# Per-value breakdown of the flag.
print("Breakdown:")
print("  No diabetes (0): {:,}".format(diabetes_flag.eq(0).sum()))
print("  Has diabetes (1): {:,}".format(diabetes_flag.eq(1).sum()))

# The option letter "A)" is hard-coded by the original author.
print(f"✅ ANSWER: A) {diabetic_count_alt} patients")

Patients with diabetes: 8,500
Alternative method: 8,500
Breakdown:
  No diabetes (0): 91,500
  Has diabetes (1): 8,500
✅ ANSWER: A) 8500 patients


## Question 4: Percentage of female patients

In [5]:
# Share of rows recorded as 'Female', computed two ways as a cross-check:
# the mean of a boolean mask, and an explicit count / total ratio.
gender = data['gender']
is_female = gender.eq('Female')

female_percentage = is_female.mean() * 100

female_count = int(is_female.sum())
male_count = int(gender.eq('Male').sum())
total_count = data.shape[0]
female_pct_manual = (female_count / total_count) * 100

print("Female percentage: {:.1f}%".format(female_percentage))
print("Manual calculation: {:.1f}%".format(female_pct_manual))
# Female + Male need not equal Total: any other gender value is excluded from both counts.
print("Female: {:,}, Male: {:,}, Total: {:,}".format(female_count, male_count, total_count))
print("✅ ANSWER: {:.1f}".format(female_percentage))

Female percentage: 58.6%
Manual calculation: 58.6%
Female: 58,552, Male: 41,430, Total: 100,000
✅ ANSWER: 58.6


## Question 5: Patients over 65 years old

In [6]:
# Count patients strictly older than 65 and map the count onto the quiz's
# multiple-choice ranges.
over_65 = data[data['age'] > 65]
over_65_count = len(over_65)
print(f"Patients over 65: {over_65_count:,}")
percentage_over_65 = (over_65_count / len(data)) * 100
print(f"Percentage over 65: {percentage_over_65:.1f}%")

# Each option is (inclusive lower bound, exclusive upper bound, label).
# Every option, including D, is matched against explicit bounds so that a count
# below 12,000 or at/above 24,000 is reported as out of range instead of being
# silently labelled "D".
range_options = [
    (12000, 15000, "A) 12,000-15,000"),
    (15000, 18000, "B) 15,000-18,000"),
    (18000, 21000, "C) 18,000-21,000"),
    (21000, 24000, "D) 21,000-24,000"),
]
range_answer = next(
    (label for low, high, label in range_options if low <= over_65_count < high),
    "None of the listed ranges (count is outside 12,000-24,000)",
)
print(f"Falls into range: {range_answer}")
print(f"✅ ANSWER: {range_answer}")

Patients over 65: 14,237
Percentage over 65: 14.2%
Falls into range: A) 12,000-15,000
✅ ANSWER: A) 12,000-15,000


## Question 6: Average BMI of diabetic patients

In [7]:
# Mean BMI of diabetic patients, with the non-diabetic mean shown for contrast.
diabetic_patients = data.loc[data['diabetes'].eq(1)]
diabetic_bmi = diabetic_patients['bmi'].mean()
non_diabetic_bmi = data.loc[data['diabetes'].eq(0), 'bmi'].mean()

print("Average BMI of diabetic patients: {:.1f}".format(diabetic_bmi))
print("Average BMI of non-diabetic patients: {:.1f}".format(non_diabetic_bmi))
# Positive when diabetic patients have the higher average BMI.
print("Difference: {:.1f} BMI points".format(diabetic_bmi - non_diabetic_bmi))
print("✅ ANSWER: {:.1f}".format(diabetic_bmi))

Average BMI of diabetic patients: 32.0
Average BMI of non-diabetic patients: 26.9
Difference: 5.1 BMI points
✅ ANSWER: 32.0


## Question 7: Which gender has higher diabetes rate?

In [8]:
# Diabetes rate per gender: the mean of the 0/1 'diabetes' flag within each group.
male_diabetes_rate = data[data['gender'] == 'Male']['diabetes'].mean()
female_diabetes_rate = data[data['gender'] == 'Female']['diabetes'].mean()
print(f"Male diabetes rate: {male_diabetes_rate:.4f} ({male_diabetes_rate*100:.2f}%)")
print(f"Female diabetes rate: {female_diabetes_rate:.4f} ({female_diabetes_rate*100:.2f}%)")

# Cross-check with groupby; this also lists any gender value other than Male/Female.
diabetes_by_gender = data.groupby('gender')['diabetes'].mean()
print("\nUsing groupby:")
for gender, rate in diabetes_by_gender.items():
    print(f"  {gender}: {rate:.4f} ({rate*100:.2f}%)")

# Three-way comparison: a tie (or a NaN rate, for which both comparisons are
# False) must not be reported as "Females have the higher rate".
if male_diabetes_rate > female_diabetes_rate:
    winner = "Males"
elif female_diabetes_rate > male_diabetes_rate:
    winner = "Females"
else:
    winner = "Neither"
# Absolute gap in percentage points; identical to the larger-minus-smaller form.
difference = abs(male_diabetes_rate - female_diabetes_rate) * 100

if winner == "Neither":
    print("\nNeither gender has a higher diabetes rate (rates are equal or undefined)")
    print("✅ ANSWER: Neither")
else:
    print(f"\n{winner} have higher diabetes rate by {difference:.2f} percentage points")
    # NOTE(review): the option letter "A)" is hard-coded regardless of the winner —
    # confirm against the quiz options if the result ever changes.
    print(f"✅ ANSWER: A) {winner}")

Male diabetes rate: 0.0975 (9.75%)
Female diabetes rate: 0.0762 (7.62%)

Using groupby:
  Female: 0.0762 (7.62%)
  Male: 0.0975 (9.75%)
  Other: 0.0000 (0.00%)

Males have higher diabetes rate by 2.13 percentage points
✅ ANSWER: A) Males


## Question 8: Female + diabetes + over 50

In [9]:
# Rows satisfying all three conditions at once: female, diabetic, older than 50.
combined_mask = (
    data['gender'].eq('Female')
    & data['diabetes'].eq(1)
    & data['age'].gt(50)
)
female_diabetes_over50 = data.loc[combined_mask]
result_count = female_diabetes_over50.shape[0]
print("Female + Diabetes + Over 50: {:,} patients".format(result_count))

# Same selection applied one filter at a time, to show how the population narrows.
print("\nStep-by-step verification:")
step1 = data.loc[data['gender'].eq('Female')]
print("  Step 1 - Female patients: {:,}".format(len(step1)))
step2 = step1.loc[step1['diabetes'].eq(1)]
print("  Step 2 - Female with diabetes: {:,}".format(len(step2)))
step3 = step2.loc[step2['age'].gt(50)]
print("  Step 3 - Female with diabetes over 50: {:,}".format(len(step3)))

# The combined mask and the sequential filters must select the same number of rows.
print("Both methods give same result: {}".format(result_count == len(step3)))
print("✅ ANSWER: {:,} patients".format(result_count))

Female + Diabetes + Over 50: 2,853 patients

Step-by-step verification:
  Step 1 - Female patients: 58,552
  Step 2 - Female with diabetes: 4,461
  Step 3 - Female with diabetes over 50: 2,853
Both methods give same result: True
✅ ANSWER: 2,853 patients


## Question 9: Average age of non-diabetic patients

In [10]:
# Mean age of patients whose 'diabetes' flag is 0, mapped onto the quiz's
# multiple-choice age ranges.
non_diabetic_patients = data[data['diabetes'] == 0]
non_diabetic_age = non_diabetic_patients['age'].mean()
rounded_age = round(non_diabetic_age)
print(f"Average age of non-diabetic patients: {non_diabetic_age:.2f} years")
print(f"Rounded to nearest whole number: {rounded_age} years")

# Range A is closed on both ends; B-D are open on the left and closed on the right.
# Option D is matched against explicit bounds so that a mean below 38 or above 46
# is reported as out of range instead of being silently labelled "D".
if 38 <= non_diabetic_age <= 40:
    age_range = "A) 38-40 years"
elif 40 < non_diabetic_age <= 42:
    age_range = "B) 40-42 years"
elif 42 < non_diabetic_age <= 44:
    age_range = "C) 42-44 years"
elif 44 < non_diabetic_age <= 46:
    age_range = "D) 44-46 years"
else:
    age_range = "None of the listed ranges (average is outside 38-46 years)"
print(f"Falls into range: {age_range}")
print(f"✅ ANSWER: {age_range}")

Average age of non-diabetic patients: 40.46 years
Rounded to nearest whole number: 40 years
Falls into range: B) 40-42 years
✅ ANSWER: B) 40-42 years


## Question 10: Patients with BMI > 30 (obese)

In [11]:
# Patients whose BMI is strictly above 30.
bmi = data['bmi']
obese_patients = data.loc[bmi.gt(30)]
obese_count = obese_patients.shape[0]
print("Patients with BMI > 30: {:,}".format(obese_count))

# Full BMI-category breakdown. The boundaries follow the original cell:
# exactly 25 counts as overweight, and exactly 30 is still overweight (obese is > 30).
print("\nBMI Categories breakdown:")
underweight = int(bmi.lt(18.5).sum())
normal = int((bmi.ge(18.5) & bmi.lt(25)).sum())
overweight = int((bmi.ge(25) & bmi.le(30)).sum())
obese = obese_count  # same filter as above, so the count is reused
print("  Underweight (< 18.5): {:,}".format(underweight))
print("  Normal (18.5-25): {:,}".format(normal))
print("  Overweight (25-30): {:,}".format(overweight))
print("  Obese (> 30): {:,}".format(obese))

# Share of the whole dataset that is obese.
obese_percentage = (obese_count / len(data)) * 100
print("\nPercentage of obese patients: {:.1f}%".format(obese_percentage))
print("✅ ANSWER: {} patients".format(obese_count))

Patients with BMI > 30: 23,483

BMI Categories breakdown:
  Underweight (< 18.5): 8,494
  Normal (18.5-25): 22,219
  Overweight (25-30): 45,804
  Obese (> 30): 23,483

Percentage of obese patients: 23.5%
✅ ANSWER: 23483 patients
