# Data Preparation and Exploratory Analysis

Load the dataset from Google Drive or a local upload, then perform exploratory data analysis (EDA): structure, missing values, target balance, and demographic distributions.

### Step 1: Upload or Mount Dataset

In [None]:
# Bring the dataset into the Colab runtime: either upload it directly or
# mount Google Drive. Mounting Drive is the option that is active below.
import os

from google.colab import drive, files

print(
    'Choose upload method:',
    '1. Direct upload (paste below)',
    '2. Mount Google Drive',
    sep='\n',
)

# Option 1 - direct upload (disabled; uncomment to use instead of Drive):
# uploaded = files.upload()

# Option 2 - mount Google Drive at /content/drive:
drive.mount('/content/drive')

# List the contents of /content as the cell output
os.listdir('/content')

### Step 2: Load Dataset

In [None]:
import numpy as np
import pandas as pd

# Path of the CSV on the mounted Drive - adjust to match your upload method
csv_path = '/content/drive/MyDrive/osteoporosis_cleaned_reorganized.csv'
df = pd.read_csv(csv_path)
# OR if uploaded directly:
# df = pd.read_csv('osteoporosis_cleaned_reorganized.csv')

# Report the size of the loaded frame and preview its first rows
n_rows, n_cols = df.shape
banner = '═' * 60
print(banner, 'DATASET LOADED SUCCESSFULLY', banner, sep='\n')
print(f'Dataset shape: {n_rows} rows × {n_cols} columns')
print('\nFirst few records:')
print(df.head())

### Step 3: Dataset Information

In [None]:
divider = '═' * 60

# Structural overview of the frame (df.info() writes straight to stdout)
print(divider, 'DATASET INFORMATION', divider, sep='\n')
df.info()

# Descriptive statistics as produced by df.describe()
print('\n' + divider, 'STATISTICAL SUMMARY', divider, sep='\n')
print(df.describe())

### Step 4: Missing Values Analysis

In [None]:
header_rule = '═' * 60
print(header_rule, 'MISSING VALUES ANALYSIS', header_rule, sep='\n')

# Per-column null count and the share of rows that count represents
missing_values = df.isnull().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

missing_table = pd.DataFrame({
    'Feature': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percentage': missing_percentage.values
})

# Keep only the columns that have gaps, worst offenders first
has_gaps = missing_table['Missing_Count'] > 0
missing_df = missing_table.loc[has_gaps].sort_values('Missing_Percentage', ascending=False)

if missing_df.empty:
    print('✓ No missing values found!')
else:
    print(missing_df.to_string(index=False))

### Step 5: Target Variable Distribution

In [None]:
# Summarise and plot the class balance of the 'Osteoporosis' target column.
# Assumes the target is coded 0 (no risk) / 1 (risk), as the printed labels
# below imply - confirm against the dataset.
print('═' * 60)
print('TARGET VARIABLE DISTRIBUTION')
print('═' * 60)

# Force a fixed 0 -> 1 order. value_counts() on its own orders by frequency,
# which would put the bars out of step with the tick labels whenever class 1
# is the majority. reindex() also reports 0 for a class that is absent
# instead of raising KeyError on the lookups below.
target_counts = df['Osteoporosis'].value_counts().reindex([0, 1], fill_value=0)
print(f'No Risk (0): {target_counts[0]} ({target_counts[0]/len(df)*100:.1f}%)')
print(f'Risk (1): {target_counts[1]} ({target_counts[1]/len(df)*100:.1f}%)')

# Visualize
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
target_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Target Variable Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Osteoporosis Status')
plt.ylabel('Count')
# The first argument of plt.xticks is the tick *positions*; the labels go
# second. The bars of a pandas bar plot sit at integer positions 0 and 1.
plt.xticks([0, 1], ['No Risk', 'Risk'], rotation=0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

### Step 6: Gender and Age Distribution

In [None]:
section_rule = '═' * 60
print(section_rule, 'DEMOGRAPHIC ANALYSIS', section_rule, sep='\n')

# Gender counts are reused for both the printout and the bar chart
gender_counts = df['Gender'].value_counts()
print('\nGender Distribution:')
print(gender_counts)

# Summary statistics of Age within each gender
age_by_gender = df.groupby('Gender')['Age'].describe()
print('\nAge Statistics by Gender:')
print(age_by_gender)

# Side-by-side figure: age histogram (left) and gender bar chart (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
age_ax, gender_ax = axes

age_ax.hist(df['Age'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
age_ax.set_xlabel('Age (years)')
age_ax.set_ylabel('Frequency')
age_ax.set_title('Age Distribution', fontweight='bold')
age_ax.grid(alpha=0.3)

gender_counts.plot(kind='bar', ax=gender_ax, color=['skyblue', 'salmon'])
gender_ax.set_title('Gender Distribution', fontweight='bold')
gender_ax.set_ylabel('Count')
# Re-apply the existing tick labels without rotation so they read horizontally
gender_ax.set_xticklabels(gender_ax.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

### Step 7: Risk Distribution by Demographics

In [None]:
# Plot the proportion of each Osteoporosis class within every gender (left)
# and within every age group (right). Rows are normalised, so each group's
# bars sum to 1.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Risk by gender
pd.crosstab(df['Gender'], df['Osteoporosis'], normalize='index').plot(kind='bar', ax=axes[0], color=['skyblue', 'salmon'])
axes[0].set_title('Osteoporosis Risk by Gender', fontweight='bold')
axes[0].set_ylabel('Proportion')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)
# crosstab sorts its columns, so the legend entries follow the 0, 1 order
# (assumes the target is coded 0/1 - confirm against the dataset)
axes[0].legend(['No Risk', 'Risk'])
axes[0].grid(alpha=0.3)

# Risk by age groups
# Bins are right-inclusive: [0, 30], (30, 40], (40, 50], (50, inf).
# The open-ended top edge and include_lowest=True keep ages above 100 or
# equal to 0 in the outer groups instead of silently turning them into NaN.
age_bins = [0, 30, 40, 50, float('inf')]
age_labels = ['<30', '30-40', '40-50', '>50']
# Keep the grouping in a standalone Series rather than adding and then
# dropping a temporary column on the shared df: the cell no longer mutates
# df, so it is safe to re-run and cannot leave a stray column behind if it
# fails midway. The name is kept so the x-axis label stays 'Age_Group'.
age_group = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True).rename('Age_Group')

pd.crosstab(age_group, df['Osteoporosis'], normalize='index').plot(kind='bar', ax=axes[1], color=['skyblue', 'salmon'])
axes[1].set_title('Osteoporosis Risk by Age Group', fontweight='bold')
axes[1].set_ylabel('Proportion')
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45)
axes[1].legend(['No Risk', 'Risk'])
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

### Save Dataset for Next Steps

The dataset has been loaded and explored; save a copy so the next notebook can pick it up for preprocessing.

In [None]:
# Save for next notebook
import os

output_path = 'data/dataset_loaded.csv'
# to_csv does not create missing parent directories, and a fresh runtime has
# no 'data/' folder, so make sure it exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print('✓ Dataset saved successfully!')
print('\nReady to proceed to Data Preprocessing notebook.')

### Next Notebook

Proceed to **03_Data_Preprocessing.ipynb** to handle missing values, encode features, and prepare data for model training.