# Data Loading and Initial Exploration

This notebook covers:
- Generating and loading the sample patient dataset
- Initial data exploration
- Data quality assessment
- Basic statistics
- Initial visualizations, a summary report, and saving the loaded data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src directory to path
sys.path.append('../src')

from data_preprocessing import DataPreprocessor, generate_sample_data

# Plot appearance: matplotlib's 'default' style sheet with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# pandas display: both options are set to None so wide frames are not
# truncated to a fixed column count or character width when shown
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

## 1. Generate Sample Data

In [None]:
# Build the sample dataset via the project helper.
# NOTE(review): `filepath` is presumably where the helper writes the CSV that
# the next cell loads -- confirm in data_preprocessing.generate_sample_data.
print("Generating sample patient data...")
sample_data = generate_sample_data(
    filepath='../data/raw/patient_data.csv',
    n_samples=2000,
)

# Report the size of the returned object
patient_count = len(sample_data)
print(f"Generated dataset with {patient_count} patients")
print(f"Dataset shape: {sample_data.shape}")

## 2. Load and Examine Data

In [None]:
# Load the raw patient CSV through the project's DataPreprocessor.
# `preprocessor` and `df` are reused by the cells that follow.
preprocessor = DataPreprocessor()
df = preprocessor.load_data('../data/raw/patient_data.csv')

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
print("Dataset Info:")
df.info()
print("\nFirst 5 rows:")
# Keep this as the final bare expression so the notebook renders the preview
df.head()

## 3. Data Quality Assessment

In [None]:
# Null count per column; only columns with at least one null are printed
print("Missing Values:")
missing_values = df.isnull().sum()
columns_with_nulls = missing_values.loc[missing_values.gt(0)]
print(columns_with_nulls)

# Number of rows flagged by DataFrame.duplicated()
duplicate_row_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_row_count}")

# dtype of every column
print("\nData Types:")
column_dtypes = df.dtypes
print(column_dtypes)

## 4. Basic Statistics

In [None]:
# Summary statistics restricted to the numeric-dtype columns
print("Numerical Columns Statistics:")
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_view = df.loc[:, numerical_cols]
# Final bare expression: the notebook renders the describe() table
numeric_view.describe()

In [None]:
# Frequency table and distinct-value count for each object-dtype column
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    # The identifier column is skipped rather than tabulated
    if col == 'patient_id':
        continue
    column_values = df[col]
    print(f"\n{col} - Value Counts:")
    print(column_values.value_counts())
    print(f"Unique values: {column_values.nunique()}")

## 5. Initial Visualizations

In [None]:
# 2x2 overview figure: an age histogram plus three category-frequency bar charts
plt.figure(figsize=(12, 8))

# Slot 1: histogram of the 'age' column in 20 bins
plt.subplot(2, 2, 1)
plt.hist(df['age'], bins=20, edgecolor='black', alpha=0.7)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')

# Slots 2-4: (grid slot, column, panel title, x-tick rotation in degrees)
bar_panels = [
    (2, 'gender', 'Gender Distribution', 0),
    (3, 'diagnosis', 'Diagnosis Distribution', 45),
    (4, 'outcome', 'Treatment Outcome Distribution', 45),
]
for slot, column, panel_title, tick_rotation in bar_panels:
    plt.subplot(2, 2, slot)
    df[column].value_counts().plot(kind='bar')
    plt.title(panel_title)
    plt.xticks(rotation=tick_rotation)

plt.tight_layout()
plt.show()

In [None]:
# One-row, three-panel figure of value-count bar charts for the treatment,
# severity and symptoms columns
plt.figure(figsize=(15, 5))

# (column, panel title, x-tick rotation in degrees), in left-to-right order
bar_specs = [
    ('recommended_treatment', 'Recommended Treatment Distribution', 45),
    ('severity', 'Severity Distribution', 0),
    ('symptoms', 'Symptoms Distribution', 45),
]
for slot, (column, panel_title, tick_rotation) in enumerate(bar_specs, start=1):
    plt.subplot(1, 3, slot)
    df[column].value_counts().plot(kind='bar')
    plt.title(panel_title)
    plt.xticks(rotation=tick_rotation)

plt.tight_layout()
plt.show()

## 6. Data Summary

In [None]:
# Text summary of the loaded frame: size, key column cardinalities,
# data-quality counters and pointers to the follow-up work
banner = "=" * 50
print(banner)
print("DATA LOADING SUMMARY REPORT")
print(banner)

print(f"Dataset Shape: {df.shape}")
print(f"Number of Patients: {df['patient_id'].nunique()}")

ages = df['age']
print(f"Age Range: {ages.min()} - {ages.max()} years")

# Distinct-value counts for the remaining columns of interest
for label, column in (
    ("Diagnoses", 'diagnosis'),
    ("Treatments", 'recommended_treatment'),
    ("Outcomes", 'outcome'),
):
    print(f"Number of {label}: {df[column].nunique()}")

print("\nData Quality:")
total_missing = df.isnull().sum().sum()  # nulls summed over every column
print(f"Missing Values: {total_missing}")
duplicate_rows = df.duplicated().sum()
print(f"Duplicate Rows: {duplicate_rows}")

print("\nNext Steps:")
next_steps = (
    "Proceed to EDA notebook for detailed analysis",
    "Examine relationships between variables",
    "Identify patterns in treatment outcomes",
)
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"{step_number}. {step_text}")

In [None]:
# Save the loaded (not otherwise transformed in this notebook) data for the
# next notebooks.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise this cell fails on a fresh checkout.
output_path = '../data/processed/loaded_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Data saved to '{output_path}'")