# Data Exploration Notebook

This notebook demonstrates exploratory data analysis (EDA) for the MLOps Production System.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot styles for every figure produced below.
# NOTE(review): the seaborn call runs after the matplotlib 'ggplot' style sheet is applied;
# seaborn's theme setup presumably overrides overlapping settings from 'ggplot' — confirm
# which of the two looks is actually intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# IPython line magic (not plain Python): render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Data

Load the sample dataset generated by the data generation script.

In [None]:
# Path to the data file, relative to the directory the notebook kernel was started in
data_path = "../data/raw/sample_data.csv"

# Fail fast with an actionable message: the relative path only resolves when the kernel's
# working directory is the notebook's own folder and the sample data has been generated.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Data file not found at '{data_path}' "
        f"(resolved to '{os.path.abspath(data_path)}'). "
        "Run the data generation script first, or start the notebook from its own directory."
    )

# Load data
df = pd.read_csv(data_path)

# Display the first few rows (kept as the last expression so it renders as a rich table)
df.head()

## 2. Basic Data Exploration

In [None]:
# Quick overview of the dataset: dimensions, per-column dtypes, then descriptive statistics
print("Dataset shape:", df.shape)
print("\nData types:", df.dtypes, sep="\n")
print("\nSummary statistics:")
# Left as the cell's final expression so the notebook renders it as a rich table
df.describe()

In [None]:
# Count null entries per column and report only the columns that actually have some
missing_values = df.isna().sum()
print("Missing values per column:")
if missing_values.any():
    # At least one column has a non-zero null count: show just those columns
    print(missing_values[missing_values > 0])
else:
    print("No missing values")

## 3. Feature Distribution

In [None]:
# Separate numerical and categorical features.
# 'number' selects every numeric dtype (int32, float32, unsigned, ...), not only the
# int64/float64 pair, so numeric columns stored in a narrower width are no longer
# silently left out of the analysis. Booleans are not selected by 'number'.
numerical_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove target from features if it exists; it is plotted separately as the label.
numerical_features = [name for name in numerical_features if name != 'target']
categorical_features = [name for name in categorical_features if name != 'target']

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

In [None]:
# Histogram (with KDE overlay) for each numerical feature, laid out on a 2x3 grid
if numerical_features:
    fig = plt.figure(figsize=(15, 10))
    # The grid has six slots, so only the first six features are drawn
    for i, feature in enumerate(numerical_features[:6], 1):
        ax = fig.add_subplot(2, 3, i)
        sns.histplot(df[feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')
    fig.tight_layout()
    plt.show()

In [None]:
# One bar chart of category frequencies per categorical feature, stacked vertically
if categorical_features:
    n_panels = len(categorical_features)
    # Figure height grows with the number of features (5 inches per panel)
    fig = plt.figure(figsize=(15, 5 * n_panels))
    for i, feature in enumerate(categorical_features, 1):
        ax = fig.add_subplot(n_panels, 1, i)
        value_counts = df[feature].value_counts()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        # Acts on the panel just created, which is the current axes
        plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

## 4. Target Distribution

In [None]:
# Plot the distribution of the target column, if the dataset has one
if 'target' in df.columns:
    target_series = df['target']

    # Treat the target as discrete (class labels) when it is non-numeric or has few distinct
    # values. Checking only for int64/float64 sent string/category targets, and
    # low-cardinality targets of other numeric widths, down the continuous histogram+KDE
    # path, which is not meaningful for labels. The <= 10 threshold matches the
    # classification check used in the feature-vs-target section.
    target_is_discrete = (
        not pd.api.types.is_numeric_dtype(target_series)
        or target_series.nunique() <= 10
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    if target_is_discrete:
        # Treat as categorical: one bar per class, ordered by label
        value_counts = target_series.value_counts().sort_index()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        ax.set_title('Target Distribution')
        ax.set_xlabel('Target')
        ax.set_ylabel('Count')
        plt.show()

        # Calculate class balance for classification (fractions sum to 1)
        print("Class distribution:")
        print(target_series.value_counts(normalize=True).sort_index())
    else:
        # Treat as continuous
        sns.histplot(target_series, kde=True, ax=ax)
        ax.set_title('Target Distribution')
        ax.set_xlabel('Target')
        plt.show()

## 5. Feature Correlations

In [None]:
# Calculate correlations between numerical features (and the target, when it is usable)
if len(numerical_features) > 1:
    # Work on a copy so the shared feature list is not modified by this cell
    corr_columns = list(numerical_features)

    # Only add the target when it exists and is numeric. Appending it unconditionally
    # raised a KeyError on datasets without a target column and broke .corr() when the
    # target held non-numeric labels.
    if (
        'target' in df.columns
        and 'target' not in corr_columns
        and pd.api.types.is_numeric_dtype(df['target'])
    ):
        corr_columns.append('target')

    correlation_matrix = df[corr_columns].corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title('Feature Correlations')
    fig.tight_layout()
    plt.show()

## 6. Feature Relationships with Target

In [None]:
# Relationship of each feature with the target, if the dataset has one
if 'target' in df.columns:
    # Few distinct target values -> treat as classification; otherwise regression.
    # Computed once instead of on every loop iteration.
    is_classification = df['target'].nunique() <= 10

    # For numerical features vs target
    if numerical_features:
        fig = plt.figure(figsize=(15, 10))
        # The 2x3 grid has six slots, so only the first six features are drawn
        for i, feature in enumerate(numerical_features[:6], 1):
            ax = fig.add_subplot(2, 3, i)
            if is_classification:
                # Spread of the feature within each class
                sns.boxplot(x='target', y=feature, data=df, ax=ax)
            else:
                sns.scatterplot(x=feature, y='target', data=df, ax=ax)
            ax.set_title(f'{feature} vs Target')
        fig.tight_layout()
        plt.show()

    # For categorical features vs target (one figure per feature)
    for feature in categorical_features:
        fig, ax = plt.subplots(figsize=(12, 6))
        if is_classification:
            # Create crosstab of category x class counts.
            # Passing ax is essential: without it pandas opens its own figure, leaving
            # the one created above empty and putting the labels on the wrong figure.
            crosstab = pd.crosstab(df[feature], df['target'])
            crosstab.plot(kind='bar', stacked=True, ax=ax)
            ax.set_title(f'{feature} vs Target')
            ax.set_xlabel(feature)
            ax.set_ylabel('Count')
            ax.legend(title='Target')
        else:
            # Regression: distribution of the target within each category
            sns.boxplot(x=feature, y='target', data=df, ax=ax)
            ax.set_title(f'{feature} vs Target')
        fig.tight_layout()
        plt.show()

## 7. Conclusions

Based on the exploratory data analysis, we can draw the following conclusions:

1. **Data Quality**: [Fill in observations about missing values, outliers, etc.]
2. **Feature Distributions**: [Fill in observations about feature distributions]
3. **Feature Importance**: [Fill in observations about correlations with target]
4. **Feature Engineering Opportunities**: [Fill in ideas for feature engineering]

### Next Steps

1. Feature engineering to create more predictive features
2. Feature selection to remove irrelevant features
3. Model training and evaluation
4. Model deployment using the MLOps Production System