# Data Exploration Notebook

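This notebook demonstrates how to use the `data_pipeline` package for exploratory data analysis: loading a sample dataset, printing a summary of it, and plotting its feature and target distributions.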

In [None]:
# Import the data_pipeline package
from data_pipeline.data import load_data, preprocess_data
from data_pipeline.utils import create_summary

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the sample data via the package's loader. The result is kept in `data`
# and reused by the following cells (presumably a pandas DataFrame, given the
# `.shape` / `.head()` usage here -- confirm against `load_data`).
data = load_data("sample_data.csv")
# Print the dimensions as a quick sanity check that the load produced data.
print(f"Loaded data shape: {data.shape}")
# Trailing bare expression: the notebook renders it as the cell's output.
data.head()

In [None]:
# Build the summary mapping for the loaded data and print it entry by entry.
summary = create_summary(data)
print("Data Summary:")
for key, value in summary.items():
    # The detailed numeric summary is left out to keep the printout short.
    if key == 'numeric_summary':
        continue
    print(f"{key}: {value}")

In [None]:
# Visualize the data: feature histograms on the left panel, counts per
# distinct target value on the right panel.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Feature distributions
# NOTE: DataFrame.hist() draws one subplot per column. Handing it a single
# Axes for two columns makes pandas clear the entire figure (with a
# UserWarning), which detaches both panels created above. DataFrame.plot with
# kind='hist' overlays every column on the one Axes it is given instead;
# alpha keeps overlapping bars of the two features visible.
data[['feature1', 'feature2']].plot(kind='hist', ax=axes[0], bins=10, alpha=0.6)
axes[0].set_title('Feature Distributions')

# Target distribution
data['target'].value_counts().plot(kind='bar', ax=axes[1])
axes[1].set_title('Target Distribution')

# Tighten spacing so titles and tick labels of the two panels do not overlap.
plt.tight_layout()
plt.show()