# Data Exploration Notebook

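This notebook demonstrates how to use the `data_pipeline` package for exploratory data analysis. The cells below work on a small synthetic dataset generated with a fixed random seed, so the notebook can be re-run end to end on a fresh kernel.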

In [None]:
# Import libraries for data analysis
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Build a reproducible synthetic dataset: two Gaussian features plus a
# three-class label drawn with unequal probabilities.
np.random.seed(42)  # fixed seed so every fresh-kernel run yields the same rows

# Draw order (feature1, feature2, target) matters: all three draws consume
# the same global NumPy random state seeded above.
feature1_values = np.random.normal(loc=100, scale=15, size=1000)
feature2_values = np.random.normal(loc=50, scale=10, size=1000)
target_labels = np.random.choice(['A', 'B', 'C'], size=1000, p=[0.5, 0.3, 0.2])

data = pd.DataFrame({
    'feature1': feature1_values,
    'feature2': feature2_values,
    'target': target_labels,
})

print(f"Loaded data shape: {data.shape}")
data.head()

In [None]:
# Text summary of the sample frame: size, dtypes, numeric statistics and
# class balance. Values are gathered first, then reported in a fixed order.
n_rows, n_cols = data.shape
numeric_stats = data[['feature1', 'feature2']].describe()
target_counts = data['target'].value_counts()

print("Data Summary:")
print(f"Total rows: {n_rows}")
print(f"Total columns: {n_cols}")
print(f"Data types:\n{data.dtypes}")
print(f"\nNumeric features summary:\n{numeric_stats}")
print(f"\nTarget distribution:\n{target_counts}")

In [None]:
# Visualize the data: overlaid feature histograms (left panel) and the
# per-class row counts (right panel) in a single 1x2 figure.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Feature distributions.
# Each column is drawn straight onto axes[0]. DataFrame.hist() expects one
# Axes per column; when it is handed a single Axes for two columns it clears
# the whole figure and builds its own grid, which detaches both axes[0] and
# axes[1] and silently drops the titles and the target bar chart.
for column in ['feature1', 'feature2']:
    axes[0].hist(data[column], bins=10, alpha=0.6, label=column)
axes[0].set_title('Feature Distributions')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].legend()  # needed to tell the two overlaid histograms apart

# Target distribution
data['target'].value_counts().plot(kind='bar', ax=axes[1])
axes[1].set_title('Target Distribution')
axes[1].set_xlabel('Target')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()