In [None]:
# Project Configuration & Paths
# Makes the parent directory importable and pulls in the shared `config` module,
# then echoes the paths and settings it exposes.

import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root.
# NOTE(review): this depends on the kernel's working directory - confirm the
# notebook is always launched from a direct child of the project root.
project_root = Path('..').resolve()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Centralized configuration module, resolved through the sys.path entry above
import config

# Echo the directory locations exposed by config
print("📁 Project paths loaded from config.py:")
for label, attribute in (
    ("Raw data", "RAW_DATA_DIR"),
    ("Processed data", "PROCESSED_DATA_DIR"),
    ("Figures", "FIGURES_DIR"),
):
    print(f"  {label}: {getattr(config, attribute)}")
print()

# Echo the tunable settings exposed by config
print("⚙️ Settings loaded from environment (.env file):")
for label, attribute in (
    ("Random seed", "RANDOM_SEED"),
    ("Figure DPI", "DPI"),
):
    print(f"  {label}: {getattr(config, attribute)}")
print()

# Short how-to for changing the values shown above
for hint in (
    "💡 To customize these settings:",
    "  1. Copy .env.example to .env",
    "  2. Edit .env with your preferred values",
    "  3. Restart the kernel to reload",
):
    print(hint)

# Exploratory Data Analysis

## Project Overview
Brief description of the project and its objectives.

## Data Description
- **Source**: [Data source]
- **Size**: [Number of rows and columns]
- **Date Range**: [If applicable]
- **Key Variables**: [List main variables of interest]

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Additional imports
import polars as pl
import pyarrow.parquet as pq
import requests
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Jupyter settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print("Environment setup complete!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Polars version: {pl.__version__}")

In [None]:
# Load your data here
# df = pd.read_csv('../data/raw/your_data.csv')
# df_polars = pl.read_csv('../data/raw/your_data.csv')

# Demonstration data: a synthetic frame standing in for a real dataset.
# The legacy global NumPy seed fixes the draws, and the columns are generated
# in the order below - reordering the draws would change every value.
# NOTE(review): the seed is hard-coded here although the first cell reports
# config.RANDOM_SEED - confirm whether the demo data should follow that setting.
np.random.seed(42)
n_samples = 1000

sample_data = {}
sample_data['feature_1'] = np.random.randn(n_samples)
sample_data['feature_2'] = np.random.randn(n_samples) * 2 + 1
sample_data['category'] = np.random.choice(['A', 'B', 'C'], n_samples)
# Gaussian noise plus a random 0/1 offset
sample_data['target'] = np.random.randn(n_samples) + np.random.choice([0, 1], n_samples)

df = pd.DataFrame(sample_data)

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

In [None]:
# Data overview
# Prints the frame's shape, per-column dtypes and per-column memory footprint
# (deep=True also counts the contents of object columns).
print("=== DATA OVERVIEW ===")
print(f"Shape: {df.shape}")
print("\nData types:", df.dtypes, sep="\n")
print("\nMemory usage:", df.memory_usage(deep=True), sep="\n")

# The summary statistics table is the cell's displayed result
print("\n=== BASIC STATISTICS ===")
df.describe()

In [None]:
# Check for missing values
# Builds a per-column table of null counts and their share of all rows, then
# displays only the columns that have at least one null, largest count first.
print("=== MISSING VALUES ===")
missing_data = df.isna().sum()
missing_percent = 100 * missing_data / len(df)
missing_table = missing_data.to_frame('Missing Count').assign(
    **{'Missing Percentage': missing_percent}
)
missing_table.loc[missing_table['Missing Count'].gt(0)].sort_values(
    by='Missing Count', ascending=False
)

In [None]:
# Basic visualizations
# One 2x2 matplotlib figure: two feature histograms on the top row, category
# counts and the numeric correlation heatmap on the bottom row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(hist_1_ax, hist_2_ax), (category_ax, corr_ax) = axes

# Distribution plots
for column, title, ax in (
    ('feature_1', 'Feature 1 Distribution', hist_1_ax),
    ('feature_2', 'Feature 2 Distribution', hist_2_ax),
):
    df[column].hist(bins=30, ax=ax, alpha=0.7)
    ax.set_title(title)

# Category counts
df['category'].value_counts().plot.bar(ax=category_ax)
category_ax.set_title('Category Distribution')
category_ax.tick_params(axis='x', rotation=45)

# Correlation heatmap over the numeric columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Interactive plotly visualizations
# Assembles a 2x2 subplot figure; plotly express figures are built only to
# borrow their traces, which are then re-attached to the subplot grid.
panel_titles = (
    'Feature 1 vs Feature 2',
    'Feature Distribution by Category',
    'Target Distribution',
    'Box Plot by Category',
)
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=panel_titles,
    specs=[
        [{"secondary_y": False}, {"secondary_y": False}],
        [{"secondary_y": False}, {"secondary_y": False}],
    ],
)

# Top-left: scatter of the two features, one trace per category
scatter = px.scatter(
    df,
    x='feature_1',
    y='feature_2',
    color='category',
    title='Feature 1 vs Feature 2',
)
for scatter_trace in scatter.data:
    fig.add_trace(scatter_trace, row=1, col=1)

# Top-right: feature_1 histogram per category
for label in df['category'].unique():
    per_category = go.Histogram(
        x=df.loc[df['category'] == label, 'feature_1'],
        name=f'Category {label}',
        opacity=0.7,
    )
    fig.add_trace(per_category, row=1, col=2)

# Bottom-left: target distribution
target_hist = go.Histogram(x=df['target'], name='Target')
fig.add_trace(target_hist, row=2, col=1)

# Bottom-right: target box plot by category
box_plot = px.box(df, x='category', y='target')
for box_trace in box_plot.data:
    fig.add_trace(box_trace, row=2, col=2)

fig.update_layout(
    height=800,
    showlegend=True,
    title_text="Data Exploration Dashboard",
)
fig.show()

## Key Insights

1. **Data Quality**: [Observations about missing values, outliers, etc.]
2. **Distributions**: [Key findings about variable distributions]
3. **Relationships**: [Notable correlations or patterns]
4. **Next Steps**: [Recommendations for further analysis or modeling]

In [None]:
# Save processed data
# Both writes below are commented-out templates: running this cell as-is
# writes nothing to disk. Uncomment one of them to persist `df`.
# NOTE(review): the first cell reports config.PROCESSED_DATA_DIR - confirm
# whether these hard-coded relative paths should use that location instead.
# df.to_csv('../data/processed/cleaned_data.csv', index=False)
# df.to_parquet('../data/processed/cleaned_data.parquet')

# Completion marker printed at the end of the analysis cells
print("Analysis complete! Ready for next steps.")

In [1]:
# Adding a simple print statement to test cell modification
# NOTE(review): scratch cell - it uses nothing from the analysis cells and is
# the only cell with a recorded execution count; consider removing it before
# the notebook is shared.
print("Hello, World!")

Hello, World!
