In [1]:
# Sample Data Analysis Notebook
# This notebook demonstrates common data science operations:
# building a synthetic sales DataFrame, summarizing it, plotting it,
# and filtering/pivoting it.

# pandas: DataFrame construction, grouping and pivoting.
import pandas as pd
# numpy: seeded random generation of the sample columns.
import numpy as np
# matplotlib: the multi-panel figure in the visualization section.
import matplotlib.pyplot as plt

# Confirmation message so a reader can see that the import cell ran without error.
print("Libraries loaded successfully!")

Libraries loaded successfully!


## 1. Creating Sample Data

Let's create a sample dataset to work with.

In [None]:
# Build a reproducible 100-row synthetic sales table, one row per calendar day.
# The global seed fixes every draw below. The draws must stay in the order
# sales -> category -> region, because each call advances the same global
# random state.
np.random.seed(42)

day_index = pd.date_range('2024-01-01', periods=100, freq='D')
sales_draws = np.random.randint(100, 1000, 100)  # upper bound is exclusive
category_draws = np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], 100)
region_draws = np.random.choice(['North', 'South', 'East', 'West'], 100)

# Column order of the resulting frame follows the key order of this dict.
data = {
    'date': day_index,
    'sales': sales_draws,
    'category': category_draws,
    'region': region_draws,
}
df = pd.DataFrame(data)

print(f"Dataset shape: {df.shape}")
# Preview the first ten rows as the cell's displayed value.
df.head(10)

## 2. Basic Data Exploration

Let's explore the dataset with some basic statistics.

In [None]:
# Summary statistics for df, using describe()'s default column selection.
# The result is the cell's last expression, so the notebook displays it as a table.
df.describe()

In [None]:
# Per-category sales summary (mean, total and row count).
# Named aggregation labels each output column where its statistic is chosen,
# so no separate column-renaming step is needed.
category_stats = df.groupby('category')['sales'].agg(**{
    'Avg Sales': 'mean',
    'Total Sales': 'sum',
    'Count': 'count',
})
category_stats

## 3. Data Visualization

Let's create some visualizations to understand our data better.

In [None]:
# Draw a 2x2 dashboard of the sales data: totals per category, the daily
# trend, the regional split, and the distribution of individual sale values.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_category, ax_trend), (ax_region, ax_hist) = axes

# Every panel title shares the same font settings.
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# Top-left: bar chart of summed sales per category.
category_totals = df.groupby('category')['sales'].sum()
ax_category.bar(
    category_totals.index,
    category_totals.values,
    color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'],
)
ax_category.set_title('Total Sales by Category', **title_font)
ax_category.set(xlabel='Category', ylabel='Total Sales')

# Top-right: line chart of summed sales per date, with a shaded area beneath.
daily_sales = df.groupby('date')['sales'].sum()
trend_color = '#667eea'
ax_trend.plot(daily_sales.index, daily_sales.values, color=trend_color, linewidth=2)
ax_trend.fill_between(daily_sales.index, daily_sales.values, alpha=0.3, color=trend_color)
ax_trend.set_title('Daily Sales Trend', **title_font)
ax_trend.set(xlabel='Date', ylabel='Sales')
ax_trend.tick_params(axis='x', rotation=45)  # rotate the date tick labels

# Bottom-left: pie chart of each region's share of summed sales.
region_sales = df.groupby('region')['sales'].sum()
colors = ['#FF9F43', '#EE5A24', '#00D2D3', '#54A0FF']
ax_region.pie(
    region_sales.values,
    labels=region_sales.index,
    autopct='%1.1f%%',
    colors=colors,
)
ax_region.set_title('Sales Distribution by Region', **title_font)

# Bottom-right: histogram of the individual sales values.
ax_hist.hist(df['sales'], bins=20, color='#6C5CE7', edgecolor='white', alpha=0.7)
ax_hist.set_title('Sales Distribution', **title_font)
ax_hist.set(xlabel='Sales Value', ylabel='Frequency')

plt.tight_layout()
plt.show()

## 4. Data Filtering and Aggregation

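This section shows more advanced data manipulation: filtering above-average sales, listing the five highest sales, and building a category-by-region pivot table.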

In [None]:
# Keep only the rows whose sales exceed the dataset-wide mean, then report
# how large that subset is relative to the full frame.
avg_sales = df['sales'].mean()
above_average = df['sales'].gt(avg_sales)
high_value_sales = df.loc[above_average]

print(f"Average sales: ${avg_sales:.2f}")
print(f"High-value transactions: {len(high_value_sales)} out of {len(df)} ({len(high_value_sales)/len(df)*100:.1f}%)")
print()

# The five rows with the largest sales, trimmed to the columns worth showing.
report_columns = ['date', 'category', 'region', 'sales']
top_5 = df.nlargest(5, 'sales').loc[:, report_columns]
print("Top 5 Highest Sales:")
top_5

In [None]:
# Cross-tabulate summed sales: one row per category, one column per region.
# fill_value=0 makes category/region pairs with no rows show 0, not NaN.
pivot_table = pd.pivot_table(
    df,
    index='category',
    columns='region',
    values='sales',
    aggfunc='sum',
    fill_value=0,
)
print("Sales Pivot Table (Category vs Region):")
pivot_table

## 5. Summary

This sample notebook demonstrated:
- 📊 Creating and exploring DataFrames
- 📈 Basic statistical analysis
- 🎨 Data visualization with matplotlib
- 🔍 Data filtering and aggregation
- 📋 Pivot tables

Feel free to modify and extend this notebook for your own analysis!