# Homework 3: Python Fundamentals

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os

## 1. NumPy Operations

In [None]:
# Two small integer vectors used to demonstrate element-wise NumPy arithmetic.
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([2, 4, 6, 8, 10])

for label, values in (("Array 1:", arr1), ("Array 2:", arr2)):
    print(label, values)

# Explicit ufunc calls; each is the function form of the matching operator
# (+, *, **) and works element by element.
addition = np.add(arr1, arr2)
multiplication = np.multiply(arr1, arr2)
power = np.power(arr1, 2)

for label, values in (
    ("Addition:", addition),
    ("Multiplication:", multiplication),
    ("Power:", power),
):
    print(label, values)

# Scalar reductions over the first vector.
for label, reducer in (
    ("Mean:", np.mean),
    ("Sum:", np.sum),
    ("Max:", np.max),
    ("Min:", np.min),
):
    print(label, reducer(arr1))

### Compare loop vs vectorized execution

In [None]:
# Benchmark: evaluate x**2 + 2x + 1 over 100,000 random samples, once with a
# Python-level loop and once as a single vectorized NumPy expression.
test_array = np.random.rand(100000)

# time.perf_counter() is the high-resolution clock meant for timing short
# intervals. time.time() is a wall clock whose resolution can be coarse enough
# that the vectorized pass measures as exactly 0.0 seconds.
start_time = time.perf_counter()
loop_result = []
# Deliberately an explicit loop: this is the slow baseline being measured.
for x in test_array:
    loop_result.append(x ** 2 + 2 * x + 1)
loop_time = time.perf_counter() - start_time

start_time = time.perf_counter()
vectorized_result = test_array ** 2 + 2 * test_array + 1
vectorized_time = time.perf_counter() - start_time

# Guard the ratio so a zero-length measurement reports "inf" instead of
# raising ZeroDivisionError.
speedup = loop_time / vectorized_time if vectorized_time > 0 else float("inf")

print("Performance Comparison:")
print(f"Array size: {len(test_array):,} elements")
print(f"Loop time: {loop_time:.4f} seconds")
print(f"Vectorized time: {vectorized_time:.4f} seconds")
print(f"Speedup: {speedup:.1f}x faster")

## 2. Dataset Loading

### Load provided CSV using pandas

In [None]:
# Load the starter dataset; the path is relative to the working directory the
# notebook is run from.
df = pd.read_csv('../data/starter_data (1).csv')

# Quick confirmation of what was loaded: dimensions and column names.
print("Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", list(df.columns))

### Inspect with .info() and .head()

In [None]:
# Print a heading, then call DataFrame.info(), which writes its own summary of
# df directly to standard output (its return value is not used here).
print("Dataset Info:")
df.info()

In [None]:
# Heading for the preview. df.head() is left as the cell's final bare
# expression: its result (the first five rows by default) is only displayed
# when the environment echoes the last value of a cell, as a notebook does.
print("First 5 rows:")
df.head()

## 3. Summary Statistics

### Calculate .describe() for numeric columns

In [None]:
# describe() with default arguments; the result is kept in numeric_summary so
# it can be reused after printing.
numeric_summary = df.describe()
print("Summary Statistics for Numeric Columns:", numeric_summary, sep="\n")

### Perform .groupby() aggregation by category

In [None]:
# Per-category aggregates of the `value` column, rounded to 4 decimals after
# aggregation.
category_stats = (
    df.groupby('category')['value']
    .agg(['count', 'mean', 'std', 'sum', 'min', 'max'])
    .round(4)
)

print("Category Statistics:")
print(category_stats)

# Same table under friendlier column names; rename() returns a new frame, so
# category_stats keeps its original labels.
category_summary = category_stats.rename(columns={
    'count': 'count',
    'mean': 'avg_value',
    'std': 'volatility',
    'sum': 'total_value',
    'min': 'min_value',
    'max': 'max_value',
})

print("\nSimplified Category Summary:")
print(category_summary)

## 4. Save Outputs

### Save summary stats to data/processed/summary.csv

In [None]:
# Make sure the output directory exists; exist_ok=True makes re-running safe.
os.makedirs('../data/processed', exist_ok=True)

# Each entry: (table to write, destination CSV, confirmation message).
for table, csv_path, notice in (
    (
        category_summary,
        '../data/processed/summary.csv',
        "Category summary statistics saved to data/processed/summary.csv",
    ),
    (
        numeric_summary,
        '../data/processed/overall_summary.csv',
        "Overall summary saved to data/processed/overall_summary.csv",
    ),
):
    table.to_csv(csv_path)
    print(notice)

## 5. Reusable Functions

### Write utility function (get_summary_stats)

In [None]:
def get_summary_stats(df, numeric_only=True):
    """Return descriptive statistics for ``df`` via ``DataFrame.describe``.

    Args:
        df: The DataFrame to summarise.
        numeric_only: When truthy (the default), call ``describe()`` with its
            default column selection. When falsy, call
            ``describe(include='all')`` so every column is included.

    Returns:
        The DataFrame produced by ``describe``.
    """
    # Handle the "everything" case first; the default path falls through.
    if not numeric_only:
        return df.describe(include='all')
    return df.describe()

def calculate_metrics(df, value_col='value'):
    """Compute headline metrics for one column of a DataFrame.

    Args:
        df: DataFrame containing the column named by ``value_col``.
        value_col: Name of the column to summarise. Defaults to ``'value'``,
            so existing single-argument calls behave exactly as before.

    Returns:
        A dict with, in this order:
            ``total_records``: ``len(df)``.
            ``avg_value``: mean of the column.
            ``value_volatility``: ``Series.std()`` of the column.
            ``total_sum``: sum of the column.
            ``value_range``: column maximum minus column minimum.

    Raises:
        KeyError: If ``value_col`` is not a column of ``df`` (raised by the
            column lookup).
    """
    # Look the column up once instead of re-indexing df for every metric.
    values = df[value_col]
    return {
        'total_records': len(df),
        'avg_value': values.mean(),
        'value_volatility': values.std(),
        'total_sum': values.sum(),
        'value_range': values.max() - values.min(),
    }

# Exercise both helpers against the loaded dataset and print their results.
print("Testing get_summary_stats function:")
summary = get_summary_stats(df)
print(summary)

print("\nTesting calculate_metrics function:")
data_metrics = calculate_metrics(df)
# Every metric, including the integer record count, is printed with four
# decimal places.
for metric_name, metric_value in data_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")