# Example Usage: 1980s Delicensing Analysis Tools

This notebook demonstrates how to use the utility classes written for this analysis (`DataValidator`, `DataProcessor`, and `AnalysisTools` from `utils`) on a small sample dataset.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom utilities
from utils import DataValidator, DataProcessor, AnalysisTools
from config import Config, AnalysisConfig

# Display options: show every column, but cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print("✓ Imports successful")

## 2. Create Sample Data

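For demonstration purposes, we'll create synthetic firm-level panel data: five firms observed annually from 1980 to 1990, with firms flagged as licensed in every year before 1985.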

In [None]:
# Create sample firm data
np.random.seed(42)  # fixed seed so the synthetic panel is the same on every run

FIRST_YEAR = 1980
LAST_YEAR = 1990
DELICENSING_YEAR = 1985
PRE_GROWTH = 1.02   # annual growth factor while firms are still licensed
POST_GROWTH = 1.05  # annual growth factor from the delicensing year onward


def trend_factor(year):
    """Return the cumulative output multiplier for `year` relative to FIRST_YEAR.

    Growth compounds at PRE_GROWTH for every year up to the last licensed
    year and at POST_GROWTH for every year from DELICENSING_YEAR onward.
    Compounding piecewise (rather than raising a single rate to the power
    `year - FIRST_YEAR`) keeps the series continuous: the step from 1984 to
    1985 is exactly one year of POST_GROWTH, not a retroactive re-pricing of
    all earlier years at the higher rate.

    Parameters
    ----------
    year : int
        Calendar year, expected to be >= FIRST_YEAR.

    Returns
    -------
    float
        Multiplier to apply to a firm's FIRST_YEAR output.
    """
    last_licensed_year = DELICENSING_YEAR - 1
    pre_years = min(year, last_licensed_year) - FIRST_YEAR
    post_years = max(year - last_licensed_year, 0)
    return PRE_GROWTH ** pre_years * POST_GROWTH ** post_years


firms = [1, 2, 3, 4, 5]
years = list(range(FIRST_YEAR, LAST_YEAR + 1))

# Build a list of records and construct the DataFrame once at the end.
data = []
for firm in firms:
    base_output = np.random.uniform(100, 500)
    for year in years:
        # Noise-free trend: higher growth after delicensing
        output = base_output * trend_factor(year)

        data.append({
            'firm_id': firm,
            'year': year,
            'output': output + np.random.normal(0, 10),
            # randint's upper bound is exclusive, so the noise spans -5..4
            'employees': int(output / 10) + np.random.randint(-5, 5),
            'licensed': year < DELICENSING_YEAR,
            'sector': np.random.choice(['textiles', 'chemicals', 'machinery'])
        })

df = pd.DataFrame(data)

print(f"Created sample dataset with {len(df)} observations")
print(f"Firms: {df['firm_id'].nunique()}, Years: {df['year'].nunique()}")
df.head(10)

## 3. Data Validation

Use DataValidator to check data quality.

In [None]:
validator = DataValidator()

# Missing values: the result is iterated as column -> proportion pairs
# (presumably a dict or Series; confirm against utils.DataValidator).
print("=== Missing Values Check ===")
missing = validator.check_missing_values(df, threshold=0.1)
for column, share in missing.items():
    status = f"{share*100:.2f}% missing" if share > 0 else "No missing values ✓"
    print(f"  {column}: {status}")

# Year range: the 'year' column is checked against the bounds 1976 and 1990
print("\n=== Year Range Validation ===")
is_valid = validator.validate_year_range(df, 'year', 1976, 1990)
year_mark = '✓' if is_valid else '✗'
print(f"  Years valid: {year_mark}")

# Duplicates: each (firm_id, year) pair should appear only once in the panel
print("\n=== Duplicate Check ===")
panel_key = ['firm_id', 'year']
n_dups = validator.check_duplicates(df, panel_key)
print(f"  Duplicates found: {n_dups}")

## 4. Data Processing

In [None]:
processor = DataProcessor()

# Run the cleaning helper on the two numeric measures; the result is kept
# under a new name so the raw `df` stays available for comparison.
numeric_columns = ['output', 'employees']
df_clean = processor.clean_numeric_columns(df, numeric_columns)

print("✓ Data cleaned")
print("\nData types:")
print(df_clean.dtypes)

## 5. Analysis

In [None]:
tools = AnalysisTools()


def label_period(year):
    """Return the delicensing period label for a calendar year."""
    if year >= 1985:
        return 'Post-delicensing'
    return 'Pre-delicensing'


# The result is expected to carry an 'output_growth' column (read back below);
# the ['firm_id'] argument presumably scopes the calculation to each firm.
df_growth = tools.calculate_growth_rate(df_clean, 'output', ['firm_id'])

print("=== Growth Rate Analysis ===")
print("\nSample growth rates:")
preview_columns = ['firm_id', 'year', 'output', 'output_growth']
print(df_growth[preview_columns].head(15))

# Tag each observation with its period, then average growth within each one
df_growth['period'] = df_growth['year'].apply(label_period)

avg_growth = df_growth.groupby('period')['output_growth'].mean()
print("\n=== Average Growth by Period ===")
print(avg_growth)

## 6. Summary Statistics

In [None]:
# Summarise the two numeric measures of the cleaned data
stat_columns = ['output', 'employees']
summary = tools.create_summary_stats(df_clean, stat_columns)

print("=== Summary Statistics ===")
summary

## 7. Filtering Data

In [None]:
# Keep only textile-sector observations from 1985 onward.
# A condition value is either an (operator, value) tuple or a bare value.
conditions = dict(
    year=('>=', 1985),
    sector='textiles',
)

df_filtered = tools.filter_by_conditions(df_clean, conditions)

n_before, n_after = len(df_clean), len(df_filtered)
print(f"Original data: {n_before} records")
print(f"Filtered data: {n_after} records")
print("\nFiltered dataset preview:")
df_filtered.head()

## 8. Visualization

In [None]:
# Plot output over time, one line per firm
fig, ax = plt.subplots(figsize=(12, 6))

for firm in df_clean['firm_id'].unique():
    is_firm = df_clean['firm_id'] == firm
    ax.plot(
        df_clean.loc[is_firm, 'year'],
        df_clean.loc[is_firm, 'output'],
        marker='o',
        label=f'Firm {firm}',
    )

# Vertical marker at the year the period split is made
ax.axvline(x=1985, color='red', linestyle='--', label='Delicensing Year')
ax.set_xlabel('Year')
ax.set_ylabel('Output')
ax.set_title('Firm Output Over Time (1980-1990)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

print("✓ Visualization complete")

## 9. Comparing Pre vs Post Delicensing

In [None]:
# Compare average output and employment before and after delicensing.
# The 'period' column is written onto df_clean itself because the box-plot
# cell that follows groups by it.
period_labels = df_clean['year'].apply(
    lambda year: 'Post-delicensing' if year >= 1985 else 'Pre-delicensing'
)
df_clean['period'] = period_labels

# Same three statistics for each measure
comparison = df_clean.groupby('period').agg(
    {measure: ['mean', 'std', 'count'] for measure in ('output', 'employees')}
)

print("=== Pre vs Post Delicensing Comparison ===")
comparison

In [None]:
# Box plot comparison: one panel per measure, split by period
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [('output', 'Output'), ('employees', 'Employees')]
for ax, (column, label) in zip(axes, panels):
    df_clean.boxplot(column=column, by='period', ax=ax)
    ax.set_title(f'{label} Distribution by Period')
    ax.set_xlabel('Period')
    ax.set_ylabel(label)

plt.suptitle('')  # drop the figure-level title that boxplot(by=...) adds
plt.tight_layout()
plt.show()

print("✓ Comparison analysis complete")

## 10. Export Results

In [None]:
# Export processed data.
# The writes are left commented out so running the notebook has no side
# effects on disk; uncomment to actually save files. The 'output/' directory
# must already exist, since to_csv does not create missing directories.

# df_growth.to_csv('output/firm_growth_rates.csv', index=False)
# summary.to_csv('output/summary_statistics.csv')

# Closing recap, emitted as a single write
closing_lines = (
    "✓ Example notebook complete!",
    "\nThis notebook demonstrated:",
    "  1. Data validation",
    "  2. Data cleaning and processing",
    "  3. Growth rate calculations",
    "  4. Summary statistics",
    "  5. Data filtering",
    "  6. Visualizations",
    "  7. Comparative analysis",
    "  8. Data export",
)
print("\n".join(closing_lines))