In [1]:
# Make the parent directory importable so the `from src...` imports below can resolve
import sys
sys.path.append('..')

# Third-party analysis libraries and the stdlib warnings module
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session
warnings.filterwarnings('ignore')

try:
    # Project modules (found through the sys.path entry added above)
    from src.data_loader import DataLoader
    from src.data_cleaner import DataCleaner
    from src.data_analyzer import DataAnalyzer
    from src.visualizer import Visualizer
    
    # Initialize and load data; the data path is relative to the working directory
    data_loader = DataLoader(data_path="../data")
    king_county_df, montgomery_df = data_loader.load_datasets()
    
    # Display dataset information
    data_loader.print_dataset_info()
    
    print("All modules imported and data loaded successfully!")
    
except Exception as e:
    # NOTE(review): any failure in the imports or the load is reported and then
    # ignored, so this cell never raises. Later cells use DataCleaner,
    # king_county_df and montgomery_df and would fail with NameError if this
    # block did not run to completion.
    print(f"Setup warning: {e}")
    print("This is normal when viewing online - everything will work when running locally")

Dataset Information

King County:
  Records: 563
  Columns: 25
  Missing Values: 2571
  Column Names:
    - impound_no
    - Animal_ID
    - Data_Source
    - Record_Type
    - Link
    - Current_Location
    - Animal_Name
    - animal_type
    - Age
    - Animal_Gender
    - Animal_Breed
    - Animal_Color
    - Date
    - Date_Type
    - Obfuscated_Address
    - City
    - State
    - Zip
    - jurisdiction
    - obfuscated_latitude
    - obfuscated_longitude
    - Image
    - image_alt_text
    - Memo
    - Temperament

Montgomery:
  Records: 95
  Columns: 12
  Missing Values: 74
  Column Names:
    - Animal ID
    - Intake Type
    - In Date
    - Pet name
    - Animal Type
    - Pet Age
    - Pet Size
    - Color
    - Breed
    - Sex
    - URL Link 
    - Crossing
All modules imported and data loaded successfully!


In [2]:
# Build the cleaner (DataCleaner comes from the setup cell)
data_cleaner = DataCleaner()

# Clean both county frames; the call also returns the combined frame
king_clean, montgomery_clean, combined_df = data_cleaner.clean_all_datasets(king_county_df, montgomery_df)

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 60, "DATA QUALITY ASSESSMENT", "=" * 60, sep="\n")

# Create formatted tables for data quality
def create_quality_table(df, location_name):
    """Print a missing-value table and key-column cardinalities for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to assess. The key-column section looks up ``animal_type``,
        ``animal_breed`` and ``status``; a column that is absent is reported
        as "N/A" instead of aborting the report with a KeyError.
    location_name : str
        Label used in the printed section headers.

    Returns
    -------
    None
        All results are written to stdout.
    """
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    # Only columns that actually contain gaps are listed.
    incomplete = missing_counts[missing_counts > 0]

    if incomplete.empty:
        print(f"\n{location_name} - No missing values found.")
    else:
        # A non-empty `incomplete` implies total_rows > 0, so the division is safe.
        quality_df = pd.DataFrame({
            'Column': list(incomplete.index),
            'Missing Count': [f"{count:,}" for count in incomplete],
            'Missing %': [f"{(count / total_rows) * 100:.1f}%" for count in incomplete],
        })
        print(f"\n{location_name} - Missing Values:")
        print("-" * 40)
        print(quality_df.to_string(index=False))

    # Key column analysis: number of distinct values per key column
    print(f"\n{location_name} - Key Column Analysis:")
    print("-" * 40)
    key_columns = {
        'Animal Types': 'animal_type',
        'Breeds': 'animal_breed',
        'Statuses': 'status',
    }
    key_df = pd.DataFrame({
        'Metric': list(key_columns),
        'Count': [
            df[column].nunique() if column in df.columns else 'N/A'
            for column in key_columns.values()
        ],
    })
    print(key_df.to_string(index=False))

# Assess both cleaned datasets with the cleaner's own report method
# (create_quality_table, defined earlier in this cell, is not called here).
# The second call is the cell's last expression, so its return value is
# echoed as the cell output in addition to whatever the method prints.
data_cleaner.assess_data_quality(king_clean, "King County, WA")
data_cleaner.assess_data_quality(montgomery_clean, "Montgomery County, MD")


DATA QUALITY ASSESSMENT

Data Quality Assessment: King County, WA

Missing Values:
  current_location: 4 (0.7%)
  animal_name: 120 (21.3%)
  age: 1 (0.2%)
  animal_breed: 8 (1.4%)
  obfuscated_address: 48 (8.5%)
  city: 182 (32.3%)
  state: 377 (67.0%)
  zip: 158 (28.1%)
  jurisdiction: 50 (8.9%)
  obfuscated_latitude: 531 (94.3%)
  obfuscated_longitude: 531 (94.3%)
  image: 4 (0.7%)
  memo: 1 (0.2%)
  temperament: 556 (98.8%)
  age_clean: 86 (15.3%)

Key Column Analysis:
  Animal Types: 9 unique values
  Breeds: 96 unique values
  Statuses: 3 unique values

Data Quality Assessment: Montgomery County, MD

Missing Values:
  crossing: 74 (77.9%)
  age_clean: 1 (1.1%)

Key Column Analysis:
  Animal Types: 3 unique values
  Breeds: 28 unique values
  Statuses: 7 unique values


(animal_id         0
 record_type       0
 in_date           0
 animal_name       0
 animal_type       0
 age               0
 pet_size          0
 animal_color      0
 animal_breed      0
 animal_gender     0
 url_link_         0
 crossing         74
 location          0
 age_clean         1
 age_category      0
 status            0
 date_parsed       0
 dtype: int64,
 animal_id         0.000000
 record_type       0.000000
 in_date           0.000000
 animal_name       0.000000
 animal_type       0.000000
 age               0.000000
 pet_size          0.000000
 animal_color      0.000000
 animal_breed      0.000000
 animal_gender     0.000000
 url_link_         0.000000
 crossing         77.894737
 location          0.000000
 age_clean         1.052632
 age_category      0.000000
 status            0.000000
 date_parsed       0.000000
 dtype: float64)

In [3]:
# Initialize data analyzer
analyzer = DataAnalyzer(combined_df)

# Calculate all metrics
all_metrics = analyzer.calculate_all_metrics(king_clean, montgomery_clean)


def _format_top_entry(distribution):
    """Return the first entry of a distribution as "label (count)".

    ``distribution`` is read positionally through ``.index[0]`` and
    ``.iloc[0]`` (presumably a value_counts-style Series ordered by
    frequency -- confirm in DataAnalyzer). An empty distribution yields
    "N/A" instead of raising IndexError.
    """
    if len(distribution) == 0:
        return "N/A"
    return f"{distribution.index[0]} ({distribution.iloc[0]:,})"


def _summary_row(label, metrics):
    """Build one row of the summary table from a single metrics mapping."""
    return {
        'Location': label,
        'Total Records': f"{metrics['total_records']:,}",
        'Top Pet Type': _format_top_entry(metrics['pet_type_distribution']),
        'Top Status': _format_top_entry(metrics['status_distribution']),
    }


# Display key metrics summary
print("Key Metrics Summary")
print("=" * 60)

# One row per location; the 'combined' entry is held back so it can go last.
# pandas is already imported as `pd` in the setup cell, so it is not re-imported here.
metrics_data = [
    _summary_row(location.replace('_', ' ').title(), metrics)
    for location, metrics in all_metrics.items()
    if location != 'combined'
]

# Add combined metrics
combined_metrics = all_metrics['combined']
metrics_data.append(_summary_row('Combined Analysis', combined_metrics))

# Create and display formatted table
metrics_df = pd.DataFrame(metrics_data)
print(metrics_df.to_string(index=False))

Calculating key metrics...
Metrics calculated successfully!
Key Metrics Summary
         Location Total Records Top Pet Type     Top Status
      King County           563    Cat (343)     Lost (261)
Montgomery County            95     Cat (39) Owner Sur (53)
Combined Analysis           658    Cat (382)     Lost (261)


In [4]:
# Create pivot tables
pivot_tables = analyzer.create_pivot_tables()

print("Pivot Table Analysis")
print("=" * 60)

# Display options do not change between tables, so set them once up front.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Display each pivot table under its own title
for name, pivot_table in pivot_tables.items():
    print(f"\n{name.replace('_', ' ').title()}:")
    print("-" * 50)

    if isinstance(pivot_table, pd.DataFrame):
        # Round when the table has at least one numeric column. The previous
        # check, `pivot_table.dtypes.any() in ['float64', 'int64']`, collapsed
        # the dtypes to a single value before the membership test, so it did
        # not actually inspect the column dtypes.
        has_numeric_columns = pivot_table.select_dtypes(include='number').shape[1] > 0
        if has_numeric_columns:
            pivot_table = pivot_table.round(2)

        print(pivot_table.to_string())
    else:
        print(pivot_table)

    print("\n" + "="*60)

Creating pivot tables...
Pivot tables created successfully!
Pivot Table Analysis

Pet Type By Location Status:
--------------------------------------------------
status                              Adoptable  Boarding  Confiscate  Euth Req  Foster  Found  Lost  Owner Sur  Return  Stray
location              animal_type                                                                                           
King County, WA       Bird                  0         0           0         0       0      3     2          0       0      0
                      Cat                  29         0           0         0       0    142   172          0       0      0
                      Deceased Cat          0         0           0         0       0      4     0          0       0      0
                      Dog                  27         0           0         0       0     82    87          0       0      0
                      Dwarf                 1         0           0         0       0   

In [5]:
# Run the analyzer's statistical tests
statistical_results = analyzer.perform_statistical_analysis()

print("Statistical Analysis Results")
print("=" * 60)

# Chi-square block: statistic, p-value, degrees of freedom, verdict
chi_square = statistical_results['chi_square_test']
print("\nChi-Square Test Results:")
print("-" * 30)
print(f"  Chi-square statistic: {chi_square['chi2_statistic']:.4f}")
print(f"  P-value: {chi_square['p_value']:.4f}")
print(f"  Degrees of freedom: {chi_square['degrees_of_freedom']}")
print(f"  Significant difference: {'Yes' if chi_square['significant_difference'] else 'No'}")

# Per-location summary
print("\nSummary Statistics by Location:")
print("-" * 40)

location_summary = statistical_results['location_summary']
if not isinstance(location_summary, pd.DataFrame):
    # Anything that is not a DataFrame is printed as-is
    print(location_summary)
else:
    # DataFrames are rounded to two decimals before printing
    formatted_summary = location_summary.round(2)
    print(formatted_summary.to_string())

Performing statistical analysis...
Statistical analysis completed!
Statistical Analysis Results

Chi-Square Test Results:
------------------------------
  Chi-square statistic: 122.5705
  P-value: 0.0000
  Degrees of freedom: 9
  Significant difference: Yes

Summary Statistics by Location:
----------------------------------------
                      animal_id age_clean              animal_type animal_breed
                          count      mean median   std     nunique      nunique
location                                                                       
King County, WA             563      3.24    1.0  3.73           9           96
Montgomery County, MD        95      5.00    3.0  4.10           3           28


In [6]:
# Run the clustering step with three clusters
clustering_results = analyzer.perform_clustering_analysis(n_clusters=3)

if not clustering_results:
    print("Clustering analysis could not be completed.")
else:
    print("Clustering Analysis Results")
    print("=" * 60)

    # Per-cluster record count and its share of all rows in combined_df
    cluster_dist_data = [
        {
            'Cluster': f"Cluster {cluster}",
            'Records': f"{count:,}",
            'Percentage': f"{count / len(combined_df) * 100:.1f}%",
        }
        for cluster, count in clustering_results['cluster_distribution'].items()
    ]

    cluster_dist_df = pd.DataFrame(cluster_dist_data)
    print("\nCluster Distribution:")
    print("-" * 30)
    print(cluster_dist_df.to_string(index=False))

    # Per-cluster size, most common pet type and most common location
    print("\nCluster Characteristics:")
    print("-" * 30)

    characteristics_data = [
        {
            'Cluster': cluster_name.replace('_', ' ').title(),
            'Size': f"{characteristics['size']:,} records",
            'Top Pet Type': characteristics['top_pet_type'],
            'Top Location': characteristics['top_location'],
        }
        for cluster_name, characteristics in clustering_results['cluster_characteristics'].items()
    ]

    characteristics_df = pd.DataFrame(characteristics_data)
    print(characteristics_df.to_string(index=False))

Performing clustering analysis...
Clustering analysis completed!
Clustering Analysis Results

Cluster Distribution:
------------------------------
  Cluster Records Percentage
Cluster 0     185      28.1%
Cluster 1     124      18.8%
Cluster 2     349      53.0%

Cluster Characteristics:
------------------------------
  Cluster        Size Top Pet Type    Top Location
Cluster 0 185 records          Cat King County, WA
Cluster 1 124 records          Dog King County, WA
Cluster 2 349 records          Cat King County, WA


In [7]:
# Initialize the Visualizer (imported from src.visualizer in the setup cell)
# with the combined dataset; the chart cells below call methods on this object.
visualizer = Visualizer(combined_df)

In [8]:
# Interactive Plotly dashboard: report a failure, otherwise render it
plotly_dashboard = visualizer.create_plotly_dashboard()
if not plotly_dashboard:
    print("Plotly dashboard creation failed.")
else:
    plotly_dashboard.show()

# Stand-alone pet type distribution chart, handled the same way
pet_type_chart = visualizer.create_pet_type_distribution_chart()
if not pet_type_chart:
    print("Pet Type Distribution chart creation failed.")
else:
    pet_type_chart.show()

## Pet Type Distribution Analysis (ABOVE)

The pie chart shows the **breakdown of different animal types** in the shelter system.

### Key Findings:
- **Cats**: Most numerous (approximately 380 animals)
- **Dogs**: Second most common (approximately 230 animals)
- **Other Animals**: Much smaller numbers (birds, rabbits, etc.)

### Business Implications:
- **Resource Allocation**: Cats and dogs require the most shelter resources
- **Adoption Focus**: Majority of adoptable pets are cats and dogs
- **Specialized Care**: Smaller animal types need specialized attention

In [9]:
# Create the interactive pet type distribution chart.
# If the call returns a falsy value, nothing is shown and no message is printed.
pet_type_fig = visualizer.create_interactive_pet_type_distribution_chart()
if pet_type_fig:
    pet_type_fig.show()



In [10]:
# Create interactive age-gender scatter plot.
# If the call returns a falsy value, nothing is shown and no message is printed.
age_gender_fig = visualizer.create_interactive_age_gender_scatter_plot()
if age_gender_fig:
    age_gender_fig.show()

In [11]:
# Create interactive breed treemap chart.
# If the call returns a falsy value, nothing is shown and no message is printed.
breed_fig = visualizer.create_interactive_breed_treemap_chart()
if breed_fig:
    breed_fig.show()

In [12]:
# Create King County recovery and pet type distribution charts from the
# metrics computed earlier (all_metrics). The call returns a mapping of
# name -> figure; falsy entries are skipped without a message.
custom_figs = visualizer.create_king_county_recovery_and_pet_type_charts(all_metrics)
# `name` is not used in the loop body; only the figure itself is shown.
for name, fig in custom_figs.items():
    if fig:
        fig.show()

## Pet Recovery Analysis (ABOVE)

The visualization above shows the **King County Pet Recovery Analysis** based on actual data from the King County CSV file.

### What This Shows:
- **Red Slice**: LOST pets (still missing)
- **Green Slice**: FOUND pets (successfully recovered) 
- **Blue Slice**: ADOPTABLE pets (available for adoption)

### Business Insights:
- **Recovery Rate**: Percentage of pets successfully found vs total
- **Adoption Opportunity**: Number of pets available for adoption
- **Lost Pet Challenge**: Pets still missing and needing attention

### Data Source:
This analysis uses the actual `record_type` field from the King County, WA dataset to provide accurate recovery statistics.



In [13]:
# King County adoption fee bar chart: report missing data, otherwise render it
adoption_fee_fig = visualizer.create_king_county_adoption_fee_bar_chart()
if not adoption_fee_fig:
    print("No adoption fee data found in King County descriptions.")
else:
    adoption_fee_fig.show()

## King County Adoption Fee Analysis

This visualization shows the actual adoption fees charged by King County Animal Services based on real data extracted from their pet records.

### What This Shows:
- Real Fee Data: Extracted from the "Memo" column containing adoption fee information
- Average Fees by Animal Type: Shows what different animals typically cost to adopt
- Fee Range: $15-$250 across all animals
- Total Animals: 64 pets with published adoption fees

### Key Findings:
- Dogs: Highest average fee at $131.11
- Cats: Moderate average fee at $39.66
- Small Animals: Lower fees for Dwarf ($30), Rabbit Sh ($25), and Rex ($30)
- Overall Average: $76.56 across all animals

### Business Insights:
- Revenue Optimization: Dogs generate the highest revenue per adoption
- Pricing Strategy: Clear tiered pricing based on animal type
- Market Analysis: Understanding actual cost structure of pet adoption
- Financial Planning: Real revenue potential based on actual fees