In [5]:
import json
import warnings
from collections import Counter
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')


In [6]:
# PART 1: Load and Explore Data

def load_json_data(filepath):
    """
    Load and parse a JSON document from disk.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever `json.load` produces for the file).

    Raises:
        FileNotFoundError: if `filepath` does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    # 'utf-8-sig' decodes plain UTF-8 and also strips a leading byte-order mark.
    # Relying on the platform default encoding mis-decodes non-ASCII text on
    # non-UTF-8 locales, and a BOM would otherwise make json.load fail.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

def explore_data(data):
    """
    Print a short overview of the dataset and return its shipment records.

    Every item of `data` contributes the entries of its 'trackDetails' value
    (items without that key contribute nothing); the entries are collected
    into one flat list in their original order.
    """
    banner = "=" * 80
    print(banner)
    print("DATA EXPLORATION")
    print(banner)
    print(f"Number of records in dataset: {len(data)}")

    # Flatten the per-record 'trackDetails' collections into a single list
    shipment_records = []
    for item in data:
        shipment_records.extend(item.get('trackDetails', []))
    print(f"Total shipments to analyze: {len(shipment_records)}")

    # Show up to ten keys of the first shipment as a schema hint
    if shipment_records:
        first_shipment = shipment_records[0]
        print("\nSample shipment keys:")
        print(list(first_shipment.keys())[:10])

    return shipment_records


In [7]:
# PART 2: Flatten and Extract Transit Data

def safe_get(dictionary, *keys, default=None):
    """
    Walk a chain of keys through nested dictionaries without raising.

    Returns `default` as soon as the current level is not a dict, and also
    when the value finally reached is None. With no keys, `dictionary`
    itself is returned (or `default` if it is None).
    """
    current = dictionary
    for key in keys:
        if not isinstance(current, dict):
            return default
        current = current.get(key, default)
    return default if current is None else current

# India Standard Time (UTC+05:30); every parsed timestamp is expressed on this clock
_IST = timezone(timedelta(hours=5, minutes=30))


def _to_naive_ist(moment):
    """Convert an aware datetime to IST wall-clock time without tzinfo; naive values pass through."""
    if moment.tzinfo is None:
        return moment
    return moment.astimezone(_IST).replace(tzinfo=None)


def parse_timestamp(ts_value):
    """
    Parse a timestamp given in one of several formats into a naive IST datetime.

    Accepted inputs:
      * a dict with a '$numberLong' entry holding epoch milliseconds
        (MongoDB extended JSON),
      * an int/float holding epoch milliseconds,
      * a string shaped like 'YYYY-MM-DDTHH:MM:SS' (optionally with fractional
        seconds and/or a UTC offset such as '+05:30' or 'Z') or 'YYYY-MM-DD'.

    Epoch values and offset-aware strings are converted to IST explicitly, so
    the result does not depend on the timezone of the machine running the
    code. Strings without an offset are taken as already being IST wall-clock
    time. Fractional seconds in strings are discarded.

    Returns:
        A timezone-naive `datetime` in IST, or None when the value is None or
        cannot be parsed.
    """
    if ts_value is None:
        return None

    try:
        if isinstance(ts_value, str):
            text = ts_value.strip()
            # Remove fractional seconds but keep what follows them: the UTC
            # offset must survive so that non-IST timestamps are converted
            # rather than silently read as IST.
            head, dot, tail = text.partition('.')
            if dot:
                text = head + tail.lstrip('0123456789')

            for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                try:
                    parsed = datetime.strptime(text, fmt)
                except ValueError:
                    continue
                return _to_naive_ist(parsed)
            return None

        if isinstance(ts_value, dict) and '$numberLong' in ts_value:
            epoch_ms = int(ts_value['$numberLong'])
        elif isinstance(ts_value, (int, float)):
            epoch_ms = ts_value
        else:
            return None

        # Epoch milliseconds -> aware IST datetime -> naive IST wall-clock time
        return datetime.fromtimestamp(epoch_ms / 1000.0, tz=_IST).replace(tzinfo=None)

    except (ValueError, TypeError, OverflowError, OSError):
        # Best-effort parsing: malformed or out-of-range values yield None
        return None

def extract_shipment_data(shipment):
    """
    Flatten one raw shipment record into a single-level dictionary.

    Every field is looked up with `safe_get`, so missing keys fall back to
    '' (text fields), 0 (weight value), 'KG' (weight units) or [] (events).
    A weight whose units are 'LB' is multiplied by 0.453592; any other unit
    is passed through unchanged as `package_weight_kg`.
    """
    def field(*path, default=''):
        # Lookup relative to this shipment, defaulting to an empty string
        return safe_get(shipment, *path, default=default)

    # Weight: normalise pounds, leave everything else as reported
    weight_value = field('packageWeight', 'value', default=0)
    weight_units = field('packageWeight', 'units', default='KG')
    weight_kg = weight_value * 0.453592 if weight_units == 'LB' else weight_value

    return {
        # Identifiers and service
        'tracking_number': field('trackingNumber'),
        'service_type': field('service', 'type'),
        'service_description': field('service', 'description'),
        'carrier_code': field('carrierCode'),
        # Package
        'package_weight_kg': weight_kg,
        'packaging_type': field('packaging', 'type'),
        # Origin (shipper address)
        'origin_city': field('shipperAddress', 'city'),
        'origin_state': field('shipperAddress', 'stateOrProvinceCode'),
        'origin_pincode': field('shipperAddress', 'postalCode'),
        # Destination
        'destination_city': field('destinationAddress', 'city'),
        'destination_state': field('destinationAddress', 'stateOrProvinceCode'),
        'destination_pincode': field('destinationAddress', 'postalCode'),
        # Delivery characteristics and raw event list
        'delivery_location_type': field('deliveryLocationType'),
        'events': field('events', default=[]),
    }


In [8]:
# PART 3: Compute Transit Performance Metrics
def compute_transit_metrics(shipment_data):
    """
    Derive transit performance metrics from a flattened shipment.

    Reads `shipment_data['events']` and `shipment_data['service_type']` and
    returns a dict of metrics: facility counts, event-type tallies, pickup
    and delivery timestamps, transit durations in hours, express
    classification and out-for-delivery statistics. A shipment without
    events gets the zero/None defaults.
    """
    events = shipment_data['events']

    # Defaults; returned untouched when the shipment has no events
    metrics = {
        'num_facilities_visited': 0,
        'facility_list': [],
        'event_type_counts': {},
        'unique_event_types': set(),
        'total_transit_hours': 0,
        'pickup_datetime_ist': None,
        'delivery_datetime_ist': None,
        'time_in_inter_facility_transit_hours': 0,
        'avg_hours_per_facility': 0,
        'is_express_service': False,
        'num_out_for_delivery_attempts': 0,
        'first_attempt_delivery': False,
        'total_events_count': len(events),
        'num_in_transit_events': 0
    }

    if not events:
        return metrics

    out_for_delivery_codes = ('OD', 'OFD')
    in_transit_codes = ('IT', 'DP', 'AR')
    pickup_codes = ('PU', 'PL', 'OC', 'PK')
    delivery_codes = ('DL', 'DD', 'DEL')

    facilities = metrics['facility_list']
    type_counts = metrics['event_type_counts']
    seen_types = metrics['unique_event_types']

    # ---- Single pass over the raw events --------------------------------
    timeline = []
    for raw_event in events:
        code = safe_get(raw_event, 'eventType', default='')
        description = safe_get(raw_event, 'eventDescription', default='')
        when = parse_timestamp(safe_get(raw_event, 'timestamp'))
        location = safe_get(raw_event, 'arrivalLocation', default='')
        city = safe_get(raw_event, 'address', 'city', default='')

        # Only events with a usable timestamp take part in time calculations
        if when:
            timeline.append({
                'timestamp': when,
                'event_type': code,
                'event_desc': description,
                'arrival_location': location,
                'city': city
            })

        # Tally event types
        if code:
            seen_types.add(code)
            type_counts[code] = type_counts.get(code, 0) + 1

        # Remember each distinct facility-type arrival location once
        if location and 'FACILITY' in location.upper() and location not in facilities:
            facilities.append(location)

        # Out-for-delivery is recognised by event code or by description text
        if code in out_for_delivery_codes or 'OUT FOR DELIVERY' in description.upper():
            metrics['num_out_for_delivery_attempts'] += 1

        if code in in_transit_codes:
            metrics['num_in_transit_events'] += 1

    # Chronological order for everything below
    timeline.sort(key=lambda entry: entry['timestamp'])

    # ---- Pickup / delivery timestamps and total transit time -------------
    if timeline:
        pickups = [entry for entry in timeline if entry['event_type'] in pickup_codes]
        deliveries = [entry for entry in timeline if entry['event_type'] in delivery_codes]

        # Earliest pickup event, else the first event of the shipment
        picked_up_at = (pickups or timeline)[0]['timestamp']
        # Latest delivery event, else the last event of the shipment
        delivered_at = (deliveries or timeline)[-1]['timestamp']

        metrics['pickup_datetime_ist'] = picked_up_at
        metrics['delivery_datetime_ist'] = delivered_at

        if picked_up_at and delivered_at:
            elapsed = delivered_at - picked_up_at
            metrics['total_transit_hours'] = elapsed.total_seconds() / 3600.0

    # ---- Facility metrics ------------------------------------------------
    facility_count = len(facilities)
    metrics['num_facilities_visited'] = facility_count

    if facility_count > 0 and metrics['total_transit_hours'] > 0:
        metrics['avg_hours_per_facility'] = metrics['total_transit_hours'] / facility_count

    # Approximate inter-facility time: gaps between consecutive timestamped
    # events whose arrival location is a facility
    facility_stops = [entry for entry in timeline
                      if 'FACILITY' in entry['arrival_location'].upper()]
    if len(facility_stops) > 1:
        hours_between = 0
        for earlier, later in zip(facility_stops, facility_stops[1:]):
            gap = later['timestamp'] - earlier['timestamp']
            hours_between += gap.total_seconds() / 3600.0
        metrics['time_in_inter_facility_transit_hours'] = hours_between

    # ---- Service classification -------------------------------------------
    service_label = shipment_data['service_type'].upper()
    express_markers = ('EXPRESS', 'PRIORITY', 'OVERNIGHT', 'NEXT_DAY', 'FIRST', 'SAVER')
    metrics['is_express_service'] = any(marker in service_label for marker in express_markers)

    # At most one out-for-delivery event counts as a first-attempt delivery
    metrics['first_attempt_delivery'] = metrics['num_out_for_delivery_attempts'] <= 1

    return metrics


In [9]:
# PART 4 & 5: Process All Shipments and Create Detailed CSV

def process_all_shipments(shipment_records):
    """
    Turn raw shipment records into one DataFrame row per shipment.

    Each shipment is flattened with `extract_shipment_data` and scored with
    `compute_transit_metrics`. A shipment that raises during processing is
    reported on stdout and left out of the result.
    """
    # (output column, where the value comes from, round to 2 decimals?)
    # The order of this table is the column order of the resulting frame.
    layout = (
        ('tracking_number', 'shipment', False),
        ('service_type', 'shipment', False),
        ('carrier_code', 'shipment', False),
        ('package_weight_kg', 'shipment', True),
        ('packaging_type', 'shipment', False),
        ('origin_city', 'shipment', False),
        ('origin_state', 'shipment', False),
        ('origin_pincode', 'shipment', False),
        ('destination_city', 'shipment', False),
        ('destination_state', 'shipment', False),
        ('destination_pincode', 'shipment', False),
        ('pickup_datetime_ist', 'metrics', False),
        ('delivery_datetime_ist', 'metrics', False),
        ('total_transit_hours', 'metrics', True),
        ('num_facilities_visited', 'metrics', False),
        ('num_in_transit_events', 'metrics', False),
        ('time_in_inter_facility_transit_hours', 'metrics', True),
        ('avg_hours_per_facility', 'metrics', True),
        ('is_express_service', 'metrics', False),
        ('delivery_location_type', 'shipment', False),
        ('num_out_for_delivery_attempts', 'metrics', False),
        ('first_attempt_delivery', 'metrics', False),
        ('total_events_count', 'metrics', False),
    )

    rows = []
    for shipment in shipment_records:
        try:
            flattened = extract_shipment_data(shipment)
            sources = {
                'shipment': flattened,
                'metrics': compute_transit_metrics(flattened),
            }

            row = {}
            for column, origin, needs_rounding in layout:
                value = sources[origin][column]
                row[column] = round(value, 2) if needs_rounding else value

            rows.append(row)

        except Exception as e:
            # Skip the offending shipment but keep processing the rest
            print(f"Error processing shipment: {e}")
            continue

    return pd.DataFrame(rows)

In [11]:
# PART 6: Generate Network Performance Summary

def generate_summary(df):
    """
    Generate network performance summary statistics.

    Args:
        df: Detailed per-shipment frame with (at least) the columns
            'total_transit_hours', 'num_facilities_visited',
            'avg_hours_per_facility', 'service_type', 'tracking_number',
            'first_attempt_delivery' and 'num_out_for_delivery_attempts'.

    Returns:
        A flat dict of summary values. Besides the fixed keys it contains one
        'avg_transit_hours_<type>', 'avg_facilities_<type>' and
        'count_shipments_<type>' entry per distinct service type. For an
        empty frame only 'total_shipments_analyzed' (0 rows) is returned.
    """
    summary = {'total_shipments_analyzed': len(df)}

    # An empty frame (e.g. no shipment could be processed) may have no columns
    # at all; without this guard the column lookups below raise KeyError and
    # the percentage divides by zero.
    if df.empty:
        return summary

    # Overall Metrics
    summary['avg_transit_hours'] = round(df['total_transit_hours'].mean(), 2)
    summary['median_transit_hours'] = round(df['total_transit_hours'].median(), 2)
    summary['std_dev_transit_hours'] = round(df['total_transit_hours'].std(), 2)
    summary['min_transit_hours'] = round(df['total_transit_hours'].min(), 2)
    summary['max_transit_hours'] = round(df['total_transit_hours'].max(), 2)

    # Facility Metrics
    summary['avg_facilities_per_shipment'] = round(df['num_facilities_visited'].mean(), 2)
    summary['median_facilities_per_shipment'] = round(df['num_facilities_visited'].median(), 2)

    # mode() can return several values (ties); the first one is reported
    mode_result = df['num_facilities_visited'].mode()
    summary['mode_facilities_per_shipment'] = int(mode_result.iloc[0]) if len(mode_result) > 0 else 0

    summary['avg_hours_per_facility'] = round(df['avg_hours_per_facility'].mean(), 2)
    summary['median_hours_per_facility'] = round(df['avg_hours_per_facility'].median(), 2)

    # Service Type Comparison: one trio of keys per service type
    service_groups = df.groupby('service_type').agg({
        'total_transit_hours': 'mean',
        'num_facilities_visited': 'mean',
        'tracking_number': 'count'
    }).round(2)

    for service_type, row in service_groups.iterrows():
        summary[f'avg_transit_hours_{service_type}'] = row['total_transit_hours']
        summary[f'avg_facilities_{service_type}'] = row['num_facilities_visited']
        # iterrows yields the count as a float; store it as a plain int
        summary[f'count_shipments_{service_type}'] = int(row['tracking_number'])

    # Delivery Performance
    summary['pct_first_attempt_delivery'] = round(
        (df['first_attempt_delivery'].sum() / len(df)) * 100, 2
    )
    summary['avg_out_for_delivery_attempts'] = round(
        df['num_out_for_delivery_attempts'].mean(), 2
    )

    return summary


In [14]:
# MAIN EXECUTION

def main(json_filepath,
         output_detailed='transit_performance_detailed.csv',
         output_summary='transit_performance_summary.csv'):
    """
    Run the full transit performance analysis for one JSON export.

    Loads the file, processes every shipment, writes the detailed and the
    summary CSV files, and prints a preview of both.

    Args:
        json_filepath: Path of the JSON file to analyse.
        output_detailed: Where the per-shipment CSV is written. Defaults to
            the file name the analysis has always used.
        output_summary: Where the one-row summary CSV is written. Defaults to
            the file name the analysis has always used.

    Returns:
        Tuple `(detailed_df, summary_df)`: the per-shipment frame and the
        one-row summary frame.

    Raises:
        FileNotFoundError: if `json_filepath` does not exist.
    """
    print("Starting Transit Performance Analysis...")
    print()

    # Load data
    data = load_json_data(json_filepath)
    shipment_records = explore_data(data)
    print()

    # Process all shipments
    print("Processing shipments and calculating metrics...")
    detailed_df = process_all_shipments(shipment_records)
    print(f"Successfully processed {len(detailed_df)} shipments")
    print()

    # Save detailed CSV
    detailed_df.to_csv(output_detailed, index=False)
    print(f"✓ Saved detailed results to: {output_detailed}")

    # Generate and save summary (a single row, one column per statistic)
    summary = generate_summary(detailed_df)
    summary_df = pd.DataFrame([summary])
    summary_df.to_csv(output_summary, index=False)
    print(f"✓ Saved summary results to: {output_summary}")
    print()

    # Display preview
    print("=" * 80)
    print("PREVIEW OF RESULTS")
    print("=" * 80)
    print("\nDetailed Data (first 5 rows):")
    print(detailed_df.head())
    print("\nSummary Statistics:")
    # Transposed so each statistic is printed on its own line
    print(summary_df.transpose())
    print()

    print("Analysis complete!")
    return detailed_df, summary_df

# ============================================================================
# RUN THE ANALYSIS
# ============================================================================

if __name__ == "__main__":
    # Input file for the analysis; point this at your own JSON export
    json_file = 'Swift Assignment 4 - Dataset (1).json'

    try:
        detailed_df, summary_df = main(json_file)
    except FileNotFoundError:
        # The input path is wrong: tell the user what to change
        print(f"Error: Could not find file '{json_file}'")
        print("Please update the 'json_file' variable with the correct path to your JSON file.")
    except Exception as exc:
        # Anything else: report it with the full traceback for debugging
        import traceback
        print(f"An error occurred: {exc}")
        traceback.print_exc()

Starting Transit Performance Analysis...

DATA EXPLORATION
Number of records in dataset: 99
Total shipments to analyze: 99

Sample shipment keys:
['notification', 'trackingNumber', 'trackingNumberUniqueIdentifier', 'statusDetail', 'informationNotes', 'customerExceptionRequests', 'carrierCode', 'operatingCompanyOrCarrierDescription', 'otherIdentifiers', 'service']

Processing shipments and calculating metrics...
Successfully processed 99 shipments

✓ Saved detailed results to: transit_performance_detailed.csv
✓ Saved summary results to: transit_performance_summary.csv

PREVIEW OF RESULTS

Detailed Data (first 5 rows):
  tracking_number         service_type carrier_code  package_weight_kg  \
0    391128701026  FEDEX_EXPRESS_SAVER         FDXE               14.0   
1    390901883808  FEDEX_EXPRESS_SAVER         FDXE               14.0   
2    391128749178  FEDEX_EXPRESS_SAVER         FDXE               14.0   
3    390807986805  FEDEX_EXPRESS_SAVER         FDXE               14.0   
4    

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the detailed performance data generated in the previous step.
# Identifier columns are read as strings: left to type inference, pandas turns
# all-digit tracking numbers and pincodes into integers and drops leading zeros.
detailed_df = pd.read_csv(
    'transit_performance_detailed.csv',
    dtype={'tracking_number': str, 'origin_pincode': str, 'destination_pincode': str},
)

# --- 1. Data Cleaning for Visualization ---
# Drop rows where total_transit_hours is NaN to clean the analysis scope.
# NOTE(review): the processing step writes 0 (not NaN) for shipments without
# usable events, so this is normally a no-op guard — confirm whether zero-hour
# shipments should be excluded as well.
df_clean = detailed_df.dropna(subset=['total_transit_hours']).copy()

# --- 2. Create Visualizations (Graphs) ---

# Set a consistent style for better presentation
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# 2.1. Histogram of Total Transit Hours (Distribution Analysis)
plt.figure(figsize=(10, 6))
sns.histplot(df_clean['total_transit_hours'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Total Transit Time (Hours)')
plt.xlabel('Total Transit Hours')
plt.ylabel('Number of Shipments')
plt.tight_layout()
plt.savefig('transit_time_distribution.png')
plt.close()  # figures are saved to disk, not shown inline

# 2.2. Comparative Bar Chart: Average Transit Hours by Origin State (Performance Comparison)
# Group by origin state and calculate mean transit time, slowest state first
state_performance = df_clean.groupby('origin_state')['total_transit_hours'].mean().reset_index()
state_performance = state_performance.sort_values(by='total_transit_hours', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='origin_state', y='total_transit_hours', data=state_performance, palette='viridis')
plt.title('Average Transit Hours by Origin State')
plt.xlabel('Origin State')
plt.ylabel('Average Transit Hours')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('avg_transit_time_by_state.png')
plt.close()

# 2.3. Bar Chart of Facility Touchpoints Distribution
# Number of shipments per facility count, ordered by facility count
facility_dist = df_clean['num_facilities_visited'].value_counts().sort_index().reset_index()
facility_dist.columns = ['num_facilities_visited', 'count']

plt.figure(figsize=(10, 6))
sns.barplot(x='num_facilities_visited', y='count', data=facility_dist, palette='magma')
plt.title('Distribution of Facility Touchpoints per Shipment')
plt.xlabel('Number of Unique Facilities Visited')
plt.ylabel('Number of Shipments')
plt.tight_layout()
plt.savefig('facility_touchpoints_distribution.png')
plt.close()

# --- 3. Advanced Analysis: Outlier Detection (Finding the "Worst Performers") ---

# Identify the top 5 shipments with the longest transit time
worst_performers = df_clean.sort_values(by='total_transit_hours', ascending=False).head(5)

# Select relevant columns for display
worst_performers_summary = worst_performers[[
    'tracking_number', 
    'origin_city', 
    'destination_city', 
    'total_transit_hours', 
    'num_facilities_visited',
    'first_attempt_delivery'
]]

# Print the table: this is the only visible output of the section (the charts
# above are written to PNG files and closed).
print("\nTop 5 shipments by total transit hours:")
print(worst_performers_summary.to_string(index=False))

ModuleNotFoundError: No module named 'matplotlib'