In [4]:
import pandas as pd
import geopandas
import matplotlib.pyplot as plt
import numpy as np
 
# Fixed seed for the generator behind the rng.choice calls further down, so the
# sequence of random picks is repeatable when the notebook is run top to bottom.
SEED = 42
rng = np.random.default_rng(SEED)

In [5]:
import sys
import os

# Adjust the path to the 'preprocess' folder relative to your notebook
preprocess_path = os.path.abspath(os.path.join('..', '..', 'preprocess'))
# Re-running this cell must not stack duplicate entries on sys.path
if preprocess_path not in sys.path:
    sys.path.append(preprocess_path)

# Import the project helpers used by the cells below
try:
    from plot_transfer import plot_transfer
    from load_flight_data import load_flight_data
    from find_transfers import find_transfers
    print("Modules imported successfully")
except ModuleNotFoundError as e:
    print(f"Error importing modules: {e}")
    # Every later cell calls these helpers, so stop here instead of letting the
    # notebook continue and fail later with an unrelated NameError.
    raise

Modules imported successfully


In [6]:
# Load the flight data in both drop_last variants so their effect on the inferred
# transfers can be compared.
flight_data_dropped = load_flight_data(drop_last=True)
flight_data_not_dropped = load_flight_data(drop_last=False)

# The review cells further down refer to plain `flight_data`, which no cell defines,
# so a fresh "Restart & Run All" stops with a NameError. Bind it explicitly.
# NOTE(review): assuming the drop_last=True variant is the intended working set
# (it is the one passed to extract_entries_and_exits) - confirm.
flight_data = flight_data_dropped

In [8]:
# Infer transfers for both flight-data variants with identical settings. With
# remove_outliers=False the rows are kept; later cells filter on the
# transit_time_outlier column instead.
transfer_flight_data_dropped = find_transfers(d=flight_data_dropped, max_transit_time=3, remove_outliers=False, outlier_factor=2, outlier_offset=0)
transfer_flight_data_not_dropped = find_transfers(d=flight_data_not_dropped, max_transit_time=3, remove_outliers=False, outlier_factor=2, outlier_offset=0)

# The review cells further down refer to plain `transfer_flight_data`, which no cell
# defines, so a fresh "Restart & Run All" stops with a NameError. Bind it explicitly.
# NOTE(review): assuming the drop_last=True variant is the intended working set - confirm.
transfer_flight_data = transfer_flight_data_dropped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [26]:
# transfer_flight_data with last row included = 7908 transfers
# Shapes of the non-outlier subsets, in the order (drop_last=True, drop_last=False).
non_outlier_filter = "transit_time_outlier == False"
(
    transfer_flight_data_dropped.query(non_outlier_filter).shape,
    transfer_flight_data_not_dropped.query(non_outlier_filter).shape,
)

((7201, 22), (7257, 22))

In [None]:
# Run extract_entries_and_exits on the drop_last=True flight data. The result is
# displayed in the next cell and filtered by date and registration in the last
# code cell of the notebook.
from find_transfers import extract_entries_and_exits
entries_and_exits = extract_entries_and_exits(flight_data_dropped)

In [None]:
# Display the output of extract_entries_and_exits for a visual check
entries_and_exits

## Test inferred potential transfer flights

### Non-outliers (transit time at most 2 × the expected transit time, plus a 5-minute offset)
The fixed offset is there so that very short expected travel times do not produce an unreasonably tight threshold.

In [None]:
from IPython.display import display, clear_output

# Manual review of randomly drawn non-outlier transfers: show the transfer row and
# its map, then record the reviewer's y/n verdict until 'exit' is entered.

# Columns shown to the reviewer alongside the map
review_columns = ['transfer_id', 'hospital_name_sending', 'hospital_name_receiving', 'time_in_zone_sending', 'expected_transit_time', 'transit_time', 'transit_time_ratio']

# The candidate pool does not change while reviewing, so query it once
non_outlier_ids = transfer_flight_data.query('transit_time_outlier == False').transfer_id

# Verdicts are collected as plain records and converted to a DataFrame once at the
# end. Growing a DataFrame with pd.concat from an empty frame on every iteration
# relies on deprecated pandas behavior and copies the whole frame each time.
non_outlier_records = []

try:
    while True:
        # Select a random transfer ID from non-outliers
        transfer_id = rng.choice(non_outlier_ids)

        # Show progress and the relevant row of transfer_flight_data
        print(f"Number of maps classified: {len(non_outlier_records)}")
        display(transfer_flight_data[transfer_flight_data['transfer_id'] == transfer_id][review_columns])

        # Plot the selected transfer using the plot_transfer function
        m = plot_transfer(flight_data, transfer_flight_data, transfer_id)
        display(m)

        # Prompt until a valid answer is given
        classification = input(f'Classify Transfer ID {transfer_id} (y/n): ').strip().lower()
        while classification not in ('y', 'n', 'exit'):
            print("Invalid input. Please enter 'y' or 'n'. To exit, type 'exit'.")
            classification = input(f'Classify Transfer ID {transfer_id} (y/n): ').strip().lower()

        # 'exit' ends the session without recording a verdict for this transfer
        if classification == 'exit':
            print("Exiting...")
            break

        non_outlier_records.append({'transfer_id': int(transfer_id), 'correctly_classified': classification})

        # Clear the output to remove the previous plot
        clear_output(wait=True)
finally:
    # Built in `finally` so verdicts entered so far survive a kernel interrupt
    classification_results_non_outliers = pd.DataFrame(
        non_outlier_records, columns=['transfer_id', 'correctly_classified']
    ).astype({'transfer_id': 'int', 'correctly_classified': 'str'})

In [None]:
# Convert the y/n verdicts to booleans, then report how many transfers were reviewed
# and the share that was marked 'y'.
verdict_to_bool = {'y': True, 'n': False}
classification_results_non_outliers['correctly_classified_bool'] = (
    classification_results_non_outliers['correctly_classified'].map(verdict_to_bool)
)

n_reviewed = len(classification_results_non_outliers)
n_marked_correct = classification_results_non_outliers['correctly_classified_bool'].sum()
print("No samples:", n_reviewed)
print("Specificity:", n_marked_correct / n_reviewed)

In [None]:
# Save classification results to CSV
from datetime import datetime

# File-system-safe timestamp: no spaces or colons (colons are not allowed in
# Windows file names).
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
file_name = f'classification_results_non_outliers_{timestamp}.csv'
classification_results_non_outliers.to_csv(file_name, index=False)

## Outliers

In [None]:
# Manual review of randomly drawn transfers flagged as transit-time outliers: show
# the transfer row and its map, then record the reviewer's y/n verdict until 'exit'
# is entered.

# Columns shown to the reviewer alongside the map
review_columns = ['transfer_id', 'hospital_name_sending', 'hospital_name_receiving', 'time_in_zone_sending', 'expected_transit_time', 'transit_time', 'transit_time_ratio']

# The candidate pool does not change while reviewing, so query it once
outlier_ids = transfer_flight_data.query('transit_time_outlier == True').transfer_id

# Verdicts go into their own list. The previous version concatenated each new row
# onto classification_results_non_outliers, so the outlier results ended up as the
# non-outlier results plus the most recent outlier verdict.
outlier_records = []

try:
    while True:
        # Select a random transfer ID from outliers
        transfer_id = rng.choice(outlier_ids)

        # Show progress and the relevant row of transfer_flight_data
        print(f"Number of maps classified: {len(outlier_records)}")
        display(transfer_flight_data[transfer_flight_data['transfer_id'] == transfer_id][review_columns])

        # Plot the selected transfer using the plot_transfer function
        m = plot_transfer(flight_data, transfer_flight_data, transfer_id)
        display(m)

        # Prompt until a valid answer is given
        classification = input(f'Classify Transfer ID {transfer_id} (y/n): ').strip().lower()
        while classification not in ('y', 'n', 'exit'):
            print("Invalid input. Please enter 'y' or 'n'. To exit, type 'exit'.")
            classification = input(f'Classify Transfer ID {transfer_id} (y/n): ').strip().lower()

        # 'exit' ends the session without recording a verdict for this transfer
        if classification == 'exit':
            print("Exiting...")
            break

        outlier_records.append({'transfer_id': int(transfer_id), 'correctly_classified': classification})

        # Clear the output to remove the previous plot
        clear_output(wait=True)
finally:
    # Built in `finally` so verdicts entered so far survive a kernel interrupt
    classification_results_outliers = pd.DataFrame(
        outlier_records, columns=['transfer_id', 'correctly_classified']
    ).astype({'transfer_id': 'int', 'correctly_classified': 'str'})

In [None]:
# Convert the y/n verdicts to booleans, then report the share of reviewed outliers
# that was marked 'y'.
verdict_to_bool = {'y': True, 'n': False}
classification_results_outliers['correctly_classified_bool'] = (
    classification_results_outliers['correctly_classified'].map(verdict_to_bool)
)

n_marked_correct = classification_results_outliers['correctly_classified_bool'].sum()
n_reviewed = len(classification_results_outliers)
print("Specificity:", n_marked_correct / n_reviewed)

In [None]:
# Save classification results to CSV
from datetime import datetime

# File-system-safe timestamp: no spaces or colons (colons are not allowed in
# Windows file names).
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
file_name = f'classification_results_outliers_{timestamp}.csv'
classification_results_outliers.to_csv(file_name, index=False)

Conclusion:
* Of the included flights, almost all are possible transfer flights. Of those excluded based on transit time, almost all seem like reasonable exclusions.
* So far we can estimate that the specificity is > 95%
* What about sensitivity?

## Estimating sensitivity
Strictly speaking this is impossible, since we have no gold standard. However, if we map some random flights we can check whether the relevant ones are caught. This is manual labor...

In [None]:
# Rows of flight_data whose registration is 'SEJSK'; the next cell draws a random
# flight from this subset for manual checking.
jsk = flight_data.query("reg == 'SEJSK'")

In [None]:
# Pick one random flight from `jsk` and map it for manual inspection
check_flight = rng.choice(jsk['flight_id'])
check_flight_rows = jsk.query(f'flight_id == {check_flight}')

# Take the registration and date from the chosen flight itself. Drawing the
# registration with a separate rng.choice(jsk['reg']) only matches the flight as
# long as `jsk` holds a single aircraft.
check_flight_reg = check_flight_rows['reg'].iloc[0]
check_flight_day = check_flight_rows['date'].values[0]

check_flight_df = check_flight_rows[['geometry', 'speed', 'altitude', 'UTC_str']]
check_flight_df.explore()

In [None]:
# Inferred transfers whose sending-side exit falls on the day of the inspected
# flight, narrowed to the same aircraft registration.
same_day_mask = transfer_flight_data['UTC_out_sending'].dt.date == check_flight_day
flights_that_day = transfer_flight_data[same_day_mask]
flights_that_day.query(f"reg_sending=='{check_flight_reg}'")


In [None]:
# All flights that day for the inspected aircraft, shown on an interactive map
shown_columns = ['UTC_str', 'geometry', 'flight_id']
rows_on_check_day = flight_data[flight_data.UTC.dt.date == check_flight_day]
rows_on_check_day.query(f'reg == "{check_flight_reg}"')[shown_columns].explore()

In [None]:
# Get some data on the missing flight: its date, the day before, and the registration
from datetime import timedelta

missing_flight = 671141277
missing_flight_rows = flight_data.query(f"flight_id == {missing_flight}")

missing_flight_date = missing_flight_rows['UTC'].dt.date.iloc[0]
missing_flight_date_prior = missing_flight_date - timedelta(days=1)
missing_flight_reg = missing_flight_rows['reg'].iloc[0]

In [None]:
# Plot the flight on an interactive map
map_columns = ['zone_name', 'UTC_str', 'radius', 'speed', 'altitude', 'geometry']
missing_flight_points = flight_data.query(f"flight_id == {missing_flight}")
missing_flight_points[map_columns].explore()

In [None]:
# Look at inferred flights that day for the aircraft
on_missing_day = transfer_flight_data['UTC_out_sending'].dt.date == missing_flight_date
flights_that_day = transfer_flight_data[on_missing_day]
flights_that_day.query(f"reg_sending=='{missing_flight_reg}'")


In [None]:
# Get entries and exits for the aircraft on the day of the missing flight and the day before
dates_of_interest = (missing_flight_date_prior, missing_flight_date)
on_dates_of_interest = entries_and_exits['date'].isin(dates_of_interest)
entries_and_exits[on_dates_of_interest].query(f"reg == '{missing_flight_reg}'")

SE-JSK: 25/25

SE-JSN: 24/25

SE-JXA: 10/10

SE-JXB: 10/10

SE-JXC: 9/10

SE-JXD: 20/20

SE-JSL: 7/3 (some missing Visby->Karolinska)

SE-JSJ: 10/10

SE-JSG: 10/10

SE-JRA: 10/10