In [24]:
# Local Jupyter server: http://localhost:8888/tree (access token redacted - never commit tokens to a shared notebook)

In [30]:
# Interactive login for the gcloud CLI via an IPython shell escape; the recorded
# output below shows it opening a browser for the Google OAuth flow.
! gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=eDt3byy7cibumau9J8867S5LwYwY0S&access_type=offline&code_challenge=zZn07RGmRCsFFfeY1lS3rIoI5Qb4jzUBLOzTE0fQFQM&code_challenge_method=S256


You are now logged in as [sharedaiprojects@gmail.com].
Your current project is [aia-ds-accelerator-flight-1].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [32]:
# SERVICE_ACCOUNT = "flight-ml-demo-general@aia-ds-accelerator-flight-1.iam.gserviceaccount.com"  # @param {type:"string"}

In [29]:
from deltalake import DeltaTable
import pandas as pd
import gcsfs
import os
from utils import JSON_EncoderDecoder

# Notebook display settings: allow up to 100 columns and a very wide console
# width when frames are displayed. (The max_columns option was previously set
# twice with the same value; once is enough.)
pd.set_option("display.max_columns", 100)
pd.set_option('display.width', 4000)



def get_gcp_creds_json():
    """Return the decoded GCP credentials held in the environment.

    Reads the ``GCP_CREDENTIALS_JSON_ENCODED`` environment variable and passes
    it through ``JSON_EncoderDecoder(...).decode().get()``.

    Returns:
        Whatever ``JSON_EncoderDecoder.get()`` yields - presumably the parsed
        service-account key as a dict (TODO confirm against ``utils``).
    """
    import os

    encoded_creds = os.environ.get("GCP_CREDENTIALS_JSON_ENCODED")
    decoder = JSON_EncoderDecoder(encoded_creds)
    decoded = decoder.decode()
    return decoded.get()


def init_storage_client(project_id=None):
    """Build a Google Cloud Storage client from the service-account credentials.

    The credentials are obtained from ``get_gcp_creds_json()``, which is
    assumed to return the service-account key as a mapping (that is what
    ``from_service_account_info`` expects).

    Args:
        project_id: Optional GCP project to bind the client to. When omitted,
            the project recorded in the service-account credentials is used.

    Returns:
        A ``google.cloud.storage.Client`` authenticated with those credentials.
    """
    from google.oauth2 import service_account
    from google.cloud import storage

    gcp_creds_json = get_gcp_creds_json()

    # Use the local variable; the previous code referenced the undefined name
    # `gcp_credentials_json` and raised NameError.
    gcp_credentials = service_account.Credentials.from_service_account_info(gcp_creds_json)

    # `project_id` was previously read from an undefined global; default to the
    # project embedded in the service-account key instead.
    project = project_id or gcp_credentials.project_id
    storage_client = storage.Client(credentials=gcp_credentials, project=project)
    return storage_client


def get_date_from_lookback(lookback_days: int):
    """Return the UTC date ``lookback_days`` days ago, formatted ``YYYY/MM/DD``.

    Args:
        lookback_days: Number of days to subtract from the current UTC time.

    Returns:
        A string such as ``'2024/01/05'`` (month and day zero-padded).
    """
    from datetime import datetime, timedelta, timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # yields the same calendar date.
    target_date = datetime.now(timezone.utc) - timedelta(days=lookback_days)
    year, month, day = target_date.year, target_date.month, target_date.day
    return f'{year}/{month:02d}/{day:02d}'


def read_data_preprocessed(table_path='gs://datalake-flight-dev-1/flightsummary-delta-processed-stream',
                           lookback_days=365):
    """Load recent rows of the preprocessed flight Delta table into pandas.

    Args:
        table_path: URI of the Delta table to read.
        lookback_days: How many days back from today (UTC) to read; only
            partitions with ``crt_ts_date >= <that date>`` are loaded.
            Defaults to 365, the value that used to be hard-coded.

    Returns:
        A pandas DataFrame with the rows of the selected partitions.
    """
    # `json` is not imported at the top of this cell, so import it here;
    # without this the json.dumps call below raises NameError.
    import json

    # The credentials are serialised to a JSON string and handed to the Delta
    # reader as the service-account key.
    gcp_creds_json_str = json.dumps(get_gcp_creds_json())
    storage_options = {'service_account_key': gcp_creds_json_str}

    dt = DeltaTable(table_path, storage_options=storage_options)

    start_date = get_date_from_lookback(lookback_days=lookback_days)
    print(f"Reading data from {start_date}")

    # NOTE(review): the filter compares against a 'YYYY/MM/DD' string, so it
    # assumes the crt_ts_date partition values use that same format - confirm.
    df = dt.to_pandas(partitions=[("crt_ts_date", ">=", start_date)])
    return df


def create_target(df):
    """
    Create a target variable for modelling.

    The target is the number of minutes between the flight's landing time and
    the event time (``event_ts``). The landing time of a flight is the maximum
    ``actual_in`` over all rows sharing its ``fa_flight_id`` and is stored in
    ``actual_in_filled``.

    Args:
        df: DataFrame with columns ``fa_flight_id``, ``actual_in``,
            ``event_ts`` and ``crt_ts``. It is not modified.

    Returns:
        A new DataFrame with ``actual_in_filled`` and ``target`` added, sorted
        by ``actual_in_filled`` then ``crt_ts``.
    """
    # Latest recorded landing time per flight id.
    landing_times = df.groupby('fa_flight_id')['actual_in'].max()

    # Broadcast each flight's landing time onto all of its rows.
    actual_in_filled = df['fa_flight_id'].map(landing_times)

    # Minutes from the event to landing.
    target = (actual_in_filled - df['event_ts']).dt.total_seconds() / 60

    # assign() returns a new frame, so the caller's DataFrame is left intact.
    # The former `df.style.bar(...)` call was dropped: its Styler result was
    # discarded, so it had no effect on the returned data.
    with_target = df.assign(actual_in_filled=actual_in_filled, target=target)

    # Order rows by landing time, then by record creation time.
    return with_target.sort_values(by=['actual_in_filled', 'crt_ts'])

def remove_incomplete_flights(df):
    """Keep only rows whose ``actual_in_filled`` (landing time) is not null."""
    has_landing_time = df['actual_in_filled'].notna()
    return df[has_landing_time]

def removed_arrival_events(df):
    """Drop rows whose ``event_type`` is ``'actual_in'``.

    For modelling, only events that happen before landing are of interest.
    """
    is_arrival_event = df['event_type'] == 'actual_in'
    return df[~is_arrival_event]

# def style_datframe(df):
#     def highlight_scheduled_out(row):
#         if row['event_type'] == 'scheduled_out':
#             return ['background-color: lightgreen; color: black'] * len(row)
#         return [''] * len(row)

#     # Apply the styling
#     return df.style.apply(highlight_scheduled_out, axis=1)

# Destination that used to be hard-coded inside write_data_to_gcs.
_DEFAULT_BUCKET = 'datalake-flight-dev-1'
_DEFAULT_BLOB = 'training/flights-modelready1.csv'


def _split_gcs_path(path, default_bucket):
    """Split ``path`` into ``(bucket, object_name)``.

    ``gs://bucket/object`` URIs are split on the first ``/`` after the bucket;
    any other string is treated as an object name inside ``default_bucket``.
    """
    prefix = 'gs://'
    if path.startswith(prefix):
        bucket_name, _, blob_name = path[len(prefix):].partition('/')
        if not bucket_name or not blob_name:
            raise ValueError(f"Expected gs://<bucket>/<object>, got {path!r}")
        return bucket_name, blob_name
    return default_bucket, path


def write_data_to_gcs(df, bucket_name=_DEFAULT_BUCKET, blob_name=_DEFAULT_BLOB):
    """Upload ``df`` as CSV to a Google Cloud Storage object.

    Args:
        df: DataFrame to serialise with ``df.to_csv()`` (index included).
        bucket_name: Target bucket; defaults to the previously hard-coded one.
        blob_name: Target object name; defaults to the previously hard-coded one.
    """
    client = init_storage_client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_string(df.to_csv(), 'text/csv')


def create_training_data(path_out: str = _DEFAULT_BLOB):
    """Build the model-ready training set and upload it to GCS.

    Steps: read the preprocessed data, add the target, drop flights without a
    landing time, drop the arrival events, then write the result as CSV.

    Args:
        path_out: Where to write the CSV. Either a ``gs://bucket/object`` URI
            or an object name inside the default bucket. Previously this
            argument was required but ignored; it now defaults to the location
            that was always written to.

    Returns:
        The DataFrame that was uploaded.
    """
    df = read_data_preprocessed()
    df = create_target(df)
    df = remove_incomplete_flights(df)
    df = removed_arrival_events(df)

    bucket_name, blob_name = _split_gcs_path(path_out, _DEFAULT_BUCKET)
    write_data_to_gcs(df, bucket_name=bucket_name, blob_name=blob_name)
    return df


# create_training_data().count()