In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import boto3
import awswrangler

# Name of the S3 bucket that the S3 paths built in this notebook point at.
s3_bucket = "traffic-data-bucket"

In [None]:
# The credentials are imported from the `aws_secrets` module rather than being
# typed into the notebook.
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# One boto3 session, reused by every S3 call in the cells below.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Inspect which objects exist under the model-scoring prefix.
s3 = my_session.resource('s3')

# Use the shared `s3_bucket` setting instead of repeating the bucket name, so
# this listing targets the same bucket that the read paths are built from.
my_bucket = s3.Bucket(s3_bucket)

# The prefix plays the role of a folder name inside the bucket.
for file in my_bucket.objects.filter(Prefix='model_scoring/'):
    print(file.key)

In [None]:
s3 = my_session.resource('s3')

# Use the shared `s3_bucket` setting so the listing and the read path below
# always refer to the same bucket.
my_bucket = s3.Bucket(s3_bucket)

# Object keys that must not be loaded into the combined score table.
list_of_files_ignore = ['model_scoring/individual_model_scores/GBM_01.csv']

# Maps a running model number (1, 2, ...) to the DataFrame read from one
# scored-model CSV file.
scored_frame_dict = {}
model_number = 1
for file in my_bucket.objects.filter(Prefix='model_scoring/'):
    # A key ending in '/' is a "folder" placeholder object, if one is present
    # under the prefix; it holds no CSV data to parse, so skip it.
    if file.key.endswith('/'):
        continue
    if file.key in list_of_files_ignore:
        print(f'Skipping {file.key}')
        continue
    print(f'Loading {file.key}')
    this_model_df = awswrangler.s3.read_csv(
        path=f's3://{s3_bucket}/{file.key}',
        boto3_session=my_session,
        use_threads=True,
    )
    scored_frame_dict[model_number] = this_model_df
    model_number = model_number + 1

In [None]:
# Stack the per-model frames into one table and replace the index produced by
# concatenating a dict with a plain 0..n-1 range.
scored_df = pd.concat(scored_frame_dict).reset_index(drop=True)

# Show a few random rows as a quick sanity check.
scored_df.sample(8)

In [None]:
# Read the post-transformation model data set from the shared bucket.
model_data_path = f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet'
model_df = awswrangler.s3.read_parquet(
    path=model_data_path,
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
# Keep only the row identifiers, the target label and the split marker.
target_columns = ['hex_id', 'collision_date', 'collision_hour', 'target', 'ttv_split']
target_df = model_df[target_columns]

In [None]:
# Restrict the scored rows to the validation split.
is_validate_row = scored_df.ttv_split.eq('Validate')
scored_validate_df = scored_df[is_validate_row]

In [None]:
# Compare every scored model on the validation split: print each ROC AUC and
# draw all ROC curves, plus a no-skill baseline, on a single figure.
model_list = scored_validate_df.model_name.unique()
all_target = model_df[model_df.ttv_split == 'Validate']['target'].astype(int)

# Neutral (no-skill) score: the same constant for every validation row.
# The baseline needs exactly one score per label, so its length is taken from
# the label vector itself. Built-in `len` also replaces the `np.int` casts:
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24.
ns_score_length = len(all_target)
ns_probs = [0] * ns_score_length
ns_auc = roc_auc_score(all_target, ns_probs)
# Calculate the baseline ROC curve.
ns_fpr, ns_tpr, _ = roc_curve(all_target, ns_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))

# Initialize the plot with the baseline curve.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')

for this_model in model_list:
    this_scored_df = scored_validate_df[scored_validate_df.model_name == this_model]

    # Labels and predictions are paired purely by position, so every model
    # must supply exactly one prediction per validation row.
    # NOTE(review): this also assumes each model's rows are in the same order
    # as the validation rows of `model_df` - confirm against the scoring job.
    if len(this_scored_df) != len(all_target):
        raise ValueError(
            f'{this_model}: {len(this_scored_df)} validation predictions '
            f'but {len(all_target)} validation targets'
        )

    this_auc = roc_auc_score(all_target, this_scored_df['prediction'])
    print(this_model, ': ROC AUC=%.3f' % (this_auc))

    # Calculate this model's ROC curve and add it to the figure.
    this_fpr, this_tpr, _ = roc_curve(all_target, this_scored_df['prediction'])
    ax.plot(this_fpr, this_tpr, linestyle='-', label=this_model)

# Axis labels, title and legend so the figure can be read on its own.
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curves on the validation split')
ax.legend()
# Show the plot.
plt.show()