In [5]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
df = pd.read_csv('../03_Data_for_Modeling/data_with_hour_for_clustering.csv')

In [4]:
df.sample(4)

Unnamed: 0,master_record_number,city,vehicles_involved,number_injured,number_dead,latitude,longitude,roadway_name,intersection_indicator,intersect_type,relation_to_junction,work_zone_type,weather_condition(s),manner_of_crash,pedestrian,bicycle,scooter,parking,hitrun,mou,day_of_week,weekend,hour,month,year,covid,time_window
46404,101180121,13,2,0,0,36.15129,-86.75095,3337,0,2,6,1,0,1,0,0,0,0,0,0,5,1,18,4,2016,0,night
28410,100996456,12,1,1,0,36.27339,-86.72341,4617,0,6,6,1,4,3,0,0,0,0,0,0,0,0,10,10,2015,0,morning
187159,102645371,13,3,0,0,36.14183,-86.73473,3337,0,2,6,1,0,1,0,0,0,0,1,0,3,0,18,2,2020,0,night
8371,100801895,13,2,0,0,36.15317,-86.72157,5911,0,2,6,1,1,8,0,0,0,0,0,0,4,0,11,4,2015,0,afternoon


In [6]:
train_val, test = train_test_split(df, test_size=0.1, stratify=df['time_window'], random_state=42)
train, val = train_test_split(train_val, test_size=2/9, stratify=train_val['time_window'], random_state=42) 

tree = BallTree(train[['latitude', 'longitude']].values)

# Function to calculate average hour of the four closest neighbors from the training set
def calculate_avg_hour_of_nearest(train_tree, train_data, point, k=4):
    dist, ind = train_tree.query(point, k=k)
    if len(train_data.iloc[ind[0]]['hour']) < k:  # In case there are fewer than k neighbors
        return np.nan
    return train_data.iloc[ind[0]]['hour'].mean()

# Calculate the 'avg_hour_4_neigh' for each point in the entire dataset
df['avg_hour_4_neigh'] = df.apply(
    lambda row: calculate_avg_hour_of_nearest(
        tree, 
        train[['hour']], 
        np.array([[row['latitude'], row['longitude']]]),
    ), 
    axis=1
)

In [11]:
# Make sure 'master_record_number' is set as the index for the merge operation if it's not already the index
df_with_avg_hour = df.set_index('master_record_number')[['avg_hour_4_neigh']]

# Merge 'avg_hour_4_neigh' into the original train, validation, and test sets using 'master_record_number'
train_with_avg_hour = train.set_index('master_record_number').join(df_with_avg_hour)
val_with_avg_hour = val.set_index('master_record_number').join(df_with_avg_hour)
test_with_avg_hour = test.set_index('master_record_number').join(df_with_avg_hour)

# Reset the index if required to bring 'master_record_number' back to columns
train_with_avg_hour = train_with_avg_hour.reset_index()
val_with_avg_hour = val_with_avg_hour.reset_index()
test_with_avg_hour = test_with_avg_hour.reset_index()

In [23]:
def time_window_from_hour(hour):
    if 23 <= hour or hour < 5:
        return 'late night'
    elif 5 <= hour < 11:
        return 'morning'
    elif 11 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 23:
        return 'night'
    else:
        return 'unknown' 
def compare_time_windows(df):
    # Create a temporary column for the calculated 'time_window' from 'avg_hour_4_neigh'
    df['calculated_time_window'] = df['avg_hour_4_neigh'].apply(time_window_from_hour)
    
    # Compare the 'calculated_time_window' with the existing 'time_window' column
    matches = df[df['calculated_time_window'] == df['time_window']]
    mismatches = df[df['calculated_time_window'] != df['time_window']]
    
    # Print out the number of matches and mismatches
    print(f"Number of matches: {len(matches)}")
    print(f"Number of mismatches: {len(mismatches)}")
    
    # Optionally, remove the temporary column to clean up the DataFrame
    df.drop('calculated_time_window', axis=1, inplace=True)
    
    # Return the counts in case they need to be used later
    return len(matches), len(mismatches)

In [26]:
matches, mismatches = compare_time_windows(test_with_avg_hour)

Number of matches: 9890
Number of mismatches: 16388


In [27]:
def compare_hours(df):
    # Calculate the absolute difference between 'avg_hour_4_neigh' and 'hour'
    df['hour_difference'] = (df['avg_hour_4_neigh'] - df['hour']).abs()
    
    # Calculate the average difference
    average_difference = df['hour_difference'].mean()
    
    # Print out the average difference
    print(f"The average hour difference is: {average_difference}")
    
    # Optionally, you can drop the 'hour_difference' column if it was only needed for this calculation
    df.drop('hour_difference', axis=1, inplace=True)
    
    # Return the average difference in case it needs to be used later
    return average_difference


In [30]:
average_diff = compare_hours(test_with_avg_hour)

The average hour difference is: 4.613117436639014


In [31]:
train_with_avg_hour.sample(5)

Unnamed: 0,master_record_number,city,vehicles_involved,number_injured,number_dead,latitude,longitude,roadway_name,intersection_indicator,intersect_type,relation_to_junction,work_zone_type,weather_condition(s),manner_of_crash,pedestrian,bicycle,scooter,parking,hitrun,mou,day_of_week,weekend,hour,month,year,covid,time_window,avg_hour_4_neigh
16580,102306916,13,2,0,0,36.0947,-86.70711,3330,0,2,6,1,4,1,0,0,0,0,0,0,4,0,15,2,2019,0,afternoon,11.75
166620,103402461,13,2,0,0,36.16754,-86.66289,3830,0,2,6,1,0,0,0,0,0,0,1,0,2,0,0,8,2022,1,late night,11.0
13120,101991678,13,2,0,0,36.18176,-86.72559,2157,1,1,5,1,0,0,0,0,0,0,0,0,4,0,18,4,2018,0,night,18.0
31122,101685835,13,2,0,0,36.03779,-86.78475,3427,1,6,6,1,0,8,0,0,0,0,0,0,2,0,16,6,2017,0,afternoon,12.75
130131,101127195,13,3,0,0,36.13334,-86.72713,3330,0,2,6,1,0,1,0,0,0,0,0,0,3,0,21,3,2016,0,night,19.25


In [32]:
# Drop 'hour' and 'master_record_number' from train_with_avg_hour
train_with_avg_hour = train_with_avg_hour.drop(['hour', 'master_record_number'], axis=1)

# Drop 'hour' and 'master_record_number' from val_with_avg_hour
val_with_avg_hour = val_with_avg_hour.drop(['hour', 'master_record_number'], axis=1)

# Drop 'hour' and 'master_record_number' from test_with_avg_hour
test_with_avg_hour = test_with_avg_hour.drop(['hour', 'master_record_number'], axis=1)

In [33]:
train_with_avg_hour.to_csv("../03_Data_for_Modeling/train_with_avg_hour_from_4_nearest_neigh.csv", index=False)
val_with_avg_hour.to_csv("../03_Data_for_Modeling/valid_with_avg_hour_from_4_nearest_neigh.csv", index=False)
test_with_avg_hour.to_csv("../03_Data_for_Modeling/test_with_avg_hour_from_4_nearest_neigh.csv", index=False)