In [9]:
import pandas as pd
import numpy as np
import random as rd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
import datetime
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
folder = "../00_Raw_Data/"

# Generate filenames from 2015_Q1 to 2022_Q4
years = range(2015, 2023)
quarters = range(1, 5)

files = ["{}_Q{}_Traffic_Crashes.csv".format(year, quarter) for year in years for quarter in quarters]

dfs = [pd.read_csv(folder + file) for file in files]
df = pd.concat(dfs, ignore_index=True)

In [3]:
df['collision_date'] = pd.to_datetime(df['collision_date'], format='%m/%d/%Y %I:%M:%S %p')
df['date'] = df['collision_date'].dt.date

df['day_of_week'] = df['collision_date'].dt.dayofweek # 0: Monday - 6: Sunday
df['weekend'] = df['day_of_week'].apply(lambda x: 1 if x in [5, 6] else 0)

df['time'] = df['collision_date'].dt.time
df['hour'] = df['collision_date'].dt.hour
df['day_of_month'] = df['collision_date'].dt.day
df['month'] = df['collision_date'].dt.month
df['year'] = df['collision_date'].dt.year
df['covid'] = (df['date'] >= pd.Timestamp('2020-03-15').date()).astype(int)

def time_window(hour):
    if 23 <= hour or hour < 5:
        return 'late night'
    elif 5 <= hour < 11:
        return 'morning'
    elif 11 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 23:
        return 'night'
    else:
        return 'unknown' 
df['time_window'] = df['hour'].apply(time_window)

columns_to_convert = ['pedestrian', 'bicycle', 'scooter', 'hitrun', 'parking']

for column in columns_to_convert:
    df[column] = df[column].map({'yes': 1, 'no': 0})
    
    
df['manner_of_crash'] = df['manner_of_crash'].fillna('Unknown/Missing')
df['manner_of_crash'] = df['manner_of_crash'].replace(['Missing', 'Unknown'], 'Unknown/Missing')
crash_onehot = pd.get_dummies(df['manner_of_crash'], prefix='crash')
df = pd.concat([df, crash_onehot], axis=1)
df.drop('manner_of_crash', axis=1, inplace=True)

df.drop('roadway_name', axis=1, inplace=True)

df['intersect_type'].replace(['OTHER', 'Missing', 'Unknown'], 'Unknown/Other', inplace=True)
intersect_type_onehot = pd.get_dummies(df['intersect_type'], prefix='intersect_type')
df = pd.concat([df, intersect_type_onehot], axis=1)
df.drop('intersect_type', axis=1, inplace=True)

df['relation_to_junction'].replace(['Missing', 'OtherLocation', 'Unknown'], 'Unknown/Other', inplace=True)
relation_onehot = pd.get_dummies(df['relation_to_junction'], prefix='relation_to_junction')
df = pd.concat([df, relation_onehot], axis=1)
df.drop('relation_to_junction', axis=1, inplace=True)

df.drop('city', axis=1, inplace=True)

df['intersection_indicator'] = df['intersection_indicator'].fillna("N")
df['intersection_indicator'].replace("Missing", "N", inplace=True)
mapping = {"N": 0, "Y": 1}
df['intersection_indicator'] = df['intersection_indicator'].map(mapping)
df['intersection_indicator'] = df['intersection_indicator'].astype('int64')

# Combine "Campus" and "OHO" categories
df['mou'] = df['mou'].replace('OHO', 'Campus')

# Now, encode "Campus" as 1 and "False" as 0
df['mou'] = df['mou'].apply(lambda x: 1 if x == 'Campus' else 0)

df['work_zone_type'] = df['work_zone_type'].apply(lambda x: 0 if x == "Missing" or x == "Unknown" or x is None else 1)


def simplify_weather(weather):
    if pd.isna(weather):
        return 'Unknown/Other'
    elif 'Snow' in weather:
        return 'Snow'
    elif 'Sleet/Hail' in weather:
        return 'Sleet/Hail'
    elif 'Rain' in weather:
        return 'Rain'
    elif weather in ['Fog', 'Smoke', 'Smog']:
        return 'Fog'
    elif 'Cloudy' in weather:
        return 'Cloudy'
    elif weather == 'Clear':
        return 'Clear'
    elif weather in ['Missing', 'Unknown', 'Other']:
        return 'Unknown/Other'
    else:
        return 'Other Conditions'

df['simplified_weather'] = df['weather_condition(s)'].apply(simplify_weather)
df = pd.get_dummies(df, columns=['simplified_weather'], prefix='weather')
df.drop('weather_condition(s)', axis=1, inplace=True)

def bool_to_int(df):
    for col in df.select_dtypes(['bool']).columns:
        df[col] = df[col].astype(int)
    return df

df = bool_to_int(df)

columns_to_scale = ['latitude', 'longitude', 'vehicles_involved', 'number_injured', 'number_dead', 'year']
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df['time'] = df['time'].astype(str)
df['time'] = df['time'].str.strip()
has_time_df = df[df['time'] != "00:00:00"]

columns_to_drop = [
    "master_record_number", "agency", "agency_tracking_number", "county", 
    "collision_date", "roadway_suffix", "roadway_number", "roadway_local_id", 
    "distance_from_reference", "miles-feet_indicator", "direction_from_reference",
    "intersection_road_name", "intersection_road_name_suffix", "intersection_road_number",
    "intersection_local_id", "mile_marker", "interchange_related_indicator", 
    "construction_maintenance_zone", "construction_maintenance_zone_location",
    "fatal_case_number", "date", "officer_first_name", "officer_last_name", "time"
]

has_time_df = has_time_df.drop(columns=columns_to_drop)

In [4]:
has_time_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262773 entries, 0 to 279706
Data columns (total 57 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   vehicles_involved                                    262773 non-null  float64
 1   number_injured                                       262773 non-null  float64
 2   number_dead                                          262773 non-null  float64
 3   latitude                                             262773 non-null  float64
 4   longitude                                            262773 non-null  float64
 5   intersection_indicator                               262773 non-null  int64  
 6   work_zone_type                                       262773 non-null  int64  
 7   pedestrian                                           262773 non-null  int64  
 8   bicycle                                              2

In [12]:
columns_to_keep = [
    'latitude', 'longitude', 'year', 'covid', 'number_injured',
    'intersection_indicator', 'month', 'number_dead', 'vehicles_involved',
    'parking', 'hitrun', 'day_of_week','crash_Angle','day_of_month', 'hour',
    'day_of_week'
]

# Create a new DataFrame with the specified columns
top_PC_df = has_time_df[columns_to_keep]

In [13]:
correlation_matrix = top_PC_df.corr()

# Create a heatmap using Plotly
fig = px.imshow(correlation_matrix)
fig.update_layout(title="Correlation Matrix Heatmap")
fig.show()

In [15]:
top_PC_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262773 entries, 0 to 279706
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   latitude                262773 non-null  float64
 1   longitude               262773 non-null  float64
 2   year                    262773 non-null  float64
 3   covid                   262773 non-null  int64  
 4   number_injured          262773 non-null  float64
 5   intersection_indicator  262773 non-null  int64  
 6   month                   262773 non-null  int64  
 7   number_dead             262773 non-null  float64
 8   vehicles_involved       262773 non-null  float64
 9   parking                 262773 non-null  int64  
 10  hitrun                  262773 non-null  int64  
 11  day_of_week             262773 non-null  int64  
 12  crash_Angle             262773 non-null  uint8  
 13  day_of_month            262773 non-null  int64  
 14  hour                