### Imports

In [1]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2

# Load Data

In [2]:
# All four tables live in the same SQLite database file.
DB_URI = 'sqlite:///data.sqlite'

# Read the tables in a fixed order and unpack them into the *_raw names.
weatherData_raw, crashData2017_raw, crashData2018_raw, crashData2019_raw = (
    pd.read_sql_table(table_name, DB_URI)
    for table_name in ('WeatherData', 'CrashData2017', 'CrashData2018', 'CrashData2019')
)

# Work on a copy so the raw weather frame stays untouched.
weatherData = weatherData_raw.copy()

# Filter CrashData to match time frame of WeatherData

In [3]:
# Row masks on the UMONAT column: 2017 keeps only value 12, 2019 keeps
# everything except value 12.
is_month_12_2017 = crashData2017_raw['UMONAT'].eq(12)
is_not_month_12_2019 = crashData2019_raw['UMONAT'].ne(12)

crashData2017 = crashData2017_raw.loc[is_month_12_2017]
crashData2019 = crashData2019_raw.loc[is_not_month_12_2019]
# 2018 is used in full, as an independent copy of the raw frame.
crashData2018 = crashData2018_raw.copy()

# Match coordinates of CrashData to WeatherData to select only relevant CrashData

In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist


def calculate_distance(coords1, coords2, radius=6371000):
    """Return the haversine (great-circle) distance between coordinate sets.

    Parameters
    ----------
    coords1, coords2 : array-like, shape (..., 2)
        Points given as ``(latitude, longitude)`` in degrees along the last
        axis. The leading shapes must broadcast against each other, e.g.
        ``(1, 2)`` against ``(M, 2)`` yields ``M`` distances.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to the radius of the Earth in meters.

    Returns
    -------
    numpy.ndarray
        Distances with the broadcast shape of the inputs' leading axes.
    """
    # Accept lists/tuples as well as arrays.
    coords1 = np.asarray(coords1, dtype=float)
    coords2 = np.asarray(coords2, dtype=float)

    # Convert latitudes and longitudes to radians. Indexing with ``...``
    # behaves like ``[:, 0]`` for 2-D input but also allows a single pair.
    lat1_rad = np.radians(coords1[..., 0])
    lon1_rad = np.radians(coords1[..., 1])
    lat2_rad = np.radians(coords2[..., 0])
    lon2_rad = np.radians(coords2[..., 1])

    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push ``a`` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

# Maximum distance in meters between a crash and its nearest weather
# coordinate for the crash to count as a match (adjust as needed).
threshold_distance = 500

# Pull both coordinate sets out of the DataFrames once: the weather array is
# loop-invariant, and iterating a plain array avoids building a Series per row.
weather_coords = weatherData[['Latitude', 'Longitude']].to_numpy()
crash_coords = crashData2017[['Latitude', 'Longitude']].to_numpy()

# distances[i, j] is the distance from crash i to weather point j.
# Pre-allocating (instead of concatenating a list of rows) also works when
# crashData2017 has no rows, where np.concatenate([]) would raise ValueError.
distances = np.empty((len(crash_coords), len(weather_coords)))
for i, crash_coord in enumerate(crash_coords):
    distances[i] = calculate_distance(crash_coord.reshape(1, 2), weather_coords)

# Keep the crashes whose closest weather point lies within the threshold.
crashData2017_nearby = crashData2017[np.min(distances, axis=1) <= threshold_distance]

crashData2017_nearby.head()

Unnamed: 0,OBJECTID,UIDENTSTLA,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,...,IstPKW,IstFuss,IstKrad,IstSonstig,LICHT,STRZUSTAND,LINREFX,LINREFY,Longitude,Latitude
8294,8295,2171220000044974592,2,1,15,134,2017,12,17,4,...,1,0,0,0,2,1,567858.2509,5930936.0,10.023571,53.522967
8425,8426,2171212000046966784,2,7,2,703,2017,12,8,3,...,1,0,0,1,0,1,568087.2875,5924317.0,10.025588,53.463445
24881,24882,3171203413796699136,3,3,53,34,2017,12,8,1,...,1,0,0,0,0,2,571962.8975,5904397.0,10.079419,53.283911
24882,24883,3171203413796695552,3,3,53,34,2017,12,6,1,...,1,0,0,0,2,2,571928.8006,5904574.0,10.078947,53.285502
24902,24903,3171203213301707264,3,1,53,12,2017,12,16,1,...,1,0,0,0,2,1,578338.9026,5756940.0,10.140068,51.957632


In [6]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"

# Scatter the weatherData coordinates on a map; points are colored and
# labelled on hover by the "Strecke" column.
weather_map_options = {
    "lat": 'Latitude',
    "lon": 'Longitude',
    "hover_name": "Strecke",
    "color": "Strecke",
    "zoom": 5,
    "height": 800,
    "width": 1200,
}
fig = px.scatter_mapbox(weatherData, **weather_map_options)

# OpenStreetMap tiles, no margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [7]:
# Row counts: crashes kept by the proximity filter vs. all rows in crashData2017.
crashData2017_nearby.shape[0], crashData2017.shape[0]

(479, 13881)

In [8]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"

# Scatter the rows of crashData2017_nearby on a map.
nearby_map_options = {
    "lat": 'Latitude',
    "lon": 'Longitude',
    "zoom": 5,
    "height": 800,
    "width": 1200,
}
fig = px.scatter_mapbox(crashData2017_nearby, **nearby_map_options)

# OpenStreetMap tiles, no margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()