In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from datetime import datetime
import requests
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import warnings
import json
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import copy
from tmpFission import tempFissionStream, tempFissionCrashesFromStation, tempFissionWeather, tempMSearch, tempFissionCrashes, tempFissionStations, tmpFissionCrashes2

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas or
# scikit-learn further down will not be shown either.
warnings.filterwarnings("ignore")

# Base URL and extra request header used by get_full_data for every API call.
# NOTE(review): the address is hardcoded; presumably a cluster-internal Fission
# router - confirm, and consider reading it from the environment.
FISSION_URL = 'http://172.26.135.52:9090/'
FISSION_HEADERS = {'HOST': 'fission'}

In [3]:
def get_full_data(params):
    """Issue a GET against the Fission endpoint and return the decoded JSON body.

    Args:
        params: Path (relative to FISSION_URL) identifying the resource,
            e.g. ``'stations'`` or ``'crashes/<id>/<size>/<radius>'``.

    Returns:
        The parsed JSON payload, or ``None`` when the server answers with a
        non-200 status, the body is not valid JSON, or every attempt fails
        with a connection-level error.
    """
    max_retries = 3
    retry_delay = 5  # seconds
    timeout = 60  # seconds

    # FISSION_URL may already end with '/', so normalise both sides to avoid
    # sending a path that starts with '//'.
    url = f"{FISSION_URL.rstrip('/')}/{str(params).lstrip('/')}"

    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(url, headers=FISSION_HEADERS, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Connection error: {e}")
            # Only wait when another attempt is actually going to happen.
            if attempt < max_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            continue

        # HTTP-level failures are not retried; the body is printed for diagnosis.
        if res.status_code != 200:
            print(res.text)
            return None

        try:
            return json.loads(res.text)
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            print(f"Invalid JSON in response: {e}")
            return None

    print("Max retries exceeded. Unable to retrieve data.")
    return None

In [7]:

# Load the station list. Stop with a clear error when the call failed instead
# of printing a message and then failing on resp["Data"] a line later.
resp = get_full_data('stations')
if resp is None:
    raise RuntimeError("Failed to retrieve data.")
if resp.get('Status') != 200:
    raise RuntimeError(f"Failed to retrieve data. Status code: {resp.get('Status')}")

# Build the frame and drop the column that is not needed downstream.
stations = pd.DataFrame(resp["Data"]).drop(columns=['created_at'])
stations_copy = stations.copy(deep=True)
stations.head()

Unnamed: 0,Station ID,Station Name,location
0,3003,BROOME AIRPORT,"[122.2352, -17.9475]"
1,9965,BUNBURY,"[115.6447, -33.3567]"
2,39128,BUNDABERG AERO,"[152.323, -24.9069]"
3,23000,ADELAIDE (WEST TERRACE / NGAYIRDAPIRA),"[138.5832, -34.9257]"
4,87184,BREAKWATER (GEELONG RACECOURSE),"[144.3765, -38.1737]"


In [None]:

# # resp = get_full_data('stations')
# resp = tempFissionStations()
# if resp is None:
#     print("Failed to retrieve data.")
# elif resp['Status'] != 200:
#     print(f"Failed to retrieve data. Status code: {resp['Status']}")
# stations = pd.json_normalize(resp["Data"])
# # print(resp)
# # drop unnecessary columns
# stations = stations.drop(columns=['_index', '_id', '_source.created_at', '_score'])
# # rename columns _source.Station ID to Station ID, _source.Station Name to Station Name, _source.location Description to location
# stations = stations.rename(columns={"_source.Station ID": "Station ID", "_source.Station Name": "Station Name", "_source.location": "location"})
# #convert location array to 2 columns
# stations = pd.concat([stations, stations['location'].apply(pd.Series)], axis=1)
# # drop location column
# stations = stations.drop(columns=['location'])
# # rename columns 0 to longitude, 1 to latitude
# stations = stations.rename(columns={0: "longitude", 1: "latitude"})
# stations_copy = stations.copy(deep=True)

In [5]:
# Preview the first rows of the station table that the crash-download loop iterates over.
stations_copy.head()

Unnamed: 0,Station ID,Station Name,created_at,location
0,3003,BROOME AIRPORT,2024-05-09T12:08:39.890088161Z,"[122.2352, -17.9475]"
1,9965,BUNBURY,2024-05-09T12:08:39.890202055Z,"[115.6447, -33.3567]"
2,39128,BUNDABERG AERO,2024-05-09T12:08:39.890265394Z,"[152.323, -24.9069]"
3,23000,ADELAIDE (WEST TERRACE / NGAYIRDAPIRA),2024-05-09T12:08:39.876975237Z,"[138.5832, -34.9257]"
4,87184,BREAKWATER (GEELONG RACECOURSE),2024-05-09T12:08:39.889062773Z,"[144.3765, -38.1737]"


In [47]:
# Download the crashes around every station, following the API's continuation
# token until it reports "END".
size = 3000  # number of crashes to get
radius = 10  # radius in kms
crashes = {}
empty_stations = []  # stations whose first response carried nothing usable
error_stations = []  # stations whose request failed or returned a malformed payload
dfs = []

# Only the station id is needed, so iterate the column rather than whole rows.
for station_id in stations_copy['Station ID']:
    crashes[station_id] = crashes.get(station_id, [])
    resp = get_full_data(f"crashes/{station_id}/{size}/{radius}")

    if resp is None or any(key not in resp for key in ("Data", "Token", "Status")):
        error_stations.append(station_id)
        continue
    # NOTE(review): a response whose Token is already "END" is treated as
    # carrying no rows, both here and in the paging loop below - confirm that
    # the API never returns data together with the "END" token.
    if resp["Data"] == [] or resp["Token"] == "END" or resp["Status"] != 200:
        empty_stations.append(station_id)
        continue

    while True:
        temp = pd.json_normalize(resp["Data"])
        temp['Station ID'] = station_id
        dfs.append(temp)

        resp = get_full_data(f"stream/{resp['Token']}")
        if not resp or "Token" not in resp or "Data" not in resp:
            # The stream broke part-way: keep the pages already collected but
            # record the station so the truncation is visible afterwards.
            error_stations.append(station_id)
            break
        if resp["Token"] == "END":
            break

if not dfs:
    raise ValueError(
        "No crash data retrieved for any station "
        f"({len(error_stations)} errors, {len(empty_stations)} empty)."
    )

crashes_df = pd.concat(dfs, ignore_index=True)
crashes_df

In [50]:
# Clean the raw crash records in one pipeline, working on a deep copy so that
# crashes_df itself is left untouched.
crashes_df_copy = (
    crashes_df.copy(deep=True)
    # search-engine bookkeeping columns are not needed
    .drop(columns=['_index', '_id', '_score'])
    # strip the "_source." prefix from the payload fields
    .rename(columns={
        "_source.light_condition": "light_condition",
        "_source.crash_date": "crash_date",
        "_source.severity": "severity",
        "_source.location": "location",
    })
    # keep only the calendar day, rendered as DD/MM/YYYY text
    .assign(crash_date=lambda frame: pd.to_datetime(frame['crash_date']).dt.strftime('%d/%m/%Y'))
    # severity == -1 marks a missing value
    .loc[lambda frame: frame['severity'] != -1]
    # expand the location array into two extra columns (named 0 and 1)
    .pipe(lambda frame: pd.concat([frame, frame['location'].apply(pd.Series)], axis=1))
    # element 0 is stored as longitude, element 1 as latitude
    .rename(columns={0: "longitude", 1: "latitude"})
    .astype({'severity': int, 'Station ID': str})
)
crashes_df_copy


Unnamed: 0,light_condition,crash_date,severity,location,Station ID,longitude,latitude
0,Daylight,04/02/2014,1,"[146.996014259, -41.6504384561]",91375,146.996014,-41.650438
1,Daylight,27/09/2015,3,"[147.149224535, -41.6564154046]",91375,147.149225,-41.656415
2,Daylight,09/06/2015,0,"[147.105201354, -41.6237607582]",91375,147.105201,-41.623761
3,Daylight,06/03/2016,0,"[147.09064093, -41.7190137679]",91375,147.090641,-41.719014
4,Darkness (without street light),18/05/2016,3,"[147.084898847, -41.734885865]",91375,147.084899,-41.734886
...,...,...,...,...,...,...,...
67232,Daylight,25/11/2010,2,"[146.750729075, -43.0964443765]",97024,146.750729,-43.096444
67233,Daylight,24/03/2011,1,"[146.805976869, -43.0477734284]",97024,146.805977,-43.047773
67234,Darkness (without street light),27/08/2010,1,"[146.811145064, -43.0518765608]",97024,146.811145,-43.051877
67235,Darkness (without street light),01/09/2018,1,"[146.818281888, -43.0569031809]",97024,146.818282,-43.056903


In [56]:
# Number of crashes recorded for each (date, station) pair, as a flat table
# with the tally in a 'count' column.
crashes_per_day = (
    crashes_df_copy
    .groupby(['crash_date', "Station ID"])
    .size()
    .reset_index(name='count')
)
crashes_per_day


Unnamed: 0,crash_date,Station ID,count
0,01/01/2010,91237,2
1,01/01/2010,91306,1
2,01/01/2010,91375,1
3,01/01/2010,92120,1
4,01/01/2010,94029,1
...,...,...,...
18739,31/12/2019,91292,1
18740,31/12/2019,94008,1
18741,31/12/2019,94029,6
18742,31/12/2019,94087,5


In [60]:

# First and last crash year across all stations. The values are computed here
# so the cell does not depend on min_date/max_date lingering in the kernel
# from an earlier run.
crash_years = pd.to_datetime(crashes_per_day['crash_date'], dayfirst=True).dt.year
min_date = int(crash_years.min())
max_date = int(crash_years.max())
print(min_date)
print(max_date)

2010
2020


In [80]:
def get_min_max_dates(station_id, crashes=None):
    """Return the first and last calendar year in which a station has crashes.

    Args:
        station_id: Value to match against the 'Station ID' column.
        crashes: Frame with 'Station ID' and 'crash_date' columns. Defaults to
            the notebook-level ``crashes_per_day``.

    Returns:
        ``(min_year, max_year)`` as plain ints.

    Raises:
        ValueError: If the station has no rows, since there is then no year
            range to report.
    """
    if crashes is None:
        crashes = crashes_per_day

    station_dates = crashes.loc[crashes['Station ID'] == station_id, 'crash_date']
    if station_dates.empty:
        raise ValueError(f"No crash records for station {station_id!r}")

    # Accept both already-parsed datetimes and day-first date strings, so the
    # function does not depend on which cell converted the column first.
    station_dates = pd.to_datetime(station_dates, dayfirst=True)
    return int(station_dates.min().year), int(station_dates.max().year)


In [81]:
# Parse the day-first crash dates so that get_min_max_dates can read years from them.
crashes_per_day['crash_date'] = pd.to_datetime(crashes_per_day['crash_date'], dayfirst=True)

# Fetch weather from the API, one request per station, covering the years in
# which that station has crashes.
weather_data = []
for station in crashes_per_day['Station ID'].unique():
    min_date, max_date = get_min_max_dates(station)
    params = f"weather/{station}/{min_date}/{max_date}"
    resp = get_full_data(params)

    # Request failed or the payload is missing an expected key.
    if resp is None or not ("Data" in resp and "Status" in resp):
        error_stations.append(station)
        continue

    # Well-formed reply, but nothing usable in it.
    if not (resp["Data"] != [] and resp["Status"] == 200):
        empty_stations.append(station)
        continue

    temp = pd.json_normalize(resp["Data"]).assign(**{'Station ID': station})
    weather_data.append(temp)

weather_df = pd.concat(weather_data, ignore_index=True)
weather_df

2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2020
2010
2019
2010
2020
2010
2019
2010
2020
2010
2020
2015
2015
2018
2018


Unnamed: 0,UV,Max Humid,created_at,Min Temp,WindSpeed,Min Humid,source,Station Name,Date,Rain,Pan-Rain,Max Temp,Evapo-Rain,Station ID
0,33.91,86,2024-05-14T02:32:04.335597956Z,19.2,4.78,31,launceston_(ti_tree_bend),LAUNCESTON (TI TREE BEND),01/01/2010,0.0,-1.0,29.7,8.0,91237
1,31.15,93,2024-05-14T02:32:04.335603015Z,14.4,5.13,25,launceston_(ti_tree_bend),LAUNCESTON (TI TREE BEND),02/01/2010,0.6,-1.0,24.3,6.7,91237
2,32.97,74,2024-05-14T02:32:04.335608696Z,9.6,3.28,30,launceston_(ti_tree_bend),LAUNCESTON (TI TREE BEND),03/01/2010,0.0,-1.0,24.8,6.3,91237
3,33.80,85,2024-05-14T02:32:04.335613815Z,8.7,4.42,51,launceston_(ti_tree_bend),LAUNCESTON (TI TREE BEND),04/01/2010,0.0,-1.0,22.0,5.5,91237
4,31.38,89,2024-05-14T02:32:04.335618895Z,13.7,4.94,46,launceston_(ti_tree_bend),LAUNCESTON (TI TREE BEND),05/01/2010,0.0,-1.0,22.7,5.7,91237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79056,22.54,99,2024-05-14T02:31:14.730637237Z,12.6,6.25,29,king_island_airport,KING ISLAND AIRPORT,30/12/2019,0.0,-1.0,34.1,8.0,98017
79057,29.39,82,2024-05-14T02:31:14.730642256Z,13.0,7.02,55,king_island_airport,KING ISLAND AIRPORT,31/12/2019,3.0,-1.0,20.0,5.1,98017
79058,30.38,99,2024-05-14T02:31:14.730647225Z,13.3,7.60,67,king_island_airport,KING ISLAND AIRPORT,01/01/2020,0.0,-1.0,20.9,4.4,98017
79059,25.63,98,2024-05-14T02:38:46.553731029Z,8.6,9.34,48,scotts_peak_dam,SCOTTS PEAK DAM,01/01/2015,5.6,-1.0,16.3,4.1,97083


In [83]:

# Clean the raw weather records on a deep copy of weather_df: drop unused
# columns, fix dtypes, then discard rows holding placeholder values.
weather_df_copy = (
    weather_df.copy(deep=True)
    .drop(['created_at', 'source', 'Station Name', 'Pan-Rain'], axis=1)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y'))
    .astype({
        'Evapo-Rain': float,
        'Rain': float,
        'Max Temp': float,
        'Min Temp': float,
        'Max Humid': int,
        'Min Humid': int,
        'WindSpeed': float,
        'UV': float,
        'Station ID': str,
    })
    # -999 marks a missing temperature; -1 marks a missing rain or humidity reading
    .loc[lambda frame: (
        frame[['Min Temp', 'Max Temp']].ne(-999.0).all(axis=1)
        & frame[['Rain', 'Evapo-Rain', 'Max Humid', 'Min Humid']].ne(-1.0).all(axis=1)
    )]
)
weather_df_copy


Unnamed: 0,UV,Max Humid,Min Temp,WindSpeed,Min Humid,Date,Rain,Max Temp,Evapo-Rain,Station ID
0,33.91,86,19.2,4.78,31,2010-01-01,0.0,29.7,8.0,91237
1,31.15,93,14.4,5.13,25,2010-01-02,0.6,24.3,6.7,91237
2,32.97,74,9.6,3.28,30,2010-01-03,0.0,24.8,6.3,91237
3,33.80,85,8.7,4.42,51,2010-01-04,0.0,22.0,5.5,91237
4,31.38,89,13.7,4.94,46,2010-01-05,0.0,22.7,5.7,91237
...,...,...,...,...,...,...,...,...,...,...
79056,22.54,99,12.6,6.25,29,2019-12-30,0.0,34.1,8.0,98017
79057,29.39,82,13.0,7.02,55,2019-12-31,3.0,20.0,5.1,98017
79058,30.38,99,13.3,7.60,67,2020-01-01,0.0,20.9,4.4,98017
79059,25.63,98,8.6,9.34,48,2015-01-01,5.6,16.3,4.1,97083


In [85]:
# Pair each (station, day) crash count with that station's weather on the same
# day, then keep only the count and the weather measurements. Station ID is
# dropped along with the two date columns, so no label encoding is applied.
merged_df = pd.merge(
    crashes_per_day,
    weather_df_copy,
    how='inner',
    left_on=['Station ID', 'crash_date'],
    right_on=['Station ID', 'Date'],
).drop(columns=['Date', 'crash_date', 'Station ID'])
merged_df

Unnamed: 0,count,UV,Max Humid,Min Temp,WindSpeed,Min Humid,Rain,Max Temp,Evapo-Rain
0,2,33.91,86,19.2,4.78,31,0.0,29.7,8.0
1,1,32.98,81,18.7,4.36,17,0.0,32.7,8.8
2,1,30.86,84,21.0,4.40,29,0.4,29.8,7.6
3,1,34.13,96,14.9,4.59,33,0.0,25.5,6.8
4,3,32.19,87,13.3,6.70,22,0.0,24.1,7.3
...,...,...,...,...,...,...,...,...,...
12094,1,31.06,94,14.8,3.28,28,6.4,24.8,6.2
12095,1,28.10,93,9.3,6.22,47,1.0,20.0,4.9
12096,6,27.39,81,13.2,6.08,26,3.4,23.4,6.6
12097,5,27.94,94,2.3,11.17,54,5.6,10.7,3.3


In [86]:

# Restrict to numeric columns, then rank every column by its correlation with
# the daily crash 'count' (ascending).
numerical_columns = merged_df.select_dtypes(include=[np.number])
correlation = numerical_columns.corr().loc[:, 'count'].sort_values()

print(correlation)

Max Humid    -0.182762
Max Temp     -0.110191
Min Humid    -0.071759
UV           -0.068066
Min Temp     -0.013355
Rain         -0.007802
Evapo-Rain    0.015368
WindSpeed     0.170793
count         1.000000
Name: count, dtype: float64


In [None]:
# Split the data: 30% held out for evaluation, seeded for reproducibility.
# The target is the daily crash 'count'; merged_df has no 'severity' column
# because it was built from the per-day aggregation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = merged_df.drop(columns='count')
y = merged_df['count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: ordinary least squares on the weather features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mse



In [None]:
# R^2 of the predictions currently held in y_pred against the held-out targets.
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)

In [None]:
# Display the R^2 score stored in r2.
r2

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

# Fit a default logistic regression and score its held-out predictions with MSE.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

# Seed the estimator so that re-running the cell reproduces the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mse

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# Seed the estimator so that re-running the cell reproduces the same forest.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mse

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error

# Fit a default support-vector classifier and score its held-out predictions with MSE.
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error

# Fit a default k-nearest-neighbours classifier and score its held-out predictions with MSE.
model = KNeighborsClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error

# Seed the estimator so that re-running the cell reproduces the same ensemble.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mse

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error

# Seed the estimator so that weight initialisation, and therefore the fitted
# network, is the same on every run.
model = MLPClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mse

In [None]:
# Rebuild the modelling table: join daily crash counts to the cleaned weather
# frame (weather_df_copy, whose 'Date' column is already parsed) on station and day.
merged_df = crashes_per_day.merge(
    weather_df_copy,
    left_on=['Station ID', 'crash_date'],
    right_on=['Station ID', 'Date'],
    how='inner',
)
# Drop the join keys so that only the count and the weather measurements remain.
merged_df = merged_df.drop(['Date', 'crash_date', 'Station ID'], axis=1)
merged_df

In [None]:
# Select only the numerical columns
numerical_columns = merged_df.select_dtypes(include=[np.number])

# Correlation of every numerical column with the daily crash 'count'
correlation = numerical_columns.corr()['count'].sort_values()

# Print the full correlation matrix, computed on the numeric selection so that
# a non-numeric column in merged_df cannot make corr() fail.
print(numerical_columns.corr())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numerical_columns.corr(), annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
from scipy import stats

# Two illustrative samples: 1..5 and the same values shifted up by one.
obs1 = list(range(1, 6))
obs2 = [value + 1 for value in obs1]

# Independent two-sample t-test; keep both the statistic and the p-value.
ttest_result = stats.ttest_ind(obs1, obs2)
t_stat = ttest_result.statistic
p_value = ttest_result.pvalue

print("P-value: ", p_value)