Module - CIS7017 Dissertation
Student ID - #20275320

## Data collection

In [2]:
# Import all relevant libraries
import pandas as pd
import requests
from tqdm import tqdm

In [3]:
# Read the raw accidents CSV into a DataFrame
RAW_DATASET_PATH = 'C:/dataset/US_Accidents.csv'
data = pd.read_csv(RAW_DATASET_PATH)


KeyboardInterrupt



In [None]:
# Preview the first five rows of the raw dataset
data.head(n=5)

In [1]:
# Show how many accidents are recorded per state before filtering
print("\nValue Counts for State:")
print(data['State'].value_counts())

# Keep only the rows whose State is 'UT' (Utah). The explicit .copy() makes
# utah_data an independent DataFrame, so adding columns to it later does not
# write into a slice of `data` (which triggers SettingWithCopyWarning).
utah_data = data[data['State'] == 'UT'].copy()


Value Counts for Wind_Direction:


NameError: name 'data' is not defined

### Integrate Altitude data

In [None]:
# Function to get elevations for a list of latitudes and longitudes
def get_elevations(latitudes, longitudes, timeout=30):
    """Look up elevations for coordinate pairs via the Open-Meteo elevation API.

    Parameters
    ----------
    latitudes, longitudes : sequence of float
        Coordinates paired by position.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list
        Exactly one entry per input latitude, in input order. An entry is
        ``None`` when its pair is out of range (or NaN), when the request
        fails, or when the response does not contain one elevation per
        requested pair.
    """
    latitudes = list(latitudes)
    longitudes = list(longitudes)
    results = [None] * len(latitudes)

    # Pairs cannot be formed when the two sequences differ in length
    if len(latitudes) != len(longitudes):
        return results

    # Validate each (lat, lon) pair together so a bad latitude never causes
    # the following longitudes to be matched with the wrong latitude.
    # NaN fails both range comparisons and is therefore treated as invalid.
    valid_positions = [
        pos
        for pos, (lat, lon) in enumerate(zip(latitudes, longitudes))
        if -90 <= lat <= 90 and -180 <= lon <= 180
    ]
    if not valid_positions:
        return results

    lat_param = ','.join(str(latitudes[pos]) for pos in valid_positions)
    lon_param = ','.join(str(longitudes[pos]) for pos in valid_positions)
    url = f"https://api.open-meteo.com/v1/elevation?latitude={lat_param}&longitude={lon_param}"
    try:
        # Without a timeout a stalled connection would block the loop forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # an exception for HTTP error codes
        elevations = response.json().get('elevation')
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return results  # Return None for failed requests

    # Only trust the payload when it holds one elevation per requested pair
    if not isinstance(elevations, list) or len(elevations) != len(valid_positions):
        return results

    # Scatter the elevations back to the positions of the valid pairs
    for pos, elevation in zip(valid_positions, elevations):
        results[pos] = elevation
    return results

# The DataFrame is processed in chunks of 100 rows to comply with the API's limitation
chunk_size = 100
altitude_list = []

# tqdm wraps the chunk start offsets so progress is visible while fetching
for start in tqdm(range(0, utah_data.shape[0], chunk_size), desc='Fetching Altitudes'):
    batch = utah_data.iloc[start:start + chunk_size]
    elevations = get_elevations(batch['Start_Lat'].tolist(), batch['Start_Lng'].tolist())
    altitude_list.extend(elevations)

# Every row needs exactly one altitude entry; otherwise values would be
# attached to the wrong accidents
if len(altitude_list) != len(utah_data):
    raise ValueError(
        f"Expected {len(utah_data)} altitude values but collected {len(altitude_list)}"
    )

# assign() returns a new DataFrame instead of writing into utah_data in place,
# which avoids SettingWithCopyWarning when utah_data is a filtered slice
utah_data = utah_data.assign(Altitude=altitude_list)

In [None]:
# Run again for failed API requests.
# TODO: Merge both snippets

def get_elevations(latitudes, longitudes, timeout=30):
    """Fetch elevations for coordinate pairs from the Open-Meteo elevation API.

    This definition performs no coordinate validation; every value is sent
    to the API as given.

    Parameters
    ----------
    latitudes, longitudes : sequence of float
        Coordinates paired by position.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list
        One entry per input latitude. All entries are ``None`` when the
        request fails or when the response does not contain exactly one
        elevation per latitude.
    """
    latitudes = list(latitudes)
    failed = [None] * len(latitudes)

    # Construct the API URL with the given latitudes and longitudes
    url = f"https://api.open-meteo.com/v1/elevation?latitude={','.join(map(str, latitudes))}&longitude={','.join(map(str, longitudes))}"
    try:
        # Without a timeout a stalled connection would block the loop forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an error for bad responses
        elevations = response.json().get('elevation')
    except requests.RequestException as e:
        print(f"API request failed: {e}")
        return failed  # Return None for failed requests

    # A payload with the wrong number of entries cannot be matched to rows
    if not isinstance(elevations, list) or len(elevations) != len(latitudes):
        return failed
    return elevations

# Select the rows whose Altitude is still missing (NaN/None)
missing_altitude_df = utah_data[pd.isna(utah_data['Altitude'])]

# Altitudes fetched for those rows, in the same order as missing_altitude_df
fetched_altitudes = []

for start in tqdm(range(0, missing_altitude_df.shape[0], chunk_size), desc='Filling Missing Altitudes'):
    batch = missing_altitude_df.iloc[start:start + chunk_size]
    elevations = get_elevations(batch['Start_Lat'].tolist(), batch['Start_Lng'].tolist())
    fetched_altitudes.extend(elevations)

# Write the new values back by row label. Iterating over the index avoids
# building a full row object per record, which iterrows() would do.
for row_label, altitude in zip(missing_altitude_df.index, fetched_altitudes):
    if altitude is not None:  # Only update if the API call was successful
        utah_data.at[row_label, 'Altitude'] = altitude



In [None]:
# Persist the draft dataset (without the index column) as CSV
draft_csv_path = 'utah_traffic_accidents.csv'
utah_data.to_csv(draft_csv_path, index=False)

### Integrate Temperature Variations, Oxygen Levels, UV Radiation, Hazards etc.

In [None]:
# TODO: Integrate temperature variations, oxygen levels, UV radiation, hazards, etc.

## Data cleaning

In [4]:
# Read the previously saved Utah dataset back from disk
saved_csv_path = 'utah_traffic_accidents.csv'
utah_data = pd.read_csv(saved_csv_path)

In [None]:
# Display the reloaded DataFrame as the cell output
utah_data

In [None]:
# Count the missing entries in every column and print the summary
missing_per_column = utah_data.isnull().sum()
print("Check for missing values \n")
print(missing_per_column)

In [None]:
# Print the frequency of each distinct value for the listed columns
columns_to_inspect = ['Precipitation(in)']
for column in columns_to_inspect:
    counts = utah_data[column].value_counts()
    print(f"\nValue Counts for {column}:")
    print(counts)

In [None]:
# Columns that are sparsely populated or not needed for the analysis
columns_to_drop = [
    'Source', 'End_Lat', 'End_Lng', 'Wind_Chill(F)', 'Description', 'Street',
    'County', 'Zipcode', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
    'Amenity', 'Bump', 'Give_Way', 'No_Exit', 'Roundabout', 'Traffic_Calming',
    'Turning_Loop',
]
utah_data = utah_data.drop(columns=columns_to_drop)
utah_data.columns

In [None]:
# Discard rows lacking twilight or precipitation values (insignificant amounts),
# then discard any row that still has a missing value and renumber the index
utah_data = (
    utah_data
    .dropna(subset=['Nautical_Twilight', 'Precipitation(in)'])
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [None]:
# Map the original column names to shorter names without unit suffixes
column_renames = {
    'Start_Lat': 'Geo_lat',
    'Start_Lng': 'Geo_lng',
    'Distance(mi)': 'Distance',
    'Temperature(F)': 'Temperature',
    'Humidity(%)': 'Humidity',
    'Pressure(in)': 'Pressure',
    'Visibility(mi)': 'Visibility',
    'Wind_Speed(mph)': 'Wind_Speed',
    'Precipitation(in)': 'Precipitation',
}
utah_data = utah_data.rename(columns=column_renames)

In [None]:
# Remove the State column from the frame
utah_data = utah_data.drop(columns='State')

In [6]:
# Remove every row that still contains a missing value.
# Stricter alternative kept for reference (fail instead of dropping):
# nan_columns = utah_data.columns[utah_data.isnull().any()].tolist()
# if nan_columns:
#     raise ValueError(f"NaN values found in columns: {nan_columns}")
utah_data = utah_data.dropna(axis=0, how='any')

In [5]:
# Parse both timestamp columns, then store the elapsed time between them
# in seconds as 'Time_Duration'
for time_column in ('Start_Time', 'End_Time'):
    utah_data[time_column] = pd.to_datetime(utah_data[time_column])
elapsed = utah_data['End_Time'] - utah_data['Start_Time']
utah_data['Time_Duration'] = elapsed.dt.total_seconds()

In [None]:
# Fixing fractional seconds in the time columns by flooring to whole seconds.

time_columns = ['Start_Time', 'End_Time']
try:
    for time_column in time_columns:
        # Lowercase 's' is the seconds alias; uppercase 'S' is deprecated in
        # recent pandas releases and emits a FutureWarning
        utah_data[time_column] = pd.to_datetime(utah_data[time_column]).dt.floor('s')
except Exception as e:
    print("Error converting dates:", e)
    # Retry with coercion so unparseable entries become NaT instead of raising
    for time_column in time_columns:
        utah_data[time_column] = pd.to_datetime(utah_data[time_column], errors='coerce').dt.floor('s')
    # NaT values mark the entries whose conversion failed
    problematic_starts = utah_data[utah_data['Start_Time'].isna()]
    problematic_ends = utah_data[utah_data['End_Time'].isna()]
    if not problematic_starts.empty or not problematic_ends.empty:
        print("Problematic Start Times:", problematic_starts)
        print("Problematic End Times:", problematic_ends)

In [6]:
# Overwrite the CSV on disk with the cleaned dataset (index column omitted)
cleaned_csv_path = 'utah_traffic_accidents.csv'
utah_data.to_csv(cleaned_csv_path, index=False)

In [None]:
# ## Tests...
# #can not be used due to the API limits
# 
# API_KEY = os.environ['TOMTOM_API_KEY']  # key redacted: load it from the environment, never commit it
# 
# BASE_URL = 'https://api.tomtom.com/traffic/services/4/flowSegmentData/absolute/10/json'
# COORDINATES = '40.781876,-111.910858'  # Replace with your specific coordinates
# 
# def get_traffic_data(api_key, coordinates):
#     params = {
#         'key': api_key,
#         'point': coordinates,
#         'unit': 'KMPH',
#     }
# 
#     response = requests.get(BASE_URL, params=params)
# 
#     if response.status_code == 200:
#         return response.json()  # Parse JSON response if the call was successful
#     else:
#         raise Exception(f"Failed to fetch data: {response.status_code} - {response.text}")
# 
# if __name__ == "__main__":
#     try:
#         traffic_data = get_traffic_data(API_KEY, COORDINATES)
#         print(traffic_data)
#     except Exception as e:
#         print(e)
