<a href="https://colab.research.google.com/github/junclemente/ads507-finalproject/blob/main/ADS_507_EDA_Source_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Import Data & Libraries

In [1]:
#library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Each source CSV is read straight from the project's GitHub repository.
# The URL and the DataFrame parsed from it are kept side by side per dataset.

# Traffic alerts
traffic_url = 'https://raw.githubusercontent.com/junclemente/ads507-finalproject/main/datasets/traffic_alerts.csv'
traffic_df = pd.read_csv(traffic_url)

# Weather station readings
weather_url = 'https://raw.githubusercontent.com/junclemente/ads507-finalproject/main/datasets/weather_alerts.csv'
weather_df = pd.read_csv(weather_url)

# Route travel times
travel_url = 'https://raw.githubusercontent.com/junclemente/ads507-finalproject/main/datasets/travel_times.csv'
travel_df = pd.read_csv(travel_url)

## Create functions

In [3]:
#create categorical data quality report function
# Calculate mode and frequency for each categorical feature
def cat_dqr(df, categorical_features=None):
    """Build a data quality report for categorical columns.

    For every requested column the report lists the non-null count, the
    number of missing values, the cardinality, and the two most frequent
    values together with their frequency and their share of the non-null
    count (as a percentage).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    categorical_features : str or iterable of str, optional
        Column(s) to report on. A single column name is treated as a
        one-column request. When omitted, every ``object``/``category``
        column of ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``Feature``, ``Count``,
        ``Missing Values``, ``Cardinality``, ``Mode``, ``Mode Frequency``,
        ``Mode %``, ``2nd Mode``, ``2nd Mode Frequency`` and ``2nd Mode %``.
        When a column has fewer than one (or two) distinct values the
        corresponding mode is ``None`` with frequency and percentage ``0``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if categorical_features is None:
        categorical_features = df.select_dtypes(include=['object', 'category']).columns
    elif isinstance(categorical_features, str):
        # Iterating a bare string would yield its characters, not the column.
        categorical_features = [categorical_features]
    # Materialise as a list: a tuple passed to df[...] would be looked up as
    # one (multi-level) key instead of several columns.
    features = list(categorical_features)

    report_columns = [
        'Feature',
        'Count',
        'Missing Values',
        'Cardinality',
        'Mode',
        'Mode Frequency',
        'Mode %',
        '2nd Mode',
        '2nd Mode Frequency',
        '2nd Mode %',
    ]

    rows = []
    for feature in features:
        column = df[feature]
        count = column.count()
        # value_counts() ignores NaN and is sorted by descending frequency,
        # so positions 0 and 1 are the mode and the second mode.
        value_counts = column.value_counts()

        row = {
            'Feature': feature,
            'Count': count,
            'Missing Values': column.isnull().sum(),
            'Cardinality': column.nunique(),
        }

        for rank, label in enumerate(('Mode', '2nd Mode')):
            if len(value_counts) > rank:
                value = value_counts.index[rank]
                freq = value_counts.iloc[rank]
                # Percentages are relative to the non-null count.
                pct = (freq / count) * 100 if count > 0 else 0
            else:
                value, freq, pct = None, 0, 0
            row[label] = value
            row[f'{label} Frequency'] = freq
            row[f'{label} %'] = pct

        rows.append(row)

    # Passing columns= keeps the full header even when no features were given.
    return pd.DataFrame(rows, columns=report_columns)


In [4]:
#create continuous data quality function
def cont_dqr(df, exclude_columns=None):
    """Build a data quality report for continuous columns.

    Every ``float64``/``int64`` column of ``df`` that is not excluded gets one
    row of summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    exclude_columns : str or iterable of str, optional
        Column name(s) to leave out of the report. A single name is treated
        as a one-element list; any iterable (list, tuple, pandas Index, ...)
        is accepted. ``None`` or an empty iterable excludes nothing.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``Feature``, ``Count``,
        ``Missing Values``, ``Cardinality``, ``Min``, ``1st Quartile``,
        ``Mean``, ``Median``, ``3rd Quartile``, ``Max`` and
        ``Standard Deviation``.
    """
    # Select continuous features (int64, float64)
    continuous_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Exclude specified columns
    if exclude_columns is not None:
        if isinstance(exclude_columns, str):
            # A bare string would turn `col not in exclude_columns` into a
            # substring test (e.g. 'id' would match 'tt_id').
            excluded = [exclude_columns]
        else:
            # list() also avoids the ambiguous truth value of Index/ndarray.
            excluded = list(exclude_columns)
        continuous_features = [col for col in continuous_features if col not in excluded]

    # Slice once and reuse the sub-frame for every statistic.
    numeric = df[continuous_features]

    # Build the data quality report for continuous features
    cont_quality = pd.DataFrame({
        'Feature': continuous_features,
        'Count': numeric.count().values,
        'Missing Values': numeric.isnull().sum().values,
        'Cardinality': numeric.nunique().values,
        'Min': numeric.min().values,
        '1st Quartile': numeric.quantile(0.25).values,
        'Mean': numeric.mean().values,
        'Median': numeric.median().values,
        '3rd Quartile': numeric.quantile(0.75).values,
        'Max': numeric.max().values,
        'Standard Deviation': numeric.std().values,
    })

    return cont_quality

# Traffic Data

## Traffic Data Review

In [5]:
# Preview the first five rows of the traffic alerts table.
traffic_df.head()

Unnamed: 0,AlertID,County,EndRoadwayLocation,EndTime,EventCategory,EventStatus,ExtendedDescription,HeadlineDescription,LastUpdatedTime,Priority,Region,StartRoadwayLocation,StartTime,timestamp,ta_id
0,624112,Grant,"{""Description"": null, ""Direction"": ""B"", ""Latit...",,Rest Area,Open,,The blue Lake rest area is closed for the winter.,/Date(1731629066113-0800)/,Low,North Central,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1731628800000-0800)/,2025-02-03 22:21:17.828031,2
1,632850,,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1738328400000-0800)/,Maintenance,Open,,"Nightly from 9 p.m. until 5 a.m. Sunday, Jan. ...",/Date(1737765776500-0800)/,High,Northwest,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1737765300000-0800)/,2025-02-03 22:21:17.828031,2
2,632245,,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1738371600000-0800)/,Maintenance,Open,,"Weekdays, from 7 a.m. to 5 p.m., Wednesday, Ja...",/Date(1737148405100-0800)/,Low,Northwest,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1737148080000-0800)/,2025-02-03 22:21:17.828031,2
3,626169,Pend Oreille,"{""Description"": null, ""Direction"": ""B"", ""Latit...",,Road Report,Open,,"US2 from Spokane to Newport, road condition: ...",/Date(1738649920440-0800)/,Medium,Eastern,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1738642140000-0800)/,2025-02-03 22:21:17.828031,2
4,632835,,"{""Description"": null, ""Direction"": ""B"", ""Latit...",,Construction,Open,,Eastbound and westbound SR 18 will shift to th...,/Date(1737761204897-0800)/,Low,Northwest,"{""Description"": null, ""Direction"": ""B"", ""Latit...",/Date(1738155600000-0800)/,2025-02-03 22:21:17.828031,2


In [6]:
# (rows, columns) of the traffic alerts table.
traffic_df.shape

(203, 15)

In [7]:
# Count rows that exactly repeat an earlier row (all columns equal).
traffic_df.duplicated().sum()

0

In [8]:
# Show the dtype pandas inferred for each traffic column.
traffic_df.dtypes

Unnamed: 0,0
AlertID,int64
County,object
EndRoadwayLocation,object
EndTime,object
EventCategory,object
EventStatus,object
ExtendedDescription,float64
HeadlineDescription,object
LastUpdatedTime,object
Priority,object


In [9]:
# Number of distinct AlertID values, to compare against the row count.
traffic_df['AlertID'].nunique()

203

In [10]:
# Number of distinct ta_id values.
traffic_df['ta_id'].nunique()

1

There are 203 rows and `AlertID` has 203 distinct values, so every `AlertID` is unique and the column is a candidate primary key. `ta_id` has only one distinct value: every row is 2.

## Traffic Data Quality Report


We run only the categorical data quality report for this table, because the integer columns (`AlertID`, `ta_id`) are not needed.

In [12]:
# Collect every object/category column of the traffic alerts table ...
traffic_catf = traffic_df.select_dtypes(include=['object', 'category']).columns.tolist()
# ... and profile them with the categorical data quality report.
traffic_quality_report = cat_dqr(traffic_df, traffic_catf)
traffic_quality_report

Unnamed: 0,Feature,Count,Missing Values,Cardinality,Mode,Mode Frequency,Mode %,2nd Mode,2nd Mode Frequency,2nd Mode %
0,County,18,185,11,Spokane,6,33.333333,Whitman,3,16.666667
1,EndRoadwayLocation,203,0,131,"{""Description"": null, ""Direction"": ""N"", ""Latit...",18,8.866995,"{""Description"": null, ""Direction"": ""S"", ""Latit...",9,4.433498
2,EndTime,161,42,90,/Date(1738328400000-0800)/,20,12.42236,/Date(1738933200000-0800)/,10,6.21118
3,EventCategory,203,0,14,Construction,139,68.472906,Maintenance,26,12.807882
4,EventStatus,203,0,1,Open,203,100.0,,0,0.0
5,HeadlineDescription,203,0,201,The Iron Goat rest area is closed for the winter.,2,0.985222,"From 8 a.m. Monday, Feb. 3 until noon Friday, ...",2,0.985222
6,LastUpdatedTime,203,0,203,/Date(1731629066113-0800)/,1,0.492611,/Date(1737761028007-0800)/,1,0.492611
7,Priority,203,0,5,Medium,108,53.20197,Low,70,34.482759
8,Region,203,0,6,Northwest,147,72.413793,Olympic,20,9.852217
9,StartRoadwayLocation,203,0,170,"{""Description"": null, ""Direction"": ""S"", ""Latit...",3,1.477833,"{""Description"": null, ""Direction"": ""N"", ""Latit...",3,1.477833


# Weather Data

## Weather Data Review

In [13]:
# Preview the first five rows of the weather readings table.
weather_df.head()

Unnamed: 0,BarometricPressure,Latitude,Longitude,PrecipitationInInches,ReadingTime,RelativeHumidity,SkyCoverage,StationID,StationName,TemperatureInFahrenheit,Visibility,WindDirection,WindDirectionCardinal,WindGustSpeedInMPH,WindSpeedInMPH,timestamp,wa_id
0,950.0,47.4748,-122.2704,,/Date(1738648806000-0800)/,99.0,,1909,S 144th St on SB I-5 at mp 155.32,34.52,1.0,130.0,SE,1.0,0.0,2025-02-03 22:21:17.828031,3
1,894.7,47.760633,-122.184048,,/Date(1738648817000-0800)/,100.0,,1910,NE 195th on SB I-405 at mp 24.58,32.72,1.0,5.0,N,0.0,0.0,2025-02-03 22:21:17.828031,3
2,971.6,47.509,-121.885,,/Date(1738648807000-0800)/,98.0,,1928,EB I-90 / SR-18 (Echo Lake) at mp 26.30,31.28,1.0,,,1.0,0.0,2025-02-03 22:21:17.828031,3
3,993.5,47.726,-122.324,,/Date(1738648871000-0800)/,96.0,,1966,NE 130th Street on I-5 at mp 173.75,33.8,1.0,,,1.0,0.0,2025-02-03 22:21:17.828031,3
4,,46.436,-117.35,,/Date(1738649254000-0800)/,93.0,,1968,Alpowa Summit on US 12 at mp 413.36,30.2,12.0,171.0,S,8.0,6.0,2025-02-03 22:21:17.828031,3


In [14]:
# Count rows that exactly repeat an earlier row (all columns equal).
weather_df.duplicated().sum()

0

In [15]:
# Show the dtype pandas inferred for each weather column.
weather_df.dtypes

Unnamed: 0,0
BarometricPressure,float64
Latitude,float64
Longitude,float64
PrecipitationInInches,float64
ReadingTime,object
RelativeHumidity,float64
SkyCoverage,float64
StationID,int64
StationName,object
TemperatureInFahrenheit,float64


## Weather Data Quality Report

### Weather Continuous Data Quality Report

In [47]:
# An empty exclusion list means every float64/int64 column is profiled,
# including identifier columns such as StationID.
exclude_columns = []
weather_cont_quality = cont_dqr(weather_df, exclude_columns)
weather_cont_quality

Unnamed: 0,Feature,Count,Missing Values,Cardinality,Min,1st Quartile,Mean,Median,3rd Quartile,Max,Standard Deviation
0,BarometricPressure,34,69,33,825.5,946.325,965.820588,979.3,996.275,1099.5,54.53081
1,Latitude,103,0,103,45.569,46.53773,47.220197,47.240023,47.7512,49.0,0.8370237
2,Longitude,103,0,103,-124.333,-122.401108,-120.779463,-120.786,-119.396143,-117.081125,1.980481
3,PrecipitationInInches,0,103,0,,,,,,,
4,RelativeHumidity,90,13,34,1.0,77.25,84.133333,90.5,97.0,100.0,19.63041
5,SkyCoverage,0,103,0,,,,,,,
6,StationID,103,0,103,1909.0,2097.5,2975.378641,2955.0,3271.5,5927.0,1000.412
7,TemperatureInFahrenheit,94,9,61,-36.4,27.32,29.024255,30.74,33.08,36.32,8.217564
8,Visibility,86,17,3,0.0,1.0,0.965116,1.0,1.0,12.0,1.259764
9,WindDirection,68,35,39,0.0,10.0,82.514706,67.5,145.5,230.0,72.43


`wa_id` is not useful as a primary key, so we will need to create a new one. `SkyCoverage` is completely null (as is `PrecipitationInInches`). Latitude and longitude are present for every row.

### Weather Categorical Data Quality Report

In [52]:

# Collect every object/category column of the weather readings table ...
weather_catf = weather_df.select_dtypes(include=['object', 'category']).columns.tolist()
# ... and profile them with the categorical data quality report.
weather_quality_report = cat_dqr(weather_df, weather_catf)
weather_quality_report

Unnamed: 0,Feature,Count,Missing Values,Cardinality,Mode,Mode Frequency,Mode %,2nd Mode,2nd Mode Frequency,2nd Mode %
0,ReadingTime,103,0,102,/Date(1738649594000-0800)/,2,1.941748,/Date(1738648806000-0800)/,1,0.970874
1,StationName,103,0,103,S 144th St on SB I-5 at mp 155.32,1,0.970874,Parker on I-82 at mp 41.50,1,0.970874
2,WindDirectionCardinal,58,45,11,N,8,13.793103,ENE,7,12.068966
3,timestamp,103,0,1,2025-02-03 22:21:17.828031,103,100.0,,0,0.0


# Travel Data

## Travel Data Review

In [17]:
# Preview the first five rows of the travel times table.
travel_df.head()

Unnamed: 0,AverageTime,CurrentTime,Description,Distance,EndPoint,Name,StartPoint,TimeUpdated,TravelTimeID,timestamp,tt_id
0,25,25,Everett to Downtown Seattle using HOV lanes,26.72,"{""Description"": ""I-5 @ University St in Seattl...",Everett-Seattle HOV,"{""Description"": ""I-5 @ 41st St in Everett"", ""D...",/Date(1738650000000-0800)/,2,2025-02-03 22:21:17.828031,1
1,27,25,Downtown Seattle to Everett using HOV lanes,26.94,"{""Description"": ""I-5 @ 41st St in Everett"", ""D...",Seattle-Everett HOV,"{""Description"": ""I-5 @ University St in Seattl...",/Date(1738650000000-0800)/,3,2025-02-03 22:21:17.828031,1
2,27,25,Downtown Seattle to Everett,26.94,"{""Description"": ""I-5 @ 41st St in Everett"", ""D...",Seattle-Everett,"{""Description"": ""I-5 @ University St in Seattl...",/Date(1738650000000-0800)/,4,2025-02-03 22:21:17.828031,1
3,9,9,Downtown Bellevue to Issaquah,9.28,"{""Description"": ""I-90 @ Front St in Issaquah"",...",Bellevue-Issaquah,"{""Description"": ""I-405 @ NE 8th St in Bellevue...",/Date(1738650000000-0800)/,5,2025-02-03 22:21:17.828031,1
4,9,9,Downtown Bellevue to Issaquah using HOV lanes,9.28,"{""Description"": ""I-90 @ Front St in Issaquah"",...",Bellevue-Issaquah HOV,"{""Description"": ""I-405 @ NE 8th St in Bellevue...",/Date(1738650000000-0800)/,6,2025-02-03 22:21:17.828031,1


In [19]:
# Count rows that exactly repeat an earlier row (all columns equal).
travel_df.duplicated().sum()

0

In [20]:
# Show the dtype pandas inferred for each travel column.
travel_df.dtypes

Unnamed: 0,0
AverageTime,int64
CurrentTime,int64
Description,object
Distance,float64
EndPoint,object
Name,object
StartPoint,object
TimeUpdated,object
TravelTimeID,int64
timestamp,object


## Travel Data Quality Report

### Travel Continuous Data Quality Report

In [21]:
# An empty exclusion list means every float64/int64 column is profiled,
# including identifier columns such as TravelTimeID and tt_id.
exclude_columns = []
travel_cont_quality = cont_dqr(travel_df, exclude_columns)
travel_cont_quality

Unnamed: 0,Feature,Count,Missing Values,Cardinality,Min,1st Quartile,Mean,Median,3rd Quartile,Max,Standard Deviation
0,AverageTime,168,0,31,0.0,9.0,14.482143,13.0,19.25,38.0,7.526379
1,CurrentTime,168,0,33,0.0,9.0,15.083333,14.0,20.0,50.0,8.155977
2,Distance,168,0,122,2.66,9.6125,15.110179,13.545,18.3325,39.79,7.367362
3,TravelTimeID,168,0,168,1.0,45.75,192.303571,109.0,344.25,532.0,160.769957
4,tt_id,168,0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Travel Categorical Data Quality Report


In [22]:
# Collect every object/category column of the travel times table ...
travel_catf = travel_df.select_dtypes(include=['object', 'category']).columns.tolist()
# ... and profile them with the categorical data quality report.
travel_quality_report = cat_dqr(travel_df, travel_catf)
travel_quality_report

Unnamed: 0,Feature,Count,Missing Values,Cardinality,Mode,Mode Frequency,Mode %,2nd Mode,2nd Mode Frequency,2nd Mode %
0,Description,168,0,165,Downtown Seattle to Woodinville,2,1.190476,Woodinville to Downtown Seattle,2,1.190476
1,EndPoint,168,0,89,"{""Description"": ""I-5 @ University St in Seattl...",13,7.738095,"{""Description"": ""I-5 @ University St in Seattl...",12,7.142857
2,Name,168,0,168,Everett-Seattle HOV,1,0.595238,King County Line to Federal Way HOV,1,0.595238
3,StartPoint,168,0,91,"{""Description"": ""I-5 @ University St in Seattl...",11,6.547619,"{""Description"": ""I-405 @ NE 8th St in Bellevue...",10,5.952381
4,TimeUpdated,168,0,1,/Date(1738650000000-0800)/,168,100.0,,0,0.0
5,timestamp,168,0,1,2025-02-03 22:21:17.828031,168,100.0,,0,0.0
