**Data Source:** https://archive.ics.uci.edu/dataset/492/metro+interstate+traffic+volume

In [3]:
import pandas as pd
import numpy as np
import random

# Load the dataset 
file_path = 'data/Metro_Interstate_Traffic_Volume.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


Notice how "Holday" Column values show up as 'NaN' where there are actually "None" when opened in Excel? We need to instruct Pandas to avoid reading those as NaN or Null as this will become an issue later in the processing steps.

In [4]:
# Load the dataset and treat 'None' as a regular string
data = pd.read_csv(file_path, keep_default_na=False)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


But What if there are actual "Null/NaN" values? This approach will ignore those as well which is not desired. We will have to use a trwo step approach for this dataset.

In [6]:
# Load the dataset normally
data = pd.read_csv(file_path)

# Replace NaN values in 'holiday' column with the string 'None'
data['holiday'].fillna('None', inplace=True)

# Check the changes
print(data['holiday'].unique())


['None' 'Columbus Day' 'Veterans Day' 'Thanksgiving Day' 'Christmas Day'
 'New Years Day' 'Washingtons Birthday' 'Memorial Day' 'Independence Day'
 'State Fair' 'Labor Day' 'Martin Luther King Jr Day']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['holiday'].fillna('None', inplace=True)


In [7]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


# Introduce Noise and Issues to the data

1- Let's randomly introduce missing values into 'temp', 'rain_1h', and 'traffic_volume'.

In [8]:
# Define a function to randomly insert NaN values
def insert_missing(df, column, percentage=0.05):
    num_missing = int(len(df) * percentage)
    missing_indices = random.sample(range(len(df)), num_missing)
    for i in missing_indices:
        df.at[i, column] = np.nan

# Insert missing values
insert_missing(data, 'temp')
insert_missing(data, 'rain_1h')
insert_missing(data, 'traffic_volume')

2- Now, We’ll add outliers to the temp and traffic_volume.

In [9]:
# Function to add outliers
def add_outliers(df, column, percentage=0.01, scale=3):
    num_outliers = int(len(df) * percentage)
    outlier_indices = random.sample(range(len(df)), num_outliers)
    for i in outlier_indices:
        outlier_value = df[column].mean() + scale * df[column].std() * random.choice([-1, 1])
        df.at[i, column] = outlier_value

# Add outliers to temp and traffic_volume
add_outliers(data, 'temp')
add_outliers(data, 'traffic_volume')


3- Next, we introduce Categorical Errors

In [10]:
# Introduce errors in categorical data
def introduce_errors(df, column, error_rate=0.01):
    categories = df[column].unique()
    num_errors = int(len(df) * error_rate)
    error_indices = random.sample(range(len(df)), num_errors)
    for i in error_indices:
        df.at[i, column] = random.choice(categories) + random.choice(['x', ' ', '#', '!'])

introduce_errors(data, 'weather_main')

4- Finally, We’ll add duplicate records.

In [11]:
# Introduce duplicates
def add_duplicates(df, num_duplicates=10):
    duplicates = df.sample(n=num_duplicates, replace=False)
    df = pd.concat([df, duplicates], ignore_index=True)
    return df

data = add_duplicates(data)


Now let's look at the new data

In [12]:
# Display the first 20 rows of the new dataset
data.head(20)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545.0
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516.0
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,9359.07625
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026.0
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918.0
5,,291.72,0.0,0.0,1,Clear,sky is clear,2012-10-02 14:00:00,5181.0
6,,293.17,0.0,0.0,1,Clear,sky is clear,2012-10-02 15:00:00,5584.0
7,,293.86,0.0,0.0,1,Clear,sky is clear,2012-10-02 16:00:00,6015.0
8,,294.14,0.0,0.0,20,Clouds,few clouds,2012-10-02 17:00:00,5791.0
9,,293.1,0.0,0.0,20,Clouds,few clouds,2012-10-02 18:00:00,4770.0


Finally, save the modified dataset 

In [14]:
data.to_csv('data/modified_Metro_traffic_data.csv', index=False)
