# Data Preparation

**Data Source:** https://archive.ics.uci.edu/dataset/492/metro+interstate+traffic+volume

In [None]:
import pandas as pd
import numpy as np
import random

# Path to the raw traffic-volume CSV (relative to the working directory)
file_path = 'data/Metro_Interstate_Traffic_Volume.csv'
# First pass: read the file with pandas' default parsing options
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Notice how the "holiday" column values show up as 'NaN' where the file actually contains the string "None" (as seen when it is opened in Excel)? We need to instruct pandas not to read those values as NaN/null, as this would become an issue in later processing steps.

In [None]:
# Reload with pandas' default NA markers disabled so the literal text 'None'
# is kept as a regular string instead of being parsed as NaN
data = pd.read_csv(file_path, keep_default_na=False)

# Display the first few rows of the dataset
data.head()

But what if there are actual null/NaN values? This approach would stop pandas from recognizing those as missing as well, which is not desired. We will have to use a two-step approach for this dataset.

In [None]:
# Step 1: read the file with pandas' default NA handling
data = pd.read_csv(file_path)

# Step 2: put the string 'None' back wherever 'holiday' was parsed as NaN
holiday_filled = data['holiday'].fillna('None')
data['holiday'] = holiday_filled

# Confirm the result by listing the distinct 'holiday' values
print(data['holiday'].unique())


In [None]:
# Preview the first rows again now that the 'holiday' NaNs have been filled
data.head()

# Introduce Noise and Issues to the data

1- Let's randomly introduce missing values into 'temp', 'rain_1h', and 'traffic_volume'.

In [9]:
# Define a function to randomly insert NaN values
def insert_missing(df, column, percentage=0.05):
    """Set a random share of one column to NaN, modifying ``df`` in place.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column that receives the NaN values.
        percentage: Fraction of rows to blank out; the resulting row count
            is truncated to an integer.

    Rows are selected by position, so the frame may carry any index
    (non-default, filtered, non-unique) without rows being appended or the
    wrong rows being overwritten.
    """
    num_missing = int(len(df) * percentage)
    if num_missing == 0:
        return
    # Distinct row positions in 0 .. len(df) - 1.
    missing_positions = random.sample(range(len(df)), num_missing)
    # Positional write: label-based .at[] would append a new row whenever a
    # sampled integer is not an existing index label.
    df.iloc[missing_positions, df.columns.get_loc(column)] = np.nan

# Blank out a random share (the function's default rate) of each column
for column_name in ('temp', 'rain_1h', 'traffic_volume'):
    insert_missing(data, column_name)

2- Now, we'll add outliers to the 'temp' and 'traffic_volume' columns.

In [10]:
# Function to add outliers
def add_outliers(df, column, percentage=0.01, scale=3):
    """Overwrite a random share of a numeric column with outliers, in place.

    Every selected row is set to ``mean + scale * std`` or
    ``mean - scale * std`` (sign chosen at random), where the mean and
    standard deviation are those of the column *before* any outlier is
    written.

    Args:
        df: DataFrame to modify in place.
        column: Name of the numeric column to corrupt.
        percentage: Fraction of rows to replace; the resulting row count is
            truncated to an integer.
        scale: Number of standard deviations away from the mean.
    """
    num_outliers = int(len(df) * percentage)
    if num_outliers == 0:
        return
    # Compute the statistics once, up front: recomputing them inside the loop
    # lets each injected outlier shift the next one and costs a full pass
    # over the column per row.
    center = df[column].mean()
    offset = scale * df[column].std()
    outlier_positions = random.sample(range(len(df)), num_outliers)
    outlier_values = [
        center + offset * random.choice([-1, 1]) for _ in outlier_positions
    ]
    # Positional write: label-based .at[] would append a new row whenever a
    # sampled integer is not an existing index label.
    df.iloc[outlier_positions, df.columns.get_loc(column)] = outlier_values

# Inject outliers into the temperature and traffic-volume columns
for column_name in ('temp', 'traffic_volume'):
    add_outliers(data, column_name)


3- Next, we introduce Categorical Errors

In [11]:
# Introduce errors in categorical data
def introduce_errors(df, column, error_rate=0.01):
    """Corrupt a random share of a string column's labels, in place.

    Each selected row is overwritten with a randomly chosen existing category
    followed by one stray character (``'x'``, ``' '``, ``'#'`` or ``'!'``).

    Args:
        df: DataFrame to modify in place.
        column: Name of the categorical (string) column to corrupt.
        error_rate: Fraction of rows to corrupt; the resulting row count is
            truncated to an integer.
    """
    num_errors = int(len(df) * error_rate)
    # Missing values are not categories: appending a suffix to NaN would
    # raise TypeError, so they are left out of the pool.
    categories = list(df[column].dropna().unique())
    if num_errors == 0 or not categories:
        return
    error_positions = random.sample(range(len(df)), num_errors)
    corrupted_labels = [
        random.choice(categories) + random.choice(['x', ' ', '#', '!'])
        for _ in error_positions
    ]
    # Positional write: label-based .at[] would append a new row whenever a
    # sampled integer is not an existing index label.
    df.iloc[error_positions, df.columns.get_loc(column)] = corrupted_labels

# Corrupt a random share of the 'weather_main' labels (default error rate)
introduce_errors(data, 'weather_main')

4- Finally, we'll add duplicate records.

In [12]:
# Introduce duplicates
def add_duplicates(df, num_duplicates=10):
    """Return a copy of ``df`` with randomly chosen rows appended as duplicates.

    Args:
        df: Source DataFrame; it is not modified.
        num_duplicates: Number of distinct rows to sample (without
            replacement) and append.

    Returns:
        A new DataFrame holding the original rows followed by the sampled
        rows, renumbered with a fresh 0..n-1 index.
    """
    sampled_rows = df.sample(n=num_duplicates, replace=False)
    return pd.concat([df, sampled_rows], ignore_index=True)

# add_duplicates returns a new frame, so rebind `data` to keep the extra rows
data = add_duplicates(data)


Now let's look at the new data

In [None]:
# Display the first 50 rows of the new dataset
data.head(50)

Finally, save the modified dataset 

In [17]:
# Write the modified data to CSV; index=False leaves the row index out of the file
data.to_csv('data/modified_Metro_traffic_data.csv', index=False)
