In [60]:
import pandas as pd
from pathlib import Path
import re

In [None]:
# Cap how many rows and columns pandas renders when a frame is displayed
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

In [61]:
#big_path = "../../../Traffic_Violations.csv"
# Build the path from its parts so it resolves on every OS. The previous literal used a
# backslash separator (Windows-only) and relied on "\T" not being a recognised escape sequence.
big_path = Path("data") / "Traffic_Violations.csv"
# low_memory=False reads the file in one pass rather than in internally chunked pieces
big_df = pd.read_csv(big_path, low_memory=False)

In [62]:
# Preview the first five rows of the raw data
big_df.head(n=5)

In [63]:
# List the column labels of the raw frame
big_df.columns

In [163]:
# Remove the columns we decided not to carry forward; the raw frame `big_df` is left untouched
big_df_smaller = big_df.drop(
    columns=[
        "Agency", "Geolocation", "VehicleType", "HAZMAT",
        "Commercial Vehicle", "Commercial License", "Article", "Charge",
        "SeqID", "SubAgency", "Model", "Contributed To Accident", "Driver State",
        "Driver City", "Arrest Type", "Search Reason For Stop",
    ]
)

In [164]:
# Drop rows that do not result in a citation or warning
# (Violation Type 'ESERO' or 'SERO'); both codes are handled with one mask and one drop.
no_action_mask = big_df_smaller['Violation Type'].isin(['ESERO', 'SERO'])
big_df_smaller.drop(index=big_df_smaller[no_action_mask].index, inplace=True)

In [165]:
# --- Year clean-up ---
# Discard rows whose Year is missing
big_df_smaller = big_df_smaller.dropna(subset=["Year"])
# With the nulls gone the column can be cast to integers
big_df_smaller["Year"] = big_df_smaller["Year"].astype(int)
# Every distinct year present in the data
years = big_df_smaller["Year"].unique()
# Year values treated as garbage
years_to_remove = [0, 6338, 1005, 1196, 2912, 1009, 2088, 1007, 2102, 2109, 2105, 2997]
# Distinct years that are not on the garbage list
good_years = [yr for yr in years if yr not in years_to_remove]
# Retain only the rows whose year is one of the good years
keep_year = big_df_smaller["Year"].isin(good_years)
big_df_smaller = big_df_smaller[keep_year]
big_df_smaller.shape

(1790375, 27)

In [166]:
# Keep only data that has latitude and longitude.
# A coordinate counts as missing when it is 0 or NaN; NaN has to be tested explicitly
# because `NaN != 0` evaluates to True and would otherwise slip through the filter.
has_latitude = big_df_smaller['Latitude'].notna() & (big_df_smaller['Latitude'] != 0)
has_longitude = big_df_smaller['Longitude'].notna() & (big_df_smaller['Longitude'] != 0)
# .copy() detaches the result from the parent frame so that later column assignments
# operate on an independent frame and do not raise SettingWithCopyWarning.
big_df_smaller = big_df_smaller[has_latitude & has_longitude].copy()


In [167]:
# List the columns that remain after the drops and filters above
big_df_smaller.columns

Index(['Date Of Stop', 'Time Of Stop', 'Description', 'Location', 'Latitude',
       'Longitude', 'Accident', 'Belts', 'Personal Injury', 'Property Damage',
       'Fatal', 'Alcohol', 'Work Zone', 'Search Conducted',
       'Search Disposition', 'Search Outcome', 'Search Reason', 'Search Type',
       'Search Arrest Reason', 'State', 'Year', 'Make', 'Color',
       'Violation Type', 'Race', 'Gender', 'DL State'],
      dtype='object')

In [168]:
# Summarise dtypes and non-null counts per column to see where values are missing
big_df_smaller.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1665844 entries, 1 to 1878004
Data columns (total 27 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Date Of Stop          1665844 non-null  object 
 1   Time Of Stop          1665844 non-null  object 
 2   Description           1665836 non-null  object 
 3   Location              1665841 non-null  object 
 4   Latitude              1665844 non-null  float64
 5   Longitude             1665844 non-null  float64
 6   Accident              1665844 non-null  object 
 7   Belts                 1665844 non-null  object 
 8   Personal Injury       1665844 non-null  object 
 9   Property Damage       1665844 non-null  object 
 10  Fatal                 1665844 non-null  object 
 11  Alcohol               1665844 non-null  object 
 12  Work Zone             1665844 non-null  object 
 13  Search Conducted      1023793 non-null  object 
 14  Search Disposition    71352 non-nu

In [169]:
# null_count = big_df_smaller['Accident'].isnull().sum()
# null_count

In [170]:
# if big_df_smaller.isnull().values.any():
#     print("There are null values in the DataFrame.")
# else:
#     print("There are no null values in the DataFrame.")

In [171]:
# big_df_smaller[['Fatal', 'Alcohol', 'Work Zone', 'Search Conducted',
#        'Search Disposition', 'Search Outcome', 'Search Reason']].isnull()

In [172]:
# # Check the number of null values in each column
# null_count_per_column = big_df_smaller.isnull().sum()

# # Display the result
# print("Number of null values in each column:")
# print(null_count_per_column)


In [173]:

# big_df_smaller["Search Reason"].value_counts(dropna = False)


In [174]:
# Parse the stop date once, store it back, and derive the calendar parts from the parsed values
stop_dates = pd.to_datetime(big_df_smaller['Date Of Stop'])
big_df_smaller['Date Of Stop'] = stop_dates
big_df_smaller['Year Date'] = stop_dates.dt.year
big_df_smaller['Month'] = stop_dates.dt.month
big_df_smaller['Day'] = stop_dates.dt.day
# NOTE(review): this snapshot is taken before the Description clean-up below and is not
# referenced again in the visible cells - confirm it is still needed.
random_forest_df = big_df_smaller.drop(columns=['Date Of Stop'])

In [175]:
# Discard every row whose Description is missing
big_df_smaller = big_df_smaller.dropna(axis="index", how="any", subset=["Description"])

In [176]:
# Relabel any description matching EXCEEDING ... SPEED LIMIT (case-insensitive) as "Speeding".
# Series.str.replace applies the compiled pattern without a Python-level lambda and skips missing values.
pattern = re.compile(r'.*EXCEEDING.*SPEED LIMIT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Speeding', regex=True)

In [177]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

14529

In [178]:
# Relabel any description matching FAILURE ... YIELD (case-insensitive) as "Failure to Yield".
# Vectorised str.replace; missing values are left untouched instead of raising.
pattern = re.compile(r'.*FAILURE.*YIELD.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Failure to Yield', regex=True)

In [179]:
# Relabel any description matching FAIL ... YIELD (case-insensitive; also covers FAILED/FAILURE)
# as "Failure to Yield". Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*FAIL.*YIELD.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Failure to Yield', regex=True)

In [180]:
# Relabel any description matching LEARNER ... PERMIT (case-insensitive) as "Learners Permit".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*LEARNER.*PERMIT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Learners Permit', regex=True)

In [181]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

14241

In [182]:
# Relabel any description containing SPEED (case-insensitive) as "Speeding".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*SPEED.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Speeding', regex=True)

In [183]:
# Relabel any description containing SPEEDING (case-insensitive) as "Speeding".
# NOTE(review): when cells run top to bottom, the broader SPEED pattern in the preceding cell
# has already relabelled these rows, so this step is expected to change nothing.
pattern = re.compile(r'.*SPEEDING.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Speeding', regex=True)

In [184]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

11705

In [185]:
# Relabel any description containing LAMP (case-insensitive) as "Improper Equipment".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*LAMP.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Equipment', regex=True)

In [186]:
# Relabel any description containing HEADLIGHT (case-insensitive) as "Improper Equipment".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*HEADLIGHT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Equipment', regex=True)

In [187]:
# Relabel any description matching REQUIRED ... MINIMUM ... EQUIPMENT (case-insensitive)
# as "Improper Equipment". Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*REQUIRED.*MINIMUM.*EQUIPMENT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Equipment', regex=True)

In [188]:
# Relabel any description containing INOPERATIVE (case-insensitive) as "Improper Equipment".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*INOPERATIVE.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Equipment', regex=True)

In [189]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

9413

In [190]:
# Relabel any description containing PARK (case-insensitive) as "Non-Moving Violation".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*PARK.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Non-Moving Violation', regex=True)

In [191]:
# Relabel any description matching ABANDONING ... VEH (case-insensitive) as "Non-Moving Violation".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*ABANDONING.*VEH.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Non-Moving Violation', regex=True)

In [192]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

9151

In [193]:
# Relabel any description matching INFLUENCE ... ALCOHOL (case-insensitive) as "DUI".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*INFLUENCE.*ALCOHOL.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'DUI', regex=True)

In [194]:
# Relabel any description matching IMPAIRED ... ALCOHOL (case-insensitive) as "DUI".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*IMPAIRED.*ALCOHOL.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'DUI', regex=True)

In [195]:
# Relabel any description matching FAILED ... STOP ... SIGN (case-insensitive) as "Improper Stop".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*FAILED.*STOP.*SIGN.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [196]:
# Relabel any description matching FAILURE ... STOP ... SIGN (case-insensitive) as "Improper Stop".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*FAILURE.*STOP.*SIGN.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [197]:
# Relabel any description matching FAIL ... STOP ... SIGN (case-insensitive; also covers
# FAILED/FAILURE and SIGNAL) as "Improper Stop". Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*FAIL.*STOP.*SIGN.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [198]:
# Relabel any description matching FAILURE ... STOP ... SIGNAL (case-insensitive) as "Improper Stop".
# NOTE(review): anything matching here also matches the ...STOP.*SIGN... patterns in the earlier
# cells, so when cells run top to bottom this step is expected to change nothing.
pattern = re.compile(r'.*FAILURE.*STOP.*SIGNAL.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [199]:
# Relabel any description matching FAILED ... STOP ... SIGNAL (case-insensitive) as "Improper Stop".
# NOTE(review): anything matching here also matches the ...STOP.*SIGN... patterns in the earlier
# cells, so when cells run top to bottom this step is expected to change nothing.
pattern = re.compile(r'.*FAILED.*STOP.*SIGNAL.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [200]:
# Relabel any description matching FAIL ... STOP ... SIGNAL (case-insensitive) as "Improper Stop".
# NOTE(review): anything matching here also matches the ...STOP.*SIGN... patterns in the earlier
# cells, so when cells run top to bottom this step is expected to change nothing.
pattern = re.compile(r'.*FAIL.*STOP.*SIGNAL.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Improper Stop', regex=True)

In [201]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

8767

In [202]:
# Relabel any description matching ELECTRONIC ... MSG (case-insensitive) as "Texting".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*ELECTRONIC.*MSG.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Texting', regex=True)

In [203]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

8464

### Function to replace the complete column value if a substring matches

In [204]:
# Function to replace a complete value if a substring matches
def data_cleanup(df, column_name, substring, replacement, regex=True):
    """Overwrite every value of ``column_name`` that contains ``substring``.

    Matching is case-insensitive. Missing values (NaN/None) never match and
    are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column_name : str
        Name of the string column to search and overwrite.
    substring : str
        Text to look for. With ``regex=True`` (the default, which keeps the
        previous behaviour) pandas interprets it as a regular expression;
        pass ``regex=False`` to match it literally.
    replacement : object
        Value written to every matching row, replacing the whole cell.
    regex : bool, default True
        Forwarded to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # na=False makes missing values count as "no match"; without it the mask
    # contains NaN and boolean indexing with .loc raises a ValueError.
    mask = df[column_name].str.contains(substring, case=False, na=False, regex=regex)
    df.loc[mask, column_name] = replacement
    return df

In [205]:
# Relabel every Description containing REGISTRATION as 'Registration Violation'
column_to_modify = 'Description'
substring_to_match = 'REGISTRATION'
replacement_value = 'Registration Violation'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [206]:
# Relabel every Description containing UNREGISTERED as 'Registration Violation'
column_to_modify = 'Description'
substring_to_match = 'UNREGISTERED'
replacement_value = 'Registration Violation'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [207]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

8012

In [208]:
# Relabel every Description containing SEATBELT as 'SeatBelt - Not restrained'
column_to_modify = 'Description'
substring_to_match = 'SEATBELT'
replacement_value = 'SeatBelt - Not restrained'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [224]:
# Relabel every Description containing SEAT BELT (two words) as 'SeatBelt - Not restrained'
column_to_modify = 'Description'
substring_to_match = 'SEAT BELT'
replacement_value = 'SeatBelt - Not restrained'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [225]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

6852

In [210]:
# Relabel every Description containing LICENSE as 'License Violation'
column_to_modify = 'Description'
substring_to_match = 'LICENSE'
replacement_value = 'License Violation'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [211]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

7493

In [218]:
# Relabel every Description containing SUSPENDED as 'Suspended License or Registration'
# NOTE(review): the REGISTRATION and LICENSE clean-up cells appear earlier in this notebook; if they
# ran first, descriptions mentioning those words were already relabelled and no longer contain
# "SUSPENDED" - confirm this precedence is intended.
column_to_modify = 'Description'
substring_to_match = 'SUSPENDED'
replacement_value = 'Suspended License or Registration'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [213]:
# Relabel every Description containing LANE as 'Lane Violation'
column_to_modify = 'Description'
substring_to_match = 'LANE'
replacement_value = 'Lane Violation'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [219]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

7062

In [221]:
# Relabel every Description containing UNINSURED as 'Uninsured Vehicle'
column_to_modify = 'Description'
substring_to_match = 'UNINSURED'
replacement_value = 'Uninsured Vehicle'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [222]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

6953

In [227]:
# Relabel every Description containing FAILURE TO STOP as 'Failure to stop at different circumstances'
column_to_modify = 'Description'
substring_to_match = 'FAILURE TO STOP'
replacement_value = 'Failure to stop at different circumstances'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [228]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

6692

In [231]:
# Relabel any description containing PHONE (case-insensitive) as "Using Phone".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*PHONE.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Using Phone', regex=True)

In [232]:
# Relabel any description containing TEXTMSG (case-insensitive) as "Using Phone".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*TEXTMSG.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Using Phone', regex=True)

In [233]:
# Relabel any description containing TEXT (case-insensitive) as "Using Phone".
# NOTE(review): because matching ignores case, this also matches the "Texting" label assigned by
# the ELECTRONIC ... MSG cell earlier, folding that category into "Using Phone" - confirm intended.
pattern = re.compile(r'.*TEXT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Using Phone', regex=True)

In [234]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

6432

In [235]:
# Relabel any description containing TINT (case-insensitive) as "Obstructed View".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*TINT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Obstructed View', regex=True)

In [236]:
# Relabel any description containing OBSTRUCTED (case-insensitive) as "Obstructed View".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*OBSTRUCTED.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Obstructed View', regex=True)

In [237]:
# Relabel any description containing RECKLESS (case-insensitive) as "Reckless Driving".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*RECKLESS.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Reckless Driving', regex=True)

In [238]:
# Relabel any description containing ACCIDENT (case-insensitive) as "Involved in Accident".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*ACCIDENT.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Involved in Accident', regex=True)

In [239]:
# Relabel any description matching UNATTENDED ... DAMAGED (case-insensitive) as "Involved in Accident".
# Vectorised str.replace; missing values are left untouched.
pattern = re.compile(r'.*UNATTENDED.*DAMAGED.*', flags=re.IGNORECASE)

big_df_smaller['Description'] = big_df_smaller['Description'].str.replace(pattern, 'Involved in Accident', regex=True)

In [240]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

5777

In [243]:
# Relabel every Description containing PRIVATE PROPERTY as 'Trespassing'
column_to_modify = 'Description'
substring_to_match = 'PRIVATE PROPERTY'
replacement_value = 'Trespassing'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [249]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

5617

In [248]:
# Relabel every Description containing ' TURN' as 'Unsafe Turns'.
# The leading blank is deliberate: it keeps descriptions containing 'RETURN' from matching.
column_to_modify = 'Description'
substring_to_match = ' TURN'
replacement_value = 'Unsafe Turns'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [251]:
# Relabel every Description containing '-TURN' as 'Unsafe Turns'.
column_to_modify = 'Description'
substring_to_match = '-TURN'
replacement_value = 'Unsafe Turns'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [253]:
# Fold every Description containing 'OFF ROADWAY' (e.g. 'DRIVING OFF ROADWAY WHILE PASSING VEHICLE'
# and 'DRIVING MOTOR VEHICLE OFF ROADWAY WHILE PASSING VEHICLE') into one label.
column_to_modify = 'Description'
substring_to_match = 'OFF ROADWAY'
replacement_value = 'DRIVING OFF ROADWAY WHILE PASSING VEHICLE'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [255]:
# Relabel every Description containing 'CROSSWALK' as 'Defy Crosswalk rules'.
column_to_modify = 'Description'
substring_to_match = 'CROSSWALK'
replacement_value = 'Defy Crosswalk rules'
big_df_smaller = data_cleanup(
    df=big_df_smaller,
    column_name=column_to_modify,
    substring=substring_to_match,
    replacement=replacement_value,
)

In [257]:
# Checkpoint: number of distinct non-null Description values at this point in the clean-up
big_df_smaller["Description"].nunique()

5586

In [256]:
# Frequency of each Description label, most common first, to spot what is still left to consolidate
big_df_smaller["Description"].value_counts()

Speeding                                                                                                301842
Registration Violation                                                                                  268130
License Violation                                                                                       198150
DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS                              148229
Improper Stop                                                                                           113981
Improper Equipment                                                                                      106854
DUI                                                                                                      53300
Using Phone                                                                                              51545
Lane Violation                                                                                           49110
S

In [258]:
# Write the processed data to a CSV
# Build the path from its parts so it resolves on every OS. The previous literal used a
# backslash separator (Windows-only) and relied on "\T" not being a recognised escape sequence.
processed_file_path = Path("data") / "Traffic_Violations_Processed.csv"
# index=False keeps the DataFrame index out of the output file
big_df_smaller.to_csv(processed_file_path, index=False)