In [1]:
import numpy as np
import pandas as pd

import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [83]:
## Load the gzip-compressed pickle holding the dataframe
## NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
df = pd.read_pickle(
    'dataset/dataframe.pickle',
    compression='gzip',
)

## Issue Date Time

In [85]:
## Tally tickets by the calendar year of their issue date
df['issue_date'].dt.year.value_counts()

2015.0    2181331
2016.0    1783038
2014.0      39406
2017.0      33936
2013.0         38
2012.0         30
2010.0         12
2011.0         10
Name: issue_date, dtype: int64

In [86]:
## Only 2015 and 2016 have complete data, so keep just those two years.
## A half-open range [2015-01-01, 2017-01-01) is used on purpose:
## Series.between is inclusive on both ends, so between('2015', '2017')
## would also keep rows dated exactly 2017-01-01.
## Rows with a missing issue_date compare False on both sides and are dropped.
in_complete_years = (df.issue_date >= '2015-01-01') & (df.issue_date < '2017-01-01')
## .copy() makes the result an independent frame, so the assignments made to
## df further down do not raise SettingWithCopyWarning.
df = df[in_complete_years].copy()

In [87]:
## Missing issue times are encoded with the sentinel -1; only a few rows have it
missing_issue_time = df.issue_time == -1
print('Number of nan Issue time: {}'.format(missing_issue_time.sum()))
## Replace -1 with the median time (the author notes the issue time looks
## pretty normal). The median is taken over the valid rows only, so the -1
## sentinels themselves cannot bias the imputed value.
df.loc[missing_issue_time, 'issue_time'] = df.loc[~missing_issue_time, 'issue_time'].median()

Number of nan Issue time: 844


In [90]:
## Summary statistics for issue_time, printed as whole numbers
issue_time_summary = df['issue_time'].describe()
with pd.option_context('display.float_format', '{:.0f}'.format):
    print(issue_time_summary)

count   3964369
mean       1230
std         469
min           0
25%         927
50%        1208
75%        1546
max        2359
Name: issue_time, dtype: float64


In [110]:
## Build one timestamp column out of the separate date and time columns
## issue_time is a float such as 2201.0: cast to int to drop the decimal,
## then to str and zero-fill on the left to four characters (HHMM)
time = df['issue_time'].astype('int').astype('str').str.zfill(4)
## Join as '<date> <HHMM>'
date_time = df['issue_date'].astype('str') + ' ' + time
## Parse the joined string and store it as a new column
df['issue_datetime'] = pd.to_datetime(date_time, format='%Y-%m-%d %H%M')

In [112]:
## Spot-check the first rows: combined timestamp next to its two source columns
check_columns = ['issue_datetime', 'issue_date', 'issue_time']
df[check_columns].head()

Unnamed: 0,issue_datetime,issue_date,issue_time
0,2015-12-30 22:01:00,2015-12-30,2201.0
1,2015-12-30 22:05:00,2015-12-30,2205.0
2,2015-12-30 17:25:00,2015-12-30,1725.0
3,2015-12-30 17:38:00,2015-12-30,1738.0
4,2015-12-30 18:07:00,2015-12-30,1807.0


## Fine Amount

In [122]:
## Inspect fine_amount before imputing: summary statistics (as whole numbers)
## and the number of rows carrying the -1 missing-value sentinel
fine_amount_summary = df['fine_amount'].describe()
with pd.option_context('display.float_format', '{:.0f}'.format):
    print(fine_amount_summary)
print('Number of nan: {}'.format(df['fine_amount'].eq(-1).sum()))

count   3964369
mean         70
std          32
min          -1
25%          63
50%          68
75%          73
max         505
Name: fine_amount, dtype: float64
Number of nan: 1532


In [124]:
## Fine looks a little skewed so impute with the median rather than the mean.
## The median is taken over the valid fines only, so the -1 sentinels being
## replaced cannot bias the imputed value.
missing_fine = df.fine_amount == -1
df.loc[missing_fine, 'fine_amount'] = df.loc[~missing_fine, 'fine_amount'].median()
## Re-count the sentinel to confirm none are left
print('Verify that there are no more nan: {}'.format((df.fine_amount == -1).sum()))

Verify that there are no more nan: 0


In [129]:
## After imputation a few fines turned out to be $10.
## That is incorrect: the minimum fine is $25, which is the fine
## for Display of tabs.
print('Fine amount less than $25: {}'.format((df['fine_amount'] < 25).sum()))
## Check that all of the low-fine rows are Display of tabs violations
low_fine_violations = df.loc[df['fine_amount'] < 25, 'violation_description']
print('Number of violations for Display of tabs: {}'.format(
    (low_fine_violations == 'DISPLAY OF TABS').sum()))
## lookup_v_fine.csv gives $25 for that violation, so raise the low values to it
df.loc[df['fine_amount'] < 25, 'fine_amount'] = 25
## Verify fine correction
print('Fine amount less than $25 after correction: {}'.format((df['fine_amount'] < 25).sum()))

Fine amount less than $25: 25
Number of violations for Display of tabs: 25
Fine amount less than $25 after correction: 0


In [132]:
## Shrink fine_amount to the smallest unsigned integer dtype that fits its values
df['fine_amount'] = pd.to_numeric(df['fine_amount'], downcast='unsigned')

## Drop columns

In [138]:
## Remove columns that aren't useful.
## issue_date and issue_time are dropped too: issue_datetime now carries both.
unused_columns = ['meter_id', 'marked_time',
                  'plate_expiry_date', 'vin',
                  'body_style', 'color',
                  'route', 'violation_code',
                  'latitude', 'longitude',
                  'issue_date', 'issue_time']
## Reassign instead of inplace=True: df comes from a boolean filter, where
## in-place mutation can raise SettingWithCopyWarning, and reassignment keeps
## the cell safe to re-run. errors='ignore' skips columns that are already gone.
df = df.drop(columns=unused_columns, errors='ignore')

In [139]:
## Confirm the remaining columns, their dtypes and the memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3964369 entries, 0 to 4004294
Data columns (total 8 columns):
ticket_number            uint64
rp_state_plate           object
make                     object
location                 object
agency                   category
violation_description    object
fine_amount              uint16
issue_datetime           datetime64[ns]
dtypes: category(1), datetime64[ns](1), object(4), uint16(1), uint64(1)
memory usage: 383.1+ MB


## Export dataframe to csv

In [142]:
## Write the cleaned frame to CSV; index=False leaves the row index out of the file
df.to_csv('dataset/la_citation_tableau.csv', index = False)