In [1]:
import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from pprint import pprint
import random
from sklearn import preprocessing
from __future__ import division
import pickle
from datetime import datetime


filename = '../data/parking_citations_uncorrupted.csv'
random.seed(42)

In [2]:
# Get headers and data types
columns = []
with open(filename) as f:
    reader = csv.reader(f)
    columns = reader.next()
    
names = [col.replace(" ","_") for col in columns]

dtypes = {
    'Ticket_number': 'unicode' ,
    'Issue_Date': 'unicode' ,
    'Issue_Time': np.float64  ,
    'Meter_Id': 'unicode',
    'Marked_Time': np.float64 ,
    'RP_State_Plate': 'unicode',
    'Plate_Expiry_Date': 'unicode' ,
    'VIN': 'unicode' ,
    'Make': 'unicode' ,
    'Body Style': 'unicode' ,
    'Color': 'unicode' ,
    'Location': 'unicode' ,
    'Route': 'unicode' ,
    'Agency': 'unicode' ,
    'Violation_Code': 'unicode' ,
    'Violation_Description': 'unicode' ,
    'Fine_amount': np.float64 ,
    'Latitude': np.float64 ,
    'Longitude': np.float64 ,
}


In [3]:
%%time

# Load data from file
citations = pd.read_csv(
    filename ,
    header = 0 ,
    names = names ,
    dtype = dtypes ,
    #parse_dates = parse_dates ,
)

CPU times: user 12.4 s, sys: 1.31 s, total: 13.7 s
Wall time: 17.6 s


In [4]:
citations['Issue_Date'] = pd.to_datetime(citations['Issue_Date'])

citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna('')
citations['Plate_Expiry_Date'] = pd.to_datetime(citations['Plate_Expiry_Date'].str[:-2], format='%Y%M', errors='coerce')

0         2003-01-01 00:16:00
1         2016-01-01 00:05:00
2         2015-01-01 00:11:00
3         2017-01-01 00:01:00
4         2016-01-01 00:05:00
5                         NaT
6         2016-01-01 00:06:00
7         2015-01-01 00:09:00
8         2015-01-01 00:11:00
9         2016-01-01 00:10:00
10        2016-01-01 00:01:00
11        2016-01-01 00:01:00
12        2015-01-01 00:10:00
13        2015-01-01 00:06:00
14        2015-01-01 00:11:00
15        2015-01-01 00:12:00
16        2016-01-01 00:01:00
17                        NaT
18        2015-01-01 00:08:00
19        2016-01-01 00:02:00
20        2016-01-01 00:06:00
21        2015-01-01 00:06:00
22        2016-01-01 00:01:00
23        2015-01-01 00:08:00
24        2015-01-01 00:04:00
25        2016-01-01 00:08:00
26        2016-01-01 00:01:00
27        2015-01-01 00:05:00
28        2016-01-01 00:05:00
29        2015-01-01 00:09:00
                  ...        
4357505   2019-01-01 00:05:00
4357506   2019-01-01 00:04:00
4357507   

In [100]:
citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].apply(lambda x: str(x)[:-2] if len(x) == 8 else np.nan)

def convert_date_string(string):
    if type(string) != str:
        return np.nan
    else:
        date = datetime.strptime(string, '%Y%M').replace(hour=0, minute=0)

    return date

citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].apply(lambda x: convert_date_string(x))


In [11]:
%%time

# Top 25 makes
top_25_makes = citations.groupby(['Make']).size().sort_values(ascending=False)[:25]

CPU times: user 202 ms, sys: 15.9 ms, total: 218 ms
Wall time: 217 ms


In [118]:
%%time

# Most common color for each make
most_common_color = pd.DataFrame(citations.groupby(['Make','Color']).size())#.sort_values(ascending = False)
most_common_color = most_common_color.reset_index(level='Make').groupby(['Make'])[0].idxmax()


CPU times: user 828 ms, sys: 39.9 ms, total: 868 ms
Wall time: 868 ms


In [33]:
%%time

# First ticket issued for each make
first_ticket_issued = citations.groupby(['Make'])['Issue_Date'].idxmin()
first_ticket_issued = citations.loc[first_ticket_issued.values][['Make','Ticket_number']]

CPU times: user 631 ms, sys: 136 ms, total: 767 ms
Wall time: 766 ms


In [24]:
%%time 

# is out of state license more likely to be expired than in-state?
citations = citations[citations['Plate_Expiry_Date'].notnull()]

oos_expired = citations[(citations['RP_State_Plate'] != 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
oos_total = citations[(citations['RP_State_Plate'] != 'CA')].shape[0]
oos_prb = oos_expired / oos_total
print(oos_prb)

is_expired = citations[(citations['RP_State_Plate'] == 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
is_total = citations[(citations['RP_State_Plate'] == 'CA')].shape[0]
is_prb = is_expired / is_total
print(is_prb)

0.574891225526
0.609997203871
CPU times: user 2.57 s, sys: 579 ms, total: 3.14 s
Wall time: 3.14 s


Out of state probability: 0.574891225526
In state probability: 0.609997203871
    
Conclusion:  Out of state plates are not more likely to be expired when ticketed than in-state plates.