In [1]:
import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from pprint import pprint
import random
from sklearn import preprocessing
from __future__ import division
import pickle
from datetime import datetime


filename = '../data/parking_citations_uncorrupted.csv'
random.seed(42)

In [2]:
# Get headers and data types
columns = []
with open(filename) as f:
    reader = csv.reader(f)
    columns = reader.next()
    
names = [col.replace(" ","_") for col in columns]

dtypes = {
    'Ticket_number': 'unicode' ,
    'Issue_Date': 'unicode' ,
    'Issue_Time': np.float64  ,
    'Meter_Id': 'unicode',
    'Marked_Time': np.float64 ,
    'RP_State_Plate': 'unicode',
    'Plate_Expiry_Date': 'unicode' ,
    'VIN': 'unicode' ,
    'Make': 'unicode' ,
    'Body Style': 'unicode' ,
    'Color': 'unicode' ,
    'Location': 'unicode' ,
    'Route': 'unicode' ,
    'Agency': 'unicode' ,
    'Violation_Code': 'unicode' ,
    'Violation_Description': 'unicode' ,
    'Fine_amount': np.float64 ,
    'Latitude': np.float64 ,
    'Longitude': np.float64 ,
}


In [4]:
#%%timeit -n 5 -r 1

# Load data from file
citations = pd.read_csv(
    filename ,
    header = 0 ,
    names = names ,
    dtype = dtypes ,
    #parse_dates = parse_dates ,
)

In [5]:
citations['Issue_Date'] = pd.to_datetime(citations['Issue_Date'])

citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna('')
citations['Plate_Expiry_Date'] = pd.to_datetime(citations['Plate_Expiry_Date'].str[:-2], format='%Y%M', errors='coerce')

In [6]:
%%timeit -n 5 -r 1

# Top 25 makes
top_25_makes = citations.groupby(['Make']).size().sort_values(ascending=False)[:25]

5 loops, best of 1: 202 ms per loop


In [13]:
%%timeit -n 5 -r 1

# Most common color for each make
most_common_color = pd.DataFrame(citations.groupby(['Make','Color']).size())#.sort_values(ascending = False)
most_common_color = most_common_color.reset_index(level='Make').groupby(['Make'])[0].idxmax()

5 loops, best of 1: 859 ms per loop


In [11]:
%%timeit -n 5 -r 1

# First ticket issued for each make
first_ticket_issued = citations.groupby(['Make'])['Issue_Date'].idxmin()
first_ticket_issued = citations.loc[first_ticket_issued.values][['Make','Ticket_number']]

5 loops, best of 1: 748 ms per loop


In [24]:
# is out of state license more likely to be expired than in-state?
citations = citations[citations['Plate_Expiry_Date'].notnull()]

oos_expired = citations[(citations['RP_State_Plate'] != 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
oos_total = citations[(citations['RP_State_Plate'] != 'CA')].shape[0]
oos_prb = oos_expired / oos_total
print(oos_prb)

is_expired = citations[(citations['RP_State_Plate'] == 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
is_total = citations[(citations['RP_State_Plate'] == 'CA')].shape[0]
is_prb = is_expired / is_total
print(is_prb)

0.574891225526
0.609997203871
CPU times: user 2.57 s, sys: 579 ms, total: 3.14 s
Wall time: 3.14 s


Out of state probability: 0.574891225526
In state probability: 0.609997203871
    
Conclusion:  Out of state plates are not more likely to be expired when ticketed than in-state plates.