In [1]:
import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from pprint import pprint
import random
from sklearn import preprocessing
from __future__ import division
import pickle
from datetime import datetime


filename = '../data/parking_citations_uncorrupted.csv'
random.seed(42)

In [2]:
# Get headers and data types
columns = []
with open(filename) as f:
    reader = csv.reader(f)
    columns = reader.next()
    
names = [col.replace(" ","_") for col in columns]

dtypes = {
    'Ticket_number': 'unicode' ,
    'Issue_Date': 'unicode' ,
    'Issue_Time': np.float64  ,
    'Meter_Id': 'unicode',
    'Marked_Time': np.float64 ,
    'RP_State_Plate': 'unicode',
    'Plate_Expiry_Date': 'unicode' ,
    'VIN': 'unicode' ,
    'Make': 'unicode' ,
    'Body Style': 'unicode' ,
    'Color': 'unicode' ,
    'Location': 'unicode' ,
    'Route': 'unicode' ,
    'Agency': 'unicode' ,
    'Violation_Code': 'unicode' ,
    'Violation_Description': 'unicode' ,
    'Fine_amount': np.float64 ,
    'Latitude': np.float64 ,
    'Longitude': np.float64 ,
}


In [3]:
#%%timeit -n 5 -r 1

# Load data from file
citations = pd.read_csv(
    filename ,
    header = 0 ,
    names = names ,
    dtype = dtypes ,
    #parse_dates = parse_dates ,
)

In [9]:
# check issue parsing to make sure it's accurate
print(citations['Issue_Date']).isnull().sum() 

citations['Issue_Date_Parsed'] = pd.to_datetime(citations['Issue_Date'])

print(citations[['Issue_Date','Issue_Date_Parsed']].dtypes)
citations[['Issue_Date','Issue_Date_Parsed']].head()
citations[['Issue_Date','Issue_Date_Parsed']].describe(include='all')

0
Issue_Date                   object
Issue_Date_Parsed    datetime64[ns]
dtype: object


Unnamed: 0,Issue_Date,Issue_Date_Parsed
count,4357535,4357535
unique,1724,1724
top,2016-01-19T00:00:00,2016-01-19 00:00:00
freq,4998,4998
first,,2010-01-09 00:00:00
last,,2019-01-10 00:00:00


In [16]:
# check expiry date parsing to make sure it's accurate
citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna('')
citations['Plate_Expiry_Date_Parsed'] = pd.to_datetime(citations['Plate_Expiry_Date'].str[:-2], format='%Y%m', errors='coerce')
citations['expired'] = citations['Plate_Expiry_Date_Parsed'] < citations['Issue_Date_Parsed']

#citations[['Plate_Expiry_Date']][citations['Plate_Expiry_Date'].notnull()].head()
citations[['Plate_Expiry_Date','Issue_Date_Parsed', 'Plate_Expiry_Date_Parsed','expired']][citations['Plate_Expiry_Date'].notnull()].head()

Unnamed: 0,Plate_Expiry_Date,Issue_Date_Parsed,Plate_Expiry_Date_Parsed,expired
0,200316.0,2015-09-15,NaT,False
1,201605.0,2015-12-17,2016-05-01,False
2,201511.0,2015-12-22,2015-11-01,True
3,201701.0,2015-12-22,2017-01-01,False
4,201605.0,2015-12-27,2016-05-01,False


In [17]:
# store original dates for comparison
citations['Issue_Date_Orig'] = citations['Issue_Date']
citations['Plate_Expiry_Date_Orig'] = citations['Plate_Expiry_Date']

# parse dates
citations['Issue_Date'] = pd.to_datetime(citations['Issue_Date'])

citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna('')
citations['Plate_Expiry_Date'] = pd.to_datetime(citations['Plate_Expiry_Date'].str[:-2], format='%Y%m', errors='coerce')

In [6]:
%%timeit -n 5 -r 1

# Top 25 makes
top_25_makes = citations.groupby(['Make']).size().sort_values(ascending=False)[:25]

5 loops, best of 1: 202 ms per loop


In [13]:
%%timeit -n 5 -r 1

# Most common color for each make
most_common_color = pd.DataFrame(citations.groupby(['Make','Color']).size())#.sort_values(ascending = False)
most_common_color = most_common_color.reset_index(level='Make').groupby(['Make'])[0].idxmax()

5 loops, best of 1: 859 ms per loop


In [11]:
%%timeit -n 5 -r 1

# First ticket issued for each make
first_ticket_issued = citations.groupby(['Make'])['Issue_Date'].idxmin()
first_ticket_issued = citations.loc[first_ticket_issued.values][['Make','Ticket_number']]

5 loops, best of 1: 748 ms per loop


In [25]:
# check nulls of plate expiry date
print citations['Plate_Expiry_Date'].isnull().sum()
print citations['Plate_Expiry_Date_Parsed'].isnull().sum()

citations[['Plate_Expiry_Date','Plate_Expiry_Date_Orig','Issue_Date','expired']][citations['Plate_Expiry_Date'].isnull()].head()

693162
693162


Unnamed: 0,Plate_Expiry_Date,Plate_Expiry_Date_Orig,Issue_Date,expired
0,NaT,200316.0,2015-09-15,False
5,NaT,,2015-09-16,False
17,NaT,,2015-12-26,False
82,NaT,,2015-12-21,False
86,NaT,,2015-12-15,False


In [31]:
# is out of state license more likely to be expired than in-state?
citations = citations[citations['Plate_Expiry_Date'].notnull()]

# capture overall probability of expiry for testing
expired = citations[citations['Plate_Expiry_Date'] < citations['Issue_Date']].shape[0]
total = citations.shape[0]
prb = expired / total
print(prb)

oos_expired = citations[(citations['RP_State_Plate'] != 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
oos_total = citations[(citations['RP_State_Plate'] != 'CA')].shape[0]
oos_prb = oos_expired / oos_total
print(oos_prb)

is_expired = citations[(citations['RP_State_Plate'] == 'CA') & (citations['Plate_Expiry_Date'] < citations['Issue_Date'])].shape[0]
is_total = citations[(citations['RP_State_Plate'] == 'CA')].shape[0]
is_prb = is_expired / is_total
print(is_prb)

0.220682501481
0.234497372744
0.219798483794


Out of state probability: 0.234497372744
In state probability: 0.219798483794

In [66]:
# test whether probabilities are different
# TS = (oos_prb - is_prb) / sqrt( (prb)(1-prb)*( (1/oos_total) + (1/is_total) ) )

from math import sqrt
import scipy.stats as st

numerator = (oos_prb - is_prb)
denominator = sqrt(((prb)*(1-prb)) * ((1/oos_total)+(1/is_total)))
z_stat = numerator / denominator
print(z_stat)

# calculate z-stat
p_stat = st.norm.cdf(z_stat)
print(1-p_stat)

16.1310475927
0.0
