In [1]:
import pandas as pd
import numpy as np

In [2]:
# loading in the data
training_data = pd.read_csv('readonly/train.csv', encoding='latin1')
testing_data = pd.read_csv('readonly/test.csv')
address_data = pd.read_csv('readonly/addresses.csv')
latlon_data = pd.read_csv('readonly/latlons.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# removing variables unavailable in test set
unavailable_vars = ['payment_amount', 'payment_date', 'payment_status', 
                   'balance_due', 'collection_status', 'compliance_detail']
training_data.drop(unavailable_vars, inplace=True, axis=1)

In [4]:
# examining percentage of null occurrences in each variable
training_data.isnull().sum()/len(training_data)

ticket_id                     0.000000
agency_name                   0.000000
inspector_name                0.000000
violator_name                 0.000136
violation_street_number       0.000000
violation_street_name         0.000000
violation_zip_code            1.000000
mailing_address_str_number    0.014390
mailing_address_str_name      0.000016
city                          0.000000
state                         0.000372
zip_code                      0.000004
non_us_str_code               0.999988
country                       0.000000
ticket_issued_date            0.000000
hearing_date                  0.049903
violation_code                0.000000
violation_description         0.000000
disposition                   0.000000
fine_amount                   0.000004
admin_fee                     0.000000
state_fee                     0.000000
late_fee                      0.000000
discount_amount               0.000000
clean_up_cost                 0.000000
judgment_amount          

In [5]:
# removing null dominant variables
training_data.drop(['violation_zip_code', 'grafitti_status', 'non_us_str_code'], 
                   inplace=True, axis=1)
testing_data.drop(['violation_zip_code', 'grafitti_status', 'non_us_str_code'], 
                   inplace=True, axis=1)

In [6]:
# removing rows where compliance is null
training_data = training_data.loc[~training_data['compliance'].isnull()]

In [7]:
# assuming all data is from Detroit and thus removing unnecessary variables
# while noting that some relevant address info may be capture in lat/lon data
training_data.drop(['city', 'state', 'zip_code', 'country'],
                   inplace=True, axis=1)
testing_data.drop(['city', 'state', 'zip_code', 'country'],
                   inplace=True, axis=1)

In [8]:
# adding lat and lon variables by merging while removing address variable
addlanlon_data = address_data.merge(latlon_data, how='inner', on='address')
addlanlon_data.drop('address', inplace=True, axis=1)
training_data = training_data.merge(addlanlon_data, how='inner', on='ticket_id')
testing_data = testing_data.merge(addlanlon_data, how='inner', on='ticket_id')

In [9]:
# creating new binomial variable that indicates whether the mailing address and
# violation address street name are the same 
# then removing unnecessary variables
training_data['violadd_equal_mailadd'] = training_data['violation_street_name']==training_data['mailing_address_str_name']
training_data['violadd_equal_mailadd'] = training_data['violadd_equal_mailadd'].astype('uint8')
training_data.drop(['violation_street_number', 'violation_street_name',
                   'mailing_address_str_number', 'mailing_address_str_name'],
                  inplace=True, axis=1)

testing_data['violadd_equal_mailadd'] = testing_data['violation_street_name']==testing_data['mailing_address_str_name']
testing_data['violadd_equal_mailadd'] = testing_data['violadd_equal_mailadd'].astype('uint8')
testing_data.drop(['violation_street_number', 'violation_street_name',
                   'mailing_address_str_number', 'mailing_address_str_name'],
                  inplace=True, axis=1)

In [10]:
# looking at data types, with some quick examination of non-numeric variables
training_data.dtypes

ticket_id                  int64
agency_name               object
inspector_name            object
violator_name             object
ticket_issued_date        object
hearing_date              object
violation_code            object
violation_description     object
disposition               object
fine_amount              float64
admin_fee                float64
state_fee                float64
late_fee                 float64
discount_amount          float64
clean_up_cost            float64
judgment_amount          float64
compliance               float64
lat                      float64
lon                      float64
violadd_equal_mailadd      uint8
dtype: object

In [11]:
# I see that violation description can be removed whilst date variables must be
# converted to a datetime format and can be merged to one variable
# note that there are NaN hearing dates so I have replaced these day count
# differences with the mean day difference of the whole dataset
training_data.drop('violation_description', inplace=True, axis=1)
training_data['ticket_issued_date'] = pd.to_datetime(training_data['ticket_issued_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')
training_data['hearing_date'] = pd.to_datetime(training_data['hearing_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')

x = (training_data['hearing_date']-training_data['ticket_issued_date']).dt.days
xmean = x[x>0].mean()
x[x.isnull()] = xmean
training_data['days_from_issue_to_hearing'] = x
training_data.drop(['hearing_date'], inplace=True, axis=1)

testing_data.drop('violation_description', inplace=True, axis=1)
testing_data['ticket_issued_date'] = pd.to_datetime(testing_data['ticket_issued_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')
testing_data['hearing_date'] = pd.to_datetime(testing_data['hearing_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')

x = (testing_data['hearing_date']-testing_data['ticket_issued_date']).dt.days
xmean = x[x>0].mean()
x[x.isnull()] = xmean
testing_data['days_from_issue_to_hearing'] = x
testing_data.drop(['hearing_date'], inplace=True, axis=1)

  self._update_inplace(new_data)


In [12]:
# dealing with more variables I presume inspector_name can be removed and I look
# to create new variable from violator_name which indicates whether the violator
# has a previous record or is a first time offender
training_data['prev_offence'] = np.nan
training_data = training_data.sort_values(by='ticket_issued_date')

for i in range(0, len(training_data)):
    name = training_data.iloc[i]['violator_name']
    all_names = list(training_data.iloc[:i]['violator_name'])
    if name in all_names :
        training_data.iloc[i, training_data.columns.get_loc('prev_offence')] = 1
    else :
        training_data.iloc[i, training_data.columns.get_loc('prev_offence')] = 0
        
testing_data['prev_offence'] = np.nan
testing_data = testing_data.sort_values(by='ticket_issued_date')
training_names = training_data['violator_name'].unique()

for i in range(0, len(testing_data)):
    name = testing_data.iloc[i]['violator_name']
    all_names = list(testing_data.iloc[:i]['violator_name'].unique())
    if name in all_names :
        testing_data.iloc[i, testing_data.columns.get_loc('prev_offence')] = 1
    elif name in training_names :
        testing_data.iloc[i, testing_data.columns.get_loc('prev_offence')] = 1
    else :
        testing_data.iloc[i, testing_data.columns.get_loc('prev_offence')] = 0

In [13]:
# saving the dataset for quicker loading
training_data.to_csv('training_data2.csv')
testing_data.to_csv('testing_data2.csv')

In [14]:
training_data2 = pd.read_csv('training_data2.csv')
testing_data2 = pd.read_csv('testing_data2.csv')

In [15]:
# dropping unnecessary variables
training_data2.drop(['inspector_name', 'violator_name', 'ticket_issued_date'], 
                    inplace=True, axis=1)
testing_data2.drop(['inspector_name', 'violator_name', 'ticket_issued_date'], 
                    inplace=True, axis=1)

In [16]:
# clean_up_cost, state_fee and admin_fee all contain the same values so can be 
# removed
training_data2.drop(['clean_up_cost', 'admin_fee', 'state_fee', 'Unnamed: 0'], 
                    inplace=True, axis=1)
training_data2.set_index('ticket_id', inplace=True)
testing_data2.drop(['clean_up_cost', 'admin_fee', 'state_fee', 'Unnamed: 0'], 
                    inplace=True, axis=1)
testing_data2.set_index('ticket_id', inplace=True)

In [17]:
# disposition and agency_name variables are categorical so can be converted
# into sets of binomial variables respectively
training_data2 = pd.get_dummies(training_data2, columns = ['agency_name', 'disposition'])
testing_data2 = pd.get_dummies(testing_data2, columns = ['agency_name', 'disposition'])
training_data2['disposition_Responsible - Compl/Adj by Default'] = 0
training_data2['disposition_Responsible - Compl/Adj by Determi'] = 0

In [18]:
# violation_code is a variable of interest since the type of violation could 
# absolutely be a strong indicator of compliance. However, with too many unique
# values and the potential for new violation codes to be created and appear, the
# variable will have to be ignored
training_data2.drop('violation_code', inplace=True, axis=1)
testing_data2.drop('violation_code', inplace=True, axis=1)

In [19]:
# removing judgment_amount as this information is captured in the other 
# variables concerning fees
training_data2.drop('judgment_amount', inplace=True, axis=1)
testing_data2.drop('judgment_amount', inplace=True, axis=1)

In [20]:
# looking at variable correlations with compliance there doesn't seem to be any 
# issues
training_data2.corr()['compliance']

fine_amount                                                  -0.049134
late_fee                                                     -0.085055
discount_amount                                               0.156073
compliance                                                    1.000000
lat                                                          -0.021569
lon                                                          -0.000431
violadd_equal_mailadd                                         0.008933
days_from_issue_to_hearing                                   -0.004666
prev_offence                                                 -0.093310
agency_name_Buildings, Safety Engineering & Env Department   -0.055637
agency_name_Department of Public Works                        0.046939
agency_name_Detroit Police Department                         0.038672
agency_name_Health Department                                -0.005559
agency_name_Neighborhood City Halls                          -0.000699
dispos

In [21]:
# checking for nans one last time shows the presence of some in the lat and lon 
# variables.
training_data2.isnull().sum()/len(training_data2)

fine_amount                                                   0.000000
late_fee                                                      0.000000
discount_amount                                               0.000000
compliance                                                    0.000000
lat                                                           0.000013
lon                                                           0.000013
violadd_equal_mailadd                                         0.000000
days_from_issue_to_hearing                                    0.000000
prev_offence                                                  0.000000
agency_name_Buildings, Safety Engineering & Env Department    0.000000
agency_name_Department of Public Works                        0.000000
agency_name_Detroit Police Department                         0.000000
agency_name_Health Department                                 0.000000
agency_name_Neighborhood City Halls                           0.000000
dispos

In [22]:
# replacing nans with the mean
latmean = np.mean(training_data2['lat'])
lonmean = np.mean(training_data2['lon'])
nanlatlon = list(training_data2[training_data2['lat'].isnull()].index)
for i in nanlatlon:
    training_data2.loc[i, 'lat'] = latmean
    training_data2.loc[i, 'lon'] = lonmean

nanlatlon = list(testing_data2[testing_data2['lat'].isnull()].index)
for i in nanlatlon:
    testing_data2.loc[i, 'lat'] = latmean
    testing_data2.loc[i, 'lon'] = lonmean

In [23]:
# now for the prediction models. First I will split the training set into 
# train and test sets
from sklearn.model_selection import train_test_split

traindata = training_data2.drop('compliance', axis=1)
traincompliance = training_data2['compliance']

X_train, X_test, y_train, y_test = train_test_split(traindata, traincompliance,
                                                    random_state=0)

In [58]:
# first trying a Naive Bayes model
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nbclf = GaussianNB().fit(X_train, y_train)
cross_val_score(nbclf, X_test, y_test, cv=5, scoring = 'roc_auc')

array([ 0.76281538,  0.77621997,  0.77689392,  0.77946438,  0.78377937])

In [59]:
# next trying random forests model
from sklearn.ensemble import RandomForestRegressor

rfclf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
cross_val_score(rfclf, X_test, y_test, cv=5, scoring = 'roc_auc')

array([ 0.7497346 ,  0.74796527,  0.7386544 ,  0.75410886,  0.74572621])

In [108]:
# next trying a gradient boosted decision tree model
from sklearn.ensemble import GradientBoostingRegressor

gbclf = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
cross_val_score(gbclf, X_test, y_test, cv=5, scoring = 'roc_auc')

array([ 0.80726241,  0.80475678,  0.81073189,  0.81416076,  0.82483667])

In [109]:
# using grid_values to find good parameters
from sklearn.model_selection import GridSearchCV
grid_values = {'n_estimators': [50,75,100], 
               'learning_rate': [0.01, 0.1, 0.3, 0.4]}
grid_gbclf = GridSearchCV(gbclf, param_grid = grid_values, scoring='roc_auc')
grid_gbclf.fit(X_train, y_train)
print(grid_gbclf.best_params_)
print(grid_gbclf.best_score_)

{'learning_rate': 0.3, 'n_estimators': 50}
0.820923386309


In [13]:
gbclf = GradientBoostingRegressor(random_state=0,
                                  n_estimators=50,
                                  learning_rate=0.3).fit(X_train, y_train)
cross_val_score(gbclf, X_test, y_test, cv=5, scoring = 'roc_auc')

NameError: name 'GradientBoostingRegressor' is not defined

In [114]:
grid_values = {'max_depth': [3,4,5]}
grid_gbclf = GridSearchCV(gbclf, param_grid = grid_values, scoring='roc_auc')
grid_gbclf.fit(X_train, y_train)
print(grid_gbclf.best_params_)
print(grid_gbclf.best_score_)

{'max_depth': 5}
0.82173473978


In [24]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
gbclf = GradientBoostingRegressor(random_state=0,
                                  n_estimators=50,
                                  learning_rate=0.3,
                                 max_depth=5).fit(X_train, y_train)
cross_val_score(gbclf, X_test, y_test, cv=5, scoring = 'roc_auc')

array([ 0.80116443,  0.80810559,  0.80593239,  0.80971667,  0.80993976])

In [25]:
y_preds = gbclf.predict(testing_data2)
y_preds = pd.Series(y_preds, index=list(testing_data2.index))
y_preds = y_preds.rename_axis('ticket_id')
y_preds.head()

ticket_id
284932    0.168202
285343    0.144434
285344    0.365637
285362    0.120585
285342    0.880129
dtype: float64

In [118]:
# due to the function timing out in the auto-grader, the process needs speeding
# up and as prev_offence is the slowest process it should be removed

def blight_model():
    
    training_data = pd.read_csv('train.csv', encoding='latin1')
    testing_data = pd.read_csv('test.csv')
    address_data = pd.read_csv('addresses.csv')
    latlon_data = pd.read_csv('latlons.csv')
    
    unavailable_vars = ['payment_amount', 'payment_date', 'payment_status', 
                   'balance_due', 'collection_status', 'compliance_detail']
    training_data.drop(unavailable_vars, inplace=True, axis=1)
    
    training_data.drop(['violation_zip_code', 'grafitti_status', 'non_us_str_code',
                       'city', 'state', 'zip_code', 'country'], 
                   inplace=True, axis=1)
    testing_data.drop(['violation_zip_code', 'grafitti_status', 'non_us_str_code',
                      'city', 'state', 'zip_code', 'country'], 
                   inplace=True, axis=1)
    
    training_data = training_data.loc[~training_data['compliance'].isnull()]
    
    addlanlon_data = address_data.merge(latlon_data, how='inner', on='address')
    addlanlon_data.drop('address', inplace=True, axis=1)
    training_data = training_data.merge(addlanlon_data, how='inner', on='ticket_id')
    testing_data = testing_data.merge(addlanlon_data, how='inner', on='ticket_id')
    
    training_data['violadd_equal_mailadd'] = training_data['violation_street_name']==training_data['mailing_address_str_name']
    training_data['violadd_equal_mailadd'] = training_data['violadd_equal_mailadd'].astype('uint8')
    training_data.drop(['violation_street_number', 'violation_street_name',
                   'mailing_address_str_number', 'mailing_address_str_name'],
                  inplace=True, axis=1)

    testing_data['violadd_equal_mailadd'] = testing_data['violation_street_name']==testing_data['mailing_address_str_name']
    testing_data['violadd_equal_mailadd'] = testing_data['violadd_equal_mailadd'].astype('uint8')
    testing_data.drop(['violation_street_number', 'violation_street_name',
                   'mailing_address_str_number', 'mailing_address_str_name'],
                  inplace=True, axis=1)
    
    training_data.drop('violation_description', inplace=True, axis=1)
    training_data['ticket_issued_date'] = pd.to_datetime(training_data['ticket_issued_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')
    training_data['hearing_date'] = pd.to_datetime(training_data['hearing_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')

    x = (training_data['hearing_date']-training_data['ticket_issued_date']).dt.days
    xmean = x[x>0].mean()
    x[x.isnull()] = xmean
    training_data['days_from_issue_to_hearing'] = x
    training_data.drop(['hearing_date'], inplace=True, axis=1)

    testing_data.drop('violation_description', inplace=True, axis=1)
    testing_data['ticket_issued_date'] = pd.to_datetime(testing_data['ticket_issued_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')
    testing_data['hearing_date'] = pd.to_datetime(testing_data['hearing_date'] , 
                                                     format='%Y/%m/%d %H:%M:%S')

    x = (testing_data['hearing_date']-testing_data['ticket_issued_date']).dt.days
    xmean = x[x>0].mean()
    x[x.isnull()] = xmean
    testing_data['days_from_issue_to_hearing'] = x
    testing_data.drop(['hearing_date'], inplace=True, axis=1)
    
    training_data.drop(['inspector_name', 'violator_name', 'ticket_issued_date'], 
                    inplace=True, axis=1)
    testing_data.drop(['inspector_name', 'violator_name', 'ticket_issued_date'], 
                    inplace=True, axis=1)

    training_data.drop(['clean_up_cost', 'admin_fee', 'state_fee'], 
                    inplace=True, axis=1)
    training_data.set_index('ticket_id', inplace=True)
    testing_data.drop(['clean_up_cost', 'admin_fee', 'state_fee'], 
                    inplace=True, axis=1)
    testing_data.set_index('ticket_id', inplace=True)
    
    training_data = pd.get_dummies(training_data, columns = ['agency_name', 'disposition'])
    testing_data = pd.get_dummies(testing_data, columns = ['agency_name', 'disposition'])
    training_data['disposition_Responsible - Compl/Adj by Default'] = 0
    training_data['disposition_Responsible - Compl/Adj by Determi'] = 0
    
    training_data.drop(['violation_code', 'judgment_amount'], inplace=True, axis=1)
    testing_data.drop(['violation_code', 'judgment_amount'], inplace=True, axis=1)
    
    latmean = np.mean(training_data['lat'])
    lonmean = np.mean(training_data['lon'])
    nanlatlon = list(training_data[training_data['lat'].isnull()].index)
    for i in nanlatlon:
        training_data.loc[i, 'lat'] = latmean
        training_data.loc[i, 'lon'] = lonmean

    nanlatlon = list(testing_data[testing_data['lat'].isnull()].index)
    for i in nanlatlon:
        testing_data.loc[i, 'lat'] = latmean
        testing_data.loc[i, 'lon'] = lonmean
        
    from sklearn.model_selection import train_test_split

    traindata = training_data.drop('compliance', axis=1)
    traincompliance = training_data['compliance']
    
    from sklearn.ensemble import GradientBoostingRegressor

    X_train, X_test, y_train, y_test = train_test_split(traindata, traincompliance,
                                                    random_state=0)
    gbclf = GradientBoostingRegressor(random_state=0,
                                  n_estimators=50,
                                  learning_rate=0.3,
                                  max_depth=5).fit(X_train, y_train)
    
    y_preds = gbclf.predict(testing_data)
    y_preds = pd.Series(y_preds, index=list(testing_data.index))
    y_preds = y_preds.rename_axis('ticket_id')
    
    return y_preds

In [None]:
# Your AUC of 0.782910006428 was awarded a value of 1.0 out of 1.0 total grades
blight_model()