

**File descriptions** (Use only this data for training your model!)

    readonly/train.csv - the training set (all tickets issued 2004-2011)
    readonly/test.csv - the test set (all tickets issued 2012-2016)
    readonly/addresses.csv & readonly/latlons.csv - mapping from ticket id to addresses, and from addresses to lat/lon coordinates. 
     Note: misspelled addresses may be incorrectly geolocated.

<br>

**Data fields**

train.csv & test.csv

    ticket_id - unique identifier for tickets
    agency_name - Agency that issued the ticket
    inspector_name - Name of inspector that issued the ticket
    violator_name - Name of the person/organization that the ticket was issued to
    violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred
    mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator
    ticket_issued_date - Date and time the ticket was issued
    hearing_date - Date and time the violator's hearing was scheduled
    violation_code, violation_description - Type of violation
    disposition - Judgment and judgement type
    fine_amount - Violation fine amount, excluding fees
    admin_fee - $20 fee assigned to responsible judgments
state_fee - $10 fee assigned to responsible judgments
    late_fee - 10% fee assigned to responsible judgments
    discount_amount - discount applied, if any
    clean_up_cost - DPW clean-up or graffiti removal cost
    judgment_amount - Sum of all fines and fees
    grafitti_status - Flag for graffiti violations
    
train.csv only

    payment_amount - Amount paid, if any
    payment_date - Date payment was made, if it was received
    payment_status - Current payment status as of Feb 1 2017
    balance_due - Fines and fees still owed
    collection_status - Flag for payments in collections
    compliance [target variable for prediction] 
     Null = Not responsible
     0 = Responsible, non-compliant
     1 = Responsible, compliant
    compliance_detail - More information on why each ticket was marked compliant or non-compliant



In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


def blight_model():

    train = pd.read_csv("train.csv", encoding="cp1252")

    test = pd.read_csv("test.csv", encoding="cp1252")

    latlons = pd.read_csv('latlons.csv')
    addresses = pd.read_csv('addresses.csv')

    addresses_latlons = latlons.merge(addresses, how='inner', on='address')

    train = train.merge(addresses_latlons, how='left', on='ticket_id')
    test = test.merge(addresses_latlons, how='left', on='ticket_id')

    #Training

    train = train.dropna(subset=['compliance'])

    #columns that not in test dataset
    train_drop_list = ['payment_amount', 'payment_date','payment_status', 'balance_due', 'collection_status', 'compliance_detail']
    full_drop_list = ['violation_zip_code', 'non_us_str_code', 'grafitti_status', 'violation_zip_code', 'mailing_address_str_number', 'violator_name']

    train = train.drop(train_drop_list, axis=1)
    train = train.drop(full_drop_list, axis=1)

    test = test.drop(full_drop_list, axis=1)

    train_str = train.select_dtypes(include=[object])

    train_str = train_str.astype(str)

    train_str = train_str.fillna('')

    ohe = OneHotEncoder()

    #for x in train_str.columns:
    #    print(x, len(train_str[x].unique()))

    #agency_name 5
    #inspector_name 159
    #violation_street_name 1716
    #mailing_address_str_name 28441
    #city 4093
    #state 60
    #zip_code 3499
    #country 5
    #ticket_issued_date 68097
    #hearing_date 5971
    #violation_code 189
    #violation_description 207
    #disposition 4
    #address 71901

    list_dummies = ['agency_name','inspector_name','state', 'country', 'violation_code', 'violation_description', 'disposition']

    train_str_dummies = pd.get_dummies(train_str, columns=list_dummies)

    train_str_le = train_str.drop(list_dummies, axis=1)

    le = LabelEncoder()

    train_str_le = train_str_le.apply(le.fit_transform)

    train_df = train_str_dummies.drop(train_str_le.columns, axis=1).join(train_str_le)

    train_float = train.select_dtypes(include=['int64', 'float64'])

    #train_float.hist(figsize=(15,15))
    #plt.show()

    train_df = train_df.join(train_float)

    train_df.compliance = train_df.compliance.astype(int)

    train_df = train_df.fillna(train_df.mean())

    X = train_df.drop('compliance', axis=1)

    y = train_df.compliance




    #Testing prepocessing

    test_str = test.select_dtypes(include=[object])

    test_str = test_str.astype(str)

    test_str = test_str.fillna('')

    test_str_dummies = pd.get_dummies(test_str, columns=list_dummies)

    test_str_le = test_str.drop(list_dummies, axis=1)

    test_str_le = test_str_le.apply(le.fit_transform)

    test_df = test_str_dummies.drop(test_str_le.columns, axis=1).join(test_str_le)

    test_float = test.select_dtypes(include=['int64', 'float64'])

    test_df = test_df.join(test_float)

    test_df = test_df.fillna(test_df.mean())

    #Test and train compare

    union_columns = (np.intersect1d(train_df.columns, test_df.columns))

    train_df = train_df.loc[:,np.insert(union_columns, [1], ['compliance'])]

    test_df = test_df.loc[:, union_columns]

    len(test_df.columns)

    len(train_df.columns)

    #Training

    X = train_df.drop('compliance', axis=1)
    y = train_df.compliance

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    clf = RandomForestClassifier(max_depth=5, n_estimators=30)

    clf.fit(X_train, y_train)
    
    #testing

    test['proba'] = clf.predict_proba(test_df)[:,1]


    return test.set_index('ticket_id')['proba']

In [5]:
blight_model()

  if (await self.run_code(code, result,  async_=asy)):


ticket_id
284932    0.129604
285362    0.090694
285361    0.112928
285338    0.079665
285346    0.086922
            ...   
376496    0.053379
376497    0.053379
376499    0.080556
376500    0.084053
369851    0.224448
Name: proba, Length: 61001, dtype: float64