# [Reference Solution 4 for Assignment 4](https://github.com/Abdul-Rafae-Mohammed/Applied-Machine-Learning-for-Blight-Violation-Data-using-Python---------------/blob/master/Machine_learning_blight_violation_data.py)

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [24]:
# load dataset

df_train = pd.read_csv("train.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')
df_final = pd.read_csv("test.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')

df_train.shape, df_final.shape

((250306, 33), (61001, 26))

In [25]:
df_train.head(5).T

ticket_id,22056,27586,22062,22084,22093
agency_name,"Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department"
inspector_name,"Sims, Martinzie","Williams, Darrin","Sims, Martinzie","Sims, Martinzie","Sims, Martinzie"
violator_name,"INVESTMENT INC., MIDWEST MORTGAGE","Michigan, Covenant House","SANDERS, DERRON","MOROSI, MIKE","NATHANIEL, NEAL"
violation_street_number,2900,4311,1449,1441,2449
violation_street_name,TYLER,CENTRAL,LONGFELLOW,LONGFELLOW,CHURCHILL
violation_zip_code,,,,,
mailing_address_str_number,3,2959,23658,5,7449
mailing_address_str_name,S. WICKER,Martin Luther King,P.O. BOX,ST. CLAIR,CHURCHILL
city,CHICAGO,Detroit,DETROIT,DETROIT,DETROIT
state,IL,MI,MI,MI,MI


In [32]:
df_train.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'payment_amount', 'balance_due',
       'payment_date', 'payment_status', 'collection_status',
       'grafitti_status', 'compliance_detail', 'compliance'],
      dtype='object')

In [31]:
df_final.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status'],
      dtype='object')

In [14]:
# extract compliance column as label of the model and fill in NaN w/ 0 (non-compliance)
df_label = df_train['compliance'].fillna(0)

df_label.head(5)

ticket_id
22056    0.0
27586    1.0
22062    0.0
22084    0.0
22093    0.0
Name: compliance, dtype: float64

In [33]:
# select attributes for training dataset
# remaining attributes:
#    violation_street_number, violation_zip_code, zip_code, disposition, fine_amount,
#    admin_fee, state_fee, late_fee, discount_amount, clean_up_cost, judgment_amount, grafitti_status

df_train = df_train.drop(
    ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
     'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
     'city','state','violator_name','violation_street_name','violation_description', 
     'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
     'non_us_str_code','compliance'], axis=1)

df_train.head(5).T

ticket_id,22056,27586,22062,22084,22093
violation_street_number,2900,4311,1449,1441,2449
violation_zip_code,,,,,
zip_code,60606,48208,48223,48214,48206
disposition,Responsible by Default,Responsible by Determination,Not responsible by Dismissal,Not responsible by City Dismissal,Not responsible by Dismissal
fine_amount,250,750,250,250,250
admin_fee,20,20,0,0,0
state_fee,10,10,0,0,0
late_fee,25,75,0,0,0
discount_amount,0,0,0,0,0
clean_up_cost,0,0,0,0,0


In [16]:
# fill NaN w/ 0 value
df_train = df_train.fillna(0)

df_train.head(5).T

ticket_id,22056,27586,22062,22084,22093
violation_street_number,2900,4311,1449,1441,2449
violation_zip_code,0,0,0,0,0
zip_code,60606,48208,48223,48214,48206
disposition,Responsible by Default,Responsible by Determination,Not responsible by Dismissal,Not responsible by City Dismissal,Not responsible by Dismissal
fine_amount,250,750,250,250,250
admin_fee,20,20,0,0,0
state_fee,10,10,0,0,0
late_fee,25,75,0,0,0
discount_amount,0,0,0,0,0
clean_up_cost,0,0,0,0,0


In [18]:
# reserve attributes same as training dataset
df_final = df_final.drop(
    ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
     'city','state','violator_name','violation_street_name','violation_description',
     'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'], axis=1)

['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
 'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
 'city','state','violator_name','violation_street_name','violation_description', 
 'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
 'non_us_str_code','compliance']

df_final = df_final.fillna(0)

ValueError: labels ['agency_name' 'inspector_name' 'violation_code' 'country'
 'mailing_address_str_name' 'city' 'state' 'violator_name'
 'violation_street_name' 'violation_description'
 'mailing_address_str_number' 'non_us_str_code' 'ticket_issued_date'
 'hearing_date'] not contained in axis

In [None]:
# 
df_train = pd.get_dummies(data=df_train, columns=['grafitti_status', 'disposition'])
df_final = pd.get_dummies(data=df_final, columns=['grafitti_status', 'disposition'])

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

def blight_model(debug=False):
    
    # Cleaning Up the Data
    df = pd.read_csv("train.csv",encoding='ISO-8859-1')
    df1 = pd.read_csv("test.csv",encoding='ISO-8859-1')

    #Fill the na values
    df_label = df['compliance'].fillna(0)

    # Removing the Unnecessary features
    df = df.drop( ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
                   'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                  'city','state','violator_name','violation_street_name','violation_description', 
                   'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
                   'non_us_str_code','compliance'],axis=1)
    df = df.fillna(0)
    df1 = df1.drop( ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                   'city','state','violator_name','violation_street_name','violation_description',
                     'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'],axis=1)
    df1 = df1.fillna(0)
    df = pd.get_dummies(data=df, columns=['grafitti_status', 'disposition'])
    df1 = pd.get_dummies(data=df1, columns=['grafitti_status', 'disposition'])

    #Calculating new features using the existing ones to improve the predictions of the classifier
    df['late_amount'] = df['judgment_amount']*df['late_fee']
    df1['late_amount'] = df1['judgment_amount']*df1['late_fee']
    
    #df['hearing_date'] = pd.to_datetime(df['hearing_date']).fillna(0)
    #df['ticket_issued_date'] = pd.to_datetime(df['ticket_issued_date']).fillna(0)
    #df['date_diff'] = (pd.to_datetime(df['hearing_date']).dt.date - 
    #                         pd.to_datetime(df['ticket_issued_date']).dt.date).fillna(0)

    # Converting the datatype according to the data.
    df = df.convert_objects(convert_numeric=True).fillna(0)
    df1 = df1.convert_objects(convert_numeric=True).fillna(0)
    df1.violation_zip_code = df1.violation_zip_code.astype('float').fillna(0)
    #print(df1.dtypes)

    #Splitting the data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(df, df_label, random_state=0)

    #Fit the classifier and predict the values for test set
    #clf = DecisionTreeClassifier().fit(X_train, y_train)
    clf = GradientBoostingClassifier().fit(X_train, y_train)
    tree_predicted = clf.predict(X_test)
    # clf = lr.fit(X_train, y_train).decision_function(X_test)

    #Calculating the Area Under the Curve
    fpr_lr, tpr_lr, _ = roc_curve(y_test, tree_predicted)
    roc_auc_lr = auc(fpr_lr, tpr_lr)

    if debug:
        print(tree_predicted.shape)
        print(df1['ticket_id'].shape)
    
    df1['disposition_SET-ASIDE (PENDING JUDGMENT)'] = df['disposition_SET-ASIDE (PENDING JUDGMENT)']
    
    if debug:
        print(len(df.columns),"---",len(df1.columns))
        print(df.head())
        print(df1.head())

    
    preds = clf.predict(df1)
    preds = pd.DataFrame(data=preds)
    preds.set_index(df1['ticket_id'],inplace=True)

    if debug:
        print(preds.head())
        print(preds.dtypes)
        print()
        print('Accuracy of DT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
        print('Accuracy of DT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
        print('ROC Score on test set: {:.2f}'.format(roc_auc_lr))
        print(df.head())


    
    return preds