# [Reference Solution 4 for Assignment 4](https://github.com/Abdul-Rafae-Mohammed/Applied-Machine-Learning-for-Blight-Violation-Data-using-Python---------------/blob/master/Machine_learning_blight_violation_data.py)

In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

__Loading data__

In [22]:
# load dataset
df_train = pd.read_csv("train.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')
df_final = pd.read_csv("test.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')

df_train.shape, df_final.shape

((250306, 33), (61001, 26))

In [23]:
df_train.head(5).T

ticket_id,22056,27586,22062,22084,22093
agency_name,"Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department","Buildings, Safety Engineering & Env Department"
inspector_name,"Sims, Martinzie","Williams, Darrin","Sims, Martinzie","Sims, Martinzie","Sims, Martinzie"
violator_name,"INVESTMENT INC., MIDWEST MORTGAGE","Michigan, Covenant House","SANDERS, DERRON","MOROSI, MIKE","NATHANIEL, NEAL"
violation_street_number,2900,4311,1449,1441,2449
violation_street_name,TYLER,CENTRAL,LONGFELLOW,LONGFELLOW,CHURCHILL
violation_zip_code,,,,,
mailing_address_str_number,3,2959,23658,5,7449
mailing_address_str_name,S. WICKER,Martin Luther King,P.O. BOX,ST. CLAIR,CHURCHILL
city,CHICAGO,Detroit,DETROIT,DETROIT,DETROIT
state,IL,MI,MI,MI,MI


In [24]:
df_train.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'payment_amount', 'balance_due',
       'payment_date', 'payment_status', 'collection_status',
       'grafitti_status', 'compliance_detail', 'compliance'],
      dtype='object')

In [25]:
df_final.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status'],
      dtype='object')

__Generate independent (df_train) and dependent (df_label) dataframes__

In [26]:
# extract compliance column as label of the model and fill in NaN w/ 0 (non-compliance)
df_label = df_train['compliance'].fillna(0)

df_label.head(5)

ticket_id
22056    0.0
27586    1.0
22062    0.0
22084    0.0
22093    0.0
Name: compliance, dtype: float64

In [27]:
# select attributes for training dataset
# remaining attributes:
#    violation_street_number, violation_zip_code, zip_code, disposition, fine_amount,
#    admin_fee, state_fee, late_fee, discount_amount, clean_up_cost, judgment_amount, grafitti_status

select = ['violation_street_number', 'violation_zip_code', 'zip_code', 'disposition', 
          'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount', 
          'clean_up_cost', 'judgment_amount', 'grafitti_status']

df_train_select = df_train[select]

# df_train_select = df_train.drop(
#     ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
#      'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
#      'city','state','violator_name','violation_street_name','violation_description', 
#      'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
#      'non_us_str_code','compliance'], axis=1)

df_train_select.columns

Index(['violation_street_number', 'violation_zip_code', 'zip_code',
       'disposition', 'fine_amount', 'admin_fee', 'state_fee', 'late_fee',
       'discount_amount', 'clean_up_cost', 'judgment_amount',
       'grafitti_status'],
      dtype='object')

In [28]:
# fill NaN w/ 0 value
df_train_select = df_train_select.fillna(0)

df_train_select.head(5).T

ticket_id,22056,27586,22062,22084,22093
violation_street_number,2900,4311,1449,1441,2449
violation_zip_code,0,0,0,0,0
zip_code,60606,48208,48223,48214,48206
disposition,Responsible by Default,Responsible by Determination,Not responsible by Dismissal,Not responsible by City Dismissal,Not responsible by Dismissal
fine_amount,250,750,250,250,250
admin_fee,20,20,0,0,0
state_fee,10,10,0,0,0
late_fee,25,75,0,0,0
discount_amount,0,0,0,0,0
clean_up_cost,0,0,0,0,0


__Processing the final predict dataset w/ selected attributes & dummies__

In [29]:
# reserve attributes same as training dataset

# df_final = df_final.drop(
#     ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name', 
#      'city','state','violator_name','violation_street_name','violation_description', 
#      'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'], axis=1)

df_final_select = df_final[select]

df_final_select = df_final_select.fillna(0)

df_final_select.columns

Index(['violation_street_number', 'violation_zip_code', 'zip_code',
       'disposition', 'fine_amount', 'admin_fee', 'state_fee', 'late_fee',
       'discount_amount', 'clean_up_cost', 'judgment_amount',
       'grafitti_status'],
      dtype='object')

In [30]:
# convert 'grafitti_status', 'disposition' categorical variables into dummy variables
df_train_select = pd.get_dummies(data=df_train_select, columns=['grafitti_status', 'disposition'])
df_final_select = pd.get_dummies(data=df_final_select, columns=['grafitti_status', 'disposition'])

# 'grafitti_status_0', 
# 'grafitti_status_GRAFFITI TICKET',

# 'disposition_Responsible (Fine Waived) by Admis',
# 'disposition_Responsible (Fine Waived) by Deter',
# 'disposition_Responsible - Compl/Adj by Default',
# 'disposition_Responsible - Compl/Adj by Determi',
# 'disposition_Responsible by Admission',
# 'disposition_Responsible by Default',
# 'disposition_Responsible by Determination',
# 'disposition_Responsible by Dismissal'

In [31]:
df_train_select.columns

Index(['violation_street_number', 'violation_zip_code', 'zip_code',
       'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status_0',
       'grafitti_status_GRAFFITI TICKET',
       'disposition_Not responsible by City Dismissal',
       'disposition_Not responsible by Determination',
       'disposition_Not responsible by Dismissal',
       'disposition_PENDING JUDGMENT',
       'disposition_Responsible (Fine Waived) by Deter',
       'disposition_Responsible by Admission',
       'disposition_Responsible by Default',
       'disposition_Responsible by Determination',
       'disposition_SET-ASIDE (PENDING JUDGMENT)'],
      dtype='object')

In [32]:
df_final_select.columns

Index(['violation_street_number', 'violation_zip_code', 'zip_code',
       'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status_0',
       'grafitti_status_GRAFFITI TICKET',
       'disposition_Responsible (Fine Waived) by Admis',
       'disposition_Responsible (Fine Waived) by Deter',
       'disposition_Responsible - Compl/Adj by Default',
       'disposition_Responsible - Compl/Adj by Determi',
       'disposition_Responsible by Admission',
       'disposition_Responsible by Default',
       'disposition_Responsible by Determination',
       'disposition_Responsible by Dismissal'],
      dtype='object')

__Further processing data about paid amount__

In [33]:
#Calculating new features using the existing ones to improve the predictions of the classifier
df_train_select['late_amount'] = df_train_select['judgment_amount']*df_train_select['late_fee']
df_final_select['late_amount'] = df_final_select['judgment_amount']*df_train_select['late_fee']

df_train_select.columns

Index(['violation_street_number', 'violation_zip_code', 'zip_code',
       'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status_0',
       'grafitti_status_GRAFFITI TICKET',
       'disposition_Not responsible by City Dismissal',
       'disposition_Not responsible by Determination',
       'disposition_Not responsible by Dismissal',
       'disposition_PENDING JUDGMENT',
       'disposition_Responsible (Fine Waived) by Deter',
       'disposition_Responsible by Admission',
       'disposition_Responsible by Default',
       'disposition_Responsible by Determination',
       'disposition_SET-ASIDE (PENDING JUDGMENT)', 'late_amount'],
      dtype='object')

In [34]:
# Converting the datatype according to the data.
df_train_select = df_train_select.convert_objects(convert_numeric=True).fillna(0)
df_final_select = df_final_select.convert_objects(convert_numeric=True).fillna(0)

df_final_select.violation_zip_code = df_final_select.violation_zip_code.astype('float').fillna(0)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
df_train_select.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250306 entries, 22056 to 325561
Data columns (total 22 columns):
violation_street_number                           250306 non-null float64
violation_zip_code                                250306 non-null float64
zip_code                                          250306 non-null float64
fine_amount                                       250306 non-null float64
admin_fee                                         250306 non-null float64
state_fee                                         250306 non-null float64
late_fee                                          250306 non-null float64
discount_amount                                   250306 non-null float64
clean_up_cost                                     250306 non-null float64
judgment_amount                                   250306 non-null float64
grafitti_status_0                                 250306 non-null uint8
grafitti_status_GRAFFITI TICKET                   250306 non-null uint8
d

In [36]:
df_train_select.head(5).T

ticket_id,22056,27586,22062,22084,22093
violation_street_number,2900.0,4311.0,1449.0,1441.0,2449.0
violation_zip_code,0.0,0.0,0.0,0.0,0.0
zip_code,60606.0,48208.0,48223.0,48214.0,48206.0
fine_amount,250.0,750.0,250.0,250.0,250.0
admin_fee,20.0,20.0,0.0,0.0,0.0
state_fee,10.0,10.0,0.0,0.0,0.0
late_fee,25.0,75.0,0.0,0.0,0.0
discount_amount,0.0,0.0,0.0,0.0,0.0
clean_up_cost,0.0,0.0,0.0,0.0,0.0
judgment_amount,305.0,855.0,0.0,0.0,0.0


In [37]:
df_final_select.head(5).T

ticket_id,284932,285362,285361,285338,285346
violation_street_number,10041.0,18520.0,18520.0,1835.0,1700.0
violation_zip_code,0.0,0.0,0.0,0.0,0.0
zip_code,48213.0,48219.0,48219.0,48183.0,48154.0
fine_amount,200.0,1000.0,100.0,200.0,100.0
admin_fee,20.0,20.0,20.0,20.0,20.0
state_fee,10.0,10.0,10.0,10.0,10.0
late_fee,20.0,100.0,10.0,20.0,10.0
discount_amount,0.0,0.0,0.0,0.0,0.0
clean_up_cost,0.0,0.0,0.0,0.0,0.0
judgment_amount,250.0,1130.0,140.0,250.0,140.0


__generate training & test dataset with df_train_select__

In [38]:
# Split training dataset w/ selected attributes into model training & test datasets

X_train, X_test, y_train, y_test = train_test_split(df_train_select, df_label, random_state=0)

debug = True

if debug:
    X_train.info()
    print("\nX_train: {}".format(X_train.shape))
    print("y_train: {}".format(y_train.shape))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187729 entries, 31321 to 231584
Data columns (total 22 columns):
violation_street_number                           187729 non-null float64
violation_zip_code                                187729 non-null float64
zip_code                                          187729 non-null float64
fine_amount                                       187729 non-null float64
admin_fee                                         187729 non-null float64
state_fee                                         187729 non-null float64
late_fee                                          187729 non-null float64
discount_amount                                   187729 non-null float64
clean_up_cost                                     187729 non-null float64
judgment_amount                                   187729 non-null float64
grafitti_status_0                                 187729 non-null uint8
grafitti_status_GRAFFITI TICKET                   187729 non-null uint8
d

In [39]:
# Train w/ GDBT classifier
# clf = GradientBoostingClassifier().fit(X_train, y_train)

# clf = lr.fit(X_train, y_train).decision_function(X_test)

In [46]:
from sklearn.model_selection import GridSearchCV

params= {'learning_rate': [0.3], 'n_estimators':[100], 'max_depth':[3]}

clf = GradientBoostingClassifier(random_state=0)

gscv = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', cv=5, n_jobs=-1)
gscv.fit(X_train, y_train)

print('training complete with best score', gscv.best_score_, gscv.best_params_)

# training complete with best score 0.878811296108 {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}
# training complete with best score 0.879531155772 {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 1000}
# training complete with best score 0.879993934674 {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 1000}

training complete with best score 0.878811296108 {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}


In [47]:
# get predict result w/ testing dataset
# tree_predicted = clf.predict(X_test)

tree_predicted = gscv.predict(X_test)

if debug:
    print("Predict result ({}): {}".format(tree_predicted.shape[0], sum(tree_predicted)))

Predict result (62577): 847.0


In [52]:
# Calculating the Area Under the Curve
fpr_lr, tpr_lr, _ = roc_curve(y_test, tree_predicted)
roc_auc_lr = auc(fpr_lr, tpr_lr)

if debug:
    print(tree_predicted.shape)
    print(df_final_select.shape)

    print('Accuracy of DT classifier on training set: {:.2f}'.format(gscv.score(X_train, y_train)))
    print('Accuracy of DT classifier on test set: {:.2f}'.format(gscv.score(X_test, y_test)))
    print('ROC Score on test set: {:.2f}'.format(roc_auc_lr))
    print('\n{}'.format(df_train_select.head(3).T))

(62577,)
(61001, 21)
Accuracy of DT classifier on training set: 0.89
Accuracy of DT classifier on test set: 0.88
ROC Score on test set: 0.63

ticket_id                                         22056    27586    22062
violation_street_number                          2900.0   4311.0   1449.0
violation_zip_code                                  0.0      0.0      0.0
zip_code                                        60606.0  48208.0  48223.0
fine_amount                                       250.0    750.0    250.0
admin_fee                                          20.0     20.0      0.0
state_fee                                          10.0     10.0      0.0
late_fee                                           25.0     75.0      0.0
discount_amount                                     0.0      0.0      0.0
clean_up_cost                                       0.0      0.0      0.0
judgment_amount                                   305.0    855.0      0.0
grafitti_status_0                           

In [53]:
# the final dataset for result does not contain 'disposition_SET-ASIDE (PENDING JUDGMENT)' column
# after converting to dummy variables & fill NaN w/ 0
df_final_select['disposition_SET-ASIDE (PENDING JUDGMENT)'] = df_train_select['disposition_SET-ASIDE (PENDING JUDGMENT)']
df_final_select = df_final_select.fillna(0)

if debug:
    print(len(df_train_select.columns),"---",len(df_final_select.columns))
    print("\ntrain data: \n{}".format(df_train_select.head(3).T))
    print("\nfinal data: \n{}".format(df_final_select.head(3).T))

22 --- 22

train data: 
ticket_id                                         22056    27586    22062
violation_street_number                          2900.0   4311.0   1449.0
violation_zip_code                                  0.0      0.0      0.0
zip_code                                        60606.0  48208.0  48223.0
fine_amount                                       250.0    750.0    250.0
admin_fee                                          20.0     20.0      0.0
state_fee                                          10.0     10.0      0.0
late_fee                                           25.0     75.0      0.0
discount_amount                                     0.0      0.0      0.0
clean_up_cost                                       0.0      0.0      0.0
judgment_amount                                   305.0    855.0      0.0
grafitti_status_0                                   1.0      1.0      1.0
grafitti_status_GRAFFITI TICKET                     0.0      0.0      0.0
disposition_No

__Apply trained calssifiered model to final dataset__

In [55]:
preds = gscv.predict(df_final_select)

preds = pd.Series(data=preds, index=df_final_select.index)

if debug:
    print("final result samples: {}".format(sum(preds)))
    print("final result datatype: {}".format(preds.dtypes))

final result samples: 2388.0
final result datatype: float64


In [56]:
df_final_select.tabulate([ticket_id, disposition_Responsible], headers=headers)

AttributeError: 'DataFrame' object has no attribute 'tabulate'

__Final answer__

In [None]:
# Ref: Ref2-Assignment4.ipynb

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

def blight_model(debug=False):

    if debug:
        print("1. Loading dataset ...")

    # load dataset
    df_train = pd.read_csv("train.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')
    df_final = pd.read_csv("test.csv", encoding='ISO-8859-1', low_memory=False).set_index('ticket_id')

    if debug:
        print("2. Processing data ...")

    # extract compliance column as label of the model and fill in NaN w/ 0 (non-compliance)
    df_train_label = df_train['compliance'].fillna(0)

    # select attributes for training dataset
    # remaining attributes:
    #    violation_street_number, violation_zip_code, zip_code, disposition, fine_amount,
    #    admin_fee, state_fee, late_fee, discount_amount, clean_up_cost, judgment_amount, grafitti_status

    select = ['violation_street_number', 'violation_zip_code', 'zip_code', 'disposition', 
            'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount', 
            'clean_up_cost', 'judgment_amount', 'grafitti_status']

    df_train_select = df_train[select]

    # df_train_select = df_train.drop(
    #     ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
    #      'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
    #      'city','state','violator_name','violation_street_name','violation_description', 
    #      'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
    #      'non_us_str_code','compliance'], axis=1)

    # fill NaN w/ 0 value
    df_train_select = df_train_select.fillna(0)

    # reserve attributes same as training dataset

    # df_final = df_final.drop(
    #     ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name', 
    #      'city','state','violator_name','violation_street_name','violation_description', 
    #      'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'], axis=1)

    df_final_select = df_final[select]

    df_final_select = df_final_select.fillna(0)

    # convert 'grafitti_status', 'disposition' categorical variables into dummy variables
    df_train_select = pd.get_dummies(data=df_train_select, columns=['grafitti_status', 'disposition'])
    df_final_select = pd.get_dummies(data=df_final_select, columns=['grafitti_status', 'disposition'])

    #Calculating new features using the existing ones to improve the predictions of the classifier
    df_train_select['late_amount'] = df_train_select['judgment_amount']*df_train_select['late_fee']
    df_final_select['late_amount'] = df_final_select['judgment_amount']*df_final_select['late_fee']

    # Converting the datatype according to the data.
    df_train_select = df_train_select.convert_objects(convert_numeric=True).fillna(0)
    df_final_select = df_final_select.convert_objects(convert_numeric=True).fillna(0)

    df_final_select.violation_zip_code = df_final_select.violation_zip_code.astype('float').fillna(0)
    
    # Split training dataset w/ selected attributes into model training & test datasets
    X_train, X_test, y_train, y_test = train_test_split(df_train_select, df_train_label, random_state=0)

    if debug:
        print("3. Model training ...")

    # Train w/ GDBT classifier
    clf = GradientBoostingClassifier().fit(X_train, y_train)
    
    if debug:
        print("4. Score test data")

        # get predict result w/ testing dataset
        tree_predicted = clf.predict(X_test)

        # Calculating the Area Under the Curve
        fpr_lr, tpr_lr, _ = roc_curve(y_test, tree_predicted)
        roc_auc_lr = auc(fpr_lr, tpr_lr)

        print('Accuracy of DT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
        print('Accuracy of DT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
        print('ROC Score on test set: {:.2f}'.format(roc_auc_lr))

    # the final dataset for result does not contain 'disposition_SET-ASIDE (PENDING JUDGMENT)' column
    # after converting to dummy variables & fill NaN w/ 0
    df_final_select['disposition_SET-ASIDE (PENDING JUDGMENT)'] = df_train_select['disposition_SET-ASIDE (PENDING JUDGMENT)']
    df_final_select = df_final_select.fillna(0)

    if debug:
        print("5. Generate find result ...")

    preds = clf.predict(df_final_select)
    preds = pd.Series(data=preds, index=df_final_select.index)

    return preds
              
# blight_model(True)

# Your AUC of 0.662619396326 was awarded a value of 0.64 out of 1.0 total grades

In [None]:
# Ref: Ref2-Assignment4.ipynb

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

def blight_model(debug=False):
    
    # Cleaning Up the Data
    df = pd.read_csv("train.csv",encoding='ISO-8859-1')
    df1 = pd.read_csv("test.csv",encoding='ISO-8859-1')

    #Fill the na values
    df_label = df['compliance'].fillna(0)

    # Removing the Unnecessary features
    df = df.drop( ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
                   'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                  'city','state','violator_name','violation_street_name','violation_description', 
                   'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
                   'non_us_str_code','compliance'],axis=1)
    df = df.fillna(0)
    df1 = df1.drop( ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                   'city','state','violator_name','violation_street_name','violation_description',
                     'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'],axis=1)
    df1 = df1.fillna(0)
    df = pd.get_dummies(data=df, columns=['grafitti_status', 'disposition'])
    df1 = pd.get_dummies(data=df1, columns=['grafitti_status', 'disposition'])

    #Calculating new features using the existing ones to improve the predictions of the classifier
    df['late_amount'] = df['judgment_amount']*df['late_fee']
    df1['late_amount'] = df1['judgment_amount']*df1['late_fee']
    
    #df['hearing_date'] = pd.to_datetime(df['hearing_date']).fillna(0)
    #df['ticket_issued_date'] = pd.to_datetime(df['ticket_issued_date']).fillna(0)
    #df['date_diff'] = (pd.to_datetime(df['hearing_date']).dt.date - 
    #                         pd.to_datetime(df['ticket_issued_date']).dt.date).fillna(0)

    # Converting the datatype according to the data.
    df = df.convert_objects(convert_numeric=True).fillna(0)
    df1 = df1.convert_objects(convert_numeric=True).fillna(0)
    df1.violation_zip_code = df1.violation_zip_code.astype('float').fillna(0)
    #print(df1.dtypes)

    #Splitting the data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(df, df_label, random_state=0)

    #Fit the classifier and predict the values for test set
    #clf = DecisionTreeClassifier().fit(X_train, y_train)
    clf = GradientBoostingClassifier().fit(X_train, y_train)
    tree_predicted = clf.predict(X_test)
    # clf = lr.fit(X_train, y_train).decision_function(X_test
    
    #Calculating the Area Under the Curve
    fpr_lr, tpr_lr, _ = roc_curve(y_test, tree_predicted)
    roc_auc_lr = auc(fpr_lr, tpr_lr)

    if debug:
        print(tree_predicted.shape)
        print(df1['ticket_id'].shape)
    
    df1['disposition_SET-ASIDE (PENDING JUDGMENT)'] = df['disposition_SET-ASIDE (PENDING JUDGMENT)']
    
    if debug:
        print(len(df.columns),"---",len(df1.columns))
        print(df.head())
        print(df1.head())
    
    preds = clf.predict(df1)
    preds = pd.DataFrame(data=preds)
    preds.set_index(df1['ticket_id'],inplace=True)
    
#     preds = pd.Series(data=preds, index=df1.index)
    
    if debug:
        print(preds.head())
        print(preds.dtypes)
        print()
        print('Accuracy of DT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
        print('Accuracy of DT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
        print('ROC Score on test set: {:.2f}'.format(roc_auc_lr))
        print(df.head())
    
    return preds

# blight_model(True)

# Your AUC of 0.704128760648 was awarded a value of 0.8 out of 1.0 total grades

In [61]:
# Ref: Ref2-Assignment4.ipynb

# GridSearch

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

def blight_model(debug=False):
    
    if debug: 
        print("1. loading dataset into Dataframe ...")
    # Cleaning Up the Data
    df = pd.read_csv("train.csv",encoding='ISO-8859-1')
    df1 = pd.read_csv("test.csv",encoding='ISO-8859-1')

    if debug: 
        print("2. processing dataframe ...")
    #Fill the na values
    df_label = df['compliance'].fillna(0)

    # Removing the Unnecessary features
    df = df.drop( ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status',
                   'agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                  'city','state','violator_name','violation_street_name','violation_description', 
                   'compliance_detail','mailing_address_str_number','ticket_issued_date','hearing_date',
                   'non_us_str_code','compliance'],axis=1)
    df = df.fillna(0)
    df1 = df1.drop( ['agency_name', 'inspector_name', 'violation_code','country','mailing_address_str_name',
                   'city','state','violator_name','violation_street_name','violation_description',
                     'mailing_address_str_number','non_us_str_code','ticket_issued_date','hearing_date'],axis=1)
    df1 = df1.fillna(0)
    df = pd.get_dummies(data=df, columns=['grafitti_status', 'disposition'])
    df1 = pd.get_dummies(data=df1, columns=['grafitti_status', 'disposition'])

    #Calculating new features using the existing ones to improve the predictions of the classifier
    df['late_amount'] = df['judgment_amount']*df['late_fee']
    df1['late_amount'] = df1['judgment_amount']*df1['late_fee']
    
    #df['hearing_date'] = pd.to_datetime(df['hearing_date']).fillna(0)
    #df['ticket_issued_date'] = pd.to_datetime(df['ticket_issued_date']).fillna(0)
    #df['date_diff'] = (pd.to_datetime(df['hearing_date']).dt.date - 
    #                         pd.to_datetime(df['ticket_issued_date']).dt.date).fillna(0)

    # Converting the datatype according to the data.
    df = df.convert_objects(convert_numeric=True).fillna(0)
    df1 = df1.convert_objects(convert_numeric=True).fillna(0)
    df1.violation_zip_code = df1.violation_zip_code.astype('float').fillna(0)
    #print(df1.dtypes)

    if debug: 
        print("3. split dataset into training and test data ...")
        
    #Splitting the data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(df, df_label, random_state=0)

    #Fit the classifier and predict the values for test set
    #clf = DecisionTreeClassifier().fit(X_train, y_train)

    if debug: 
        print("4. training data with GridSearch ...")
        
    from sklearn.model_selection import GridSearchCV

    params= {'learning_rate': [0.3], 'n_estimators':[100], 'max_depth':[3]}

    clf = GradientBoostingClassifier(random_state=0)

    gscv = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', cv=5, n_jobs=-1)
    gscv.fit(X_train, y_train)

    if debug:
        print('5. training complete with best score', gscv.best_score_, gscv.best_params_)
        
    if debug:
        tree_predicted = gscv.predict(X_test)

        #Calculating the Area Under the Curve
        fpr_lr, tpr_lr, _ = roc_curve(y_test, tree_predicted)
        roc_auc_lr = auc(fpr_lr, tpr_lr)
        
        print(tree_predicted.shape)
        print('Accuracy of DT classifier on training set: {:.2f}'.format(gscv.score(X_train, y_train)))
        print('Accuracy of DT classifier on test set: {:.2f}'.format(gscv.score(X_test, y_test)))
        print('ROC Score on test set: {:.2f}'.format(roc_auc_lr))
#         print(df.head())
    
   
    if debug:
        print("6. processing predict dataset and generate predicts ...")
        print(df1['ticket_id'].shape)
    
    df1['disposition_SET-ASIDE (PENDING JUDGMENT)'] = df['disposition_SET-ASIDE (PENDING JUDGMENT)']
    
    preds = gscv.predict(df1)
    preds = pd.DataFrame(data=preds)
    preds.set_index(df1['ticket_id'],inplace=True)
    
    if debug:
        print("Final result")
        print(preds.head())
        print(preds.dtypes)
    
    return preds

# blight_model(True)

# Your AUC of 0.590712467377 was awarded a value of 0.36 out of 1.0 total grades


1. loading dataset into Dataframe ...


  if self.run_code(code, result):


2. processing dataframe ...




3. split dataset into training and test data ...
4. training data with GridSearch ...
5. training complete with best score 0.889719323185 {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}
(62577,)
Accuracy of DT classifier on training set: 0.90
Accuracy of DT classifier on test set: 0.89
ROC Score on test set: 0.63
6. processing predict dataset and generate predicts ...
(61001,)
Final result
             0
ticket_id     
284932     0.0
285362     0.0
285361     0.0
285338     0.0
285346     0.0
0    float64
dtype: object


Unnamed: 0_level_0,0
ticket_id,Unnamed: 1_level_1
284932,0.0
285362,0.0
285361,0.0
285338,0.0
285346,0.0
285345,0.0
285347,0.0
285342,1.0
285530,0.0
284989,0.0
