In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
mjr_violations_orig = pd.read_csv('data/mjrhouse_violations.csv')

In [5]:
violations = mjr_violations_orig
violations.shape

(27863, 34)

In [6]:
violations.drop(['Resolve','Compliance_Requested','Case_Type','CEB_Compliance_Requested','Tag_', 'VIN_',
       'Make', 'Color','Vehicle_Description', 'Address', 'Quantity','PARCEL_1', 'MAPSHEET','COUNTY', 'DESCRIPT'], axis=1, inplace = True)
violations.columns

Index(['Number', 'Violation', 'Inspector', 'Status', 'Primary_Party',
       'Violation_Date', 'Compliance', 'Received_By', 'PARCEL_CLEAN',
       'LATITUDE', 'LONGITUDE', 'Prop_Use_Code', 'Prop_Use_Desc', 'TRS_WATSON',
       'SECTION_1', 'TOWNSHIP_1', 'RANGE_1', 'TD', 'RD'],
      dtype='object')

In [7]:
# violations['Violation_Date'][0][:4]
violations['Year'] = violations['Violation_Date'].map(lambda x: x[:4]).astype(int)

In [8]:
#change to datetime format
violations["Violation_Date"] = pd.to_datetime(violations["Violation_Date"])
violations["Violation_Date"] = pd.to_datetime(violations['Compliance'])

In [9]:
violations['blight'] = violations['Violation'].isin(['Blight-inducing Materials','Abandoned Vehicles And/or Blight Inducing Materials']).astype(int)
violations.head()


# violations['SECTION_1'].astype(int)
pd.to_numeric(violations['SECTION_1'] , errors='coerce')
violations.fillna(value=0, inplace=True)

# violations = violations['SECTION_1'].replace(' ' , '0')
violations = violations.replace(' ' , '0')


violations['SECTION_1'] = violations['SECTION_1'].astype(int)


violations['TOWNSHIP_1'] = violations['TOWNSHIP_1'].astype(int)
violations.head()


Unnamed: 0,Number,Violation,Inspector,Status,Primary_Party,Violation_Date,Compliance,Received_By,PARCEL_CLEAN,LATITUDE,...,Prop_Use_Code,Prop_Use_Desc,TRS_WATSON,SECTION_1,TOWNSHIP_1,RANGE_1,TD,RD,Year,blight
0,CE-15-02088,Overgrown Yard / Weeds,"Walter L. Booth, Sr.",Opened,ALAN SCHNEIDER,2015-12-18 00:00:00,2015-12-18,311GNV,05977-215-000,29.735921,...,200.0,MOBILE HOME,09S19E002,2,9,19,S,E,2015,0
1,CE-10-03462,Blight-inducing Materials,"Walter L. Booth, Sr.",Opened,"SANDERS, JUNE E",2011-01-04 00:00:00,2011-01-04,By Phone,05977-217-000,29.735988,...,200.0,MOBILE HOME,09S19E002,2,9,19,S,E,2010,1
2,CE-09-06929,Dead Tree or Hazardous Trees,"Walter L. Booth, Sr.",Opened,ADD PARTY,2010-03-22 00:00:00,2010-03-22,CE Officer,05977-217-000,29.735988,...,200.0,MOBILE HOME,09S19E002,2,9,19,S,E,2009,0
3,CE-09-06929,Overgrown Yard / Weeds,"Walter L. Booth, Sr.",Opened,ADD PARTY,2010-03-22 00:00:00,2010-03-22,CE Officer,05977-217-000,29.735988,...,200.0,MOBILE HOME,09S19E002,2,9,19,S,E,2009,0
4,CE-15-02201,Overgrown Yard / Weeds,"Walter L. Booth, Sr.",Opened,"TURKEY CREEK FOREST OWNERS, ASSOCIATION",2015-10-08 00:00:00,2015-10-08,311GNV,05977-217-000,29.735988,...,200.0,MOBILE HOME,09S19E002,2,9,19,S,E,2015,0


In [10]:
train_data = violations[(violations['Year'] >= 2014) & (violations['Year'] <= 2018)].reset_index(drop=True)
test_data = violations[(violations['Year'] >= 2019) & (violations['Year'] <= 2020)].reset_index(drop=True)

In [11]:
test_data['blight'].sum()

120

In [12]:
train_data['blight'].sum()

832

In [13]:
y_train = train_data["blight"]
train_data.drop(labels="blight", axis=1, inplace=True)
test_data.drop(labels="blight", axis=1, inplace=True)

full_data = train_data.append(test_data)

In [14]:
# drop_columns = ["Compliance", "Inspector", "TD", "RANGE_1", "Prop_Use_Desc", "RD", "Violation", "Number", "Primary_Party", "TRS_WATSON", ]
full_data = full_data[["SECTION_1", "Received_By", "Prop_Use_Code", "Year", "TOWNSHIP_1"]]

# full_data.drop(labels=drop_columns, axis=1, inplace=True)

In [15]:
full_data.head()

Unnamed: 0,SECTION_1,Received_By,Prop_Use_Code,Year,TOWNSHIP_1
0,2,311GNV,200.0,2015,9
1,2,311GNV,200.0,2015,9
2,2,0,200.0,2017,9
3,2,311GNV,200.0,2015,9
4,2,By Email,100.0,2015,9


In [16]:
full_data = pd.get_dummies(full_data, columns=["Received_By"])
full_data.fillna(value=0.0, inplace=True)
full_data.head()


Unnamed: 0,SECTION_1,Prop_Use_Code,Year,TOWNSHIP_1,Received_By_0,Received_By_311GNV,Received_By_Admin,Received_By_By Email,Received_By_By Fax,Received_By_By Mail,Received_By_By Phone,Received_By_CE Officer,Received_By_MANAGER,Received_By_Proactive,Received_By_SUPERVISOR,Received_By_Voice Mail,Received_By_Walk In
0,2,200.0,2015,9,0,1,0,0,0,0,0,0,0,0,0,0,0
1,2,200.0,2015,9,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2,200.0,2017,9,1,0,0,0,0,0,0,0,0,0,0,0,0
3,2,200.0,2015,9,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2,100.0,2015,9,0,0,0,1,0,0,0,0,0,0,0,0,0


In [17]:
train_data.shape

(5958, 20)

In [18]:
X_train = full_data.values[0:5958]
X_train

array([[  2.00000000e+00,   2.00000000e+02,   2.01500000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.00000000e+00,   2.00000000e+02,   2.01500000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.00000000e+00,   2.00000000e+02,   2.01700000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   2.01600000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   2.01600000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   2.01600000e+03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [19]:
X_test = full_data.values[5958:]
X_test[0]

array([  2.00000000e+00,   2.00000000e+02,   2.01900000e+03,
         9.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00])

In [20]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
state = 12  
test_size = 0.30  
  
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,  
    test_size=test_size, random_state=state)

In [22]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=12, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))

Learning rate:  0.05
Accuracy score (training): 0.858
Accuracy score (validation): 0.866
Learning rate:  0.075
Accuracy score (training): 0.858
Accuracy score (validation): 0.866
Learning rate:  0.1
Accuracy score (training): 0.858
Accuracy score (validation): 0.866
Learning rate:  0.25
Accuracy score (training): 0.858
Accuracy score (validation): 0.866
Learning rate:  0.5
Accuracy score (training): 0.858
Accuracy score (validation): 0.866
Learning rate:  0.75
Accuracy score (training): 0.859
Accuracy score (validation): 0.864
Learning rate:  1
Accuracy score (training): 0.859
Accuracy score (validation): 0.860


In [25]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=12, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)
# predictions

# # test_data['predictions'] = predictions
# # X_train.shape
# # test_data.shape

# # len(X_val)
# # X_val

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))

print("Classification Report")
print(classification_report(y_val, predictions))

Confusion Matrix:
[[1548    1]
 [ 239    0]]
Classification Report
             precision    recall  f1-score   support

          0       0.87      1.00      0.93      1549
          1       0.00      0.00      0.00       239

avg / total       0.75      0.87      0.80      1788

