# Houston 311 Service Requests Data Mining Project

## Group members
Stephen Huang

Levi Villarreal

Joshua Wong

Andrew Young


### Code Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
import time
import datetime
import zipfile

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
import pickle
import warnings
warnings.simplefilter('ignore')

### Import data

In [2]:

# Data is from http://www.houstontx.gov/311/
names = ["CASE NUMBER", "SR LOCATION", "COUNTY", "DISTRICT", "NEIGHBORHOOD", "TAX ID", 
         "TRASH QUAD", "RECYCLE QUAD", "TRASH DAY", "HEAVY TRASH DAY", "RECYCLE DAY", 
         "KEY MAP", "MANAGEMENT DISTRICT", "DEPARTMENT", "DIVISION", "SR TYPE", "QUEUE", 
         "SLA", "STATUS", "SR CREATE DATE", "DUE DATE", "DATE CLOSED", "OVERDUE", "TITLE", 
         "x", "y", "LATITUDE", "LONGITUDE", "CHANNEL TYPE"]

with zipfile.ZipFile("311-Public-Data-Extract-2018-clean.txt.zip","r") as zip_ref:
    zip_ref.extractall("")

data = pd.read_csv('311-Public-Data-Extract-2018-clean.txt', sep="|", header=None, names=names)
print(data.shape)
data.head()

(399953, 29)


Unnamed: 0,CASE NUMBER,SR LOCATION,COUNTY,DISTRICT,NEIGHBORHOOD,TAX ID,TRASH QUAD,RECYCLE QUAD,TRASH DAY,HEAVY TRASH DAY,...,SR CREATE DATE,DUE DATE,DATE CLOSED,OVERDUE,TITLE,x,y,LATITUDE,LONGITUDE,CHANNEL TYPE
0,CASE NUMBER,SR LOCATION,COUNTY,DISTRICT,NEIGHBORHOOD,TAX ID,TRASH QUAD,RECYCLE QUAD,TRASH DAY,HEAVY TRASH DAY,...,SR CREATE DATE,DUE DATE,DATE CLOSED,OVERDUE,Title,x,y,LATITUDE,LONGITUDE,Channel Type
1,101002866096,Intersection 13300 HEMPSTEAD RD&9100 PINEMONT ...,,A,FAIRBANKS / NORTHWEST CROSSING,,,,,,...,2018-01-01 00:38:05,2018-01-02 00:38:05,2018-01-01 01:31:49,-0.96,Traffic Signal Maintenance-101002866096,3073146.72528000000,13869806.76417000000,,,Voice In
2,12202635-101002866097,"2803 W BELLFORT, HOUSTON TX 77054",,K,SOUTH MAIN,,,,,,...,2018-01-01 00:41:49,2018-01-11 00:41:49,2018-01-01 05:45:02,-9.79,Water Leak-101002866097,3104254.38025000000,13809501.20958000000,29.67268442,-95.42283905,Voice In
3,12202636-101002866098,"2115 HUTTON, HOUSTON TX 77026",HARRIS,B,GREATER FIFTH WARD,0170560000001,NE,NW,FRIDAY,1st Thursday,...,2018-01-01 00:45:07,2018-01-03 00:45:07,2018-01-01 09:55:02,-1.62,Water Service-101002866098,3131779.00434000000,13849821.40912000000,29.7811689,-95.33222248,Voice In
4,12202637-101002866099,"14806 STEEPLE CHASE, HOUSTON TX 77489",FORT BEND,K,FORT BEND HOUSTON,9550030090110907,SW,SW,THURSDAY,3rd Thursday,...,2018-01-01 00:51:20,2018-01-06 00:51:20,2018-01-01 16:10:02,-4.36,Water Main Valve-101002866099,3079284.91871000000,13786008.44784000000,29.61018299,-95.50364663,Voice In


## Dimensionality Reduction
Right off the bat, we can see that there are many features that do not matter at all to our project.
We can go ahead and get rid of these to help prevent against the curse of dimensionality.

In [3]:
data = data.drop(columns=['SR LOCATION', 'CASE NUMBER', 'TRASH QUAD', 'RECYCLE QUAD', 'TRASH DAY', 'HEAVY TRASH DAY', 'RECYCLE DAY', 'TAX ID', 'KEY MAP', 'DUE DATE', 'DATE CLOSED', 'TITLE', 'x', 'y', 'LONGITUDE', 'LATITUDE'])
print(data.shape)

data.head()

(399953, 13)


Unnamed: 0,COUNTY,DISTRICT,NEIGHBORHOOD,MANAGEMENT DISTRICT,DEPARTMENT,DIVISION,SR TYPE,QUEUE,SLA,STATUS,SR CREATE DATE,OVERDUE,CHANNEL TYPE
0,COUNTY,DISTRICT,NEIGHBORHOOD,MANAGEMENT DISTRICT,DEPARTMENT,DIVISION,SR TYPE,QUEUE,SLA,STATUS,SR CREATE DATE,OVERDUE,Channel Type
1,,A,FAIRBANKS / NORTHWEST CROSSING,Spring Branch MD,PWE Public Works Engineering,Traffic Operations,Traffic Signal Maintenance,TT_SignalMainDispatch,1,Closed,2018-01-01 00:38:05,-0.96,Voice In
2,,K,SOUTH MAIN,,PWE Public Works Engineering,PU Public Utilities,Water Leak,PU_Water,10,Closed,2018-01-01 00:41:49,-9.79,Voice In
3,HARRIS,B,GREATER FIFTH WARD,,PWE Public Works Engineering,PU Public Utilities,Water Service,PU_Water,2,Closed,2018-01-01 00:45:07,-1.62,Voice In
4,FORT BEND,K,FORT BEND HOUSTON,,PWE Public Works Engineering,PU Public Utilities,Water Main Valve,PU_Water,5,Closed,2018-01-01 00:51:20,-4.36,Voice In


### Reasoning behind dropping the attributes we did

SR Location - Same reasoning as case number.

Case number - This was dropped because the case number is a unique identifier assigned to every 311 request, and thus could not be used to classify anything.

Trash Quad - There are requests that are not related to trash collection. Those requests might be misclassified because they are in a different trash quad as other requests in the dataset.

Recycle Quad - Same as Trash Quad

Trash Day - Same as Trash Quad

Recycle day - Same as Trash Quad

Heavy Trash Day - Same as Trash Quad
 
Tax ID - This ID is unique to the property that requested it.
 
Key Map - This attribute is used in the cities internal system to keep track of requests and does not have any actual relation to the request
 
Due Date - We are interested in predicting overdueness, and that it its own seperate column.
 
Date Closed - Same reasoning as due date.
 
Title - This is a another unique identifier assigned to every 311 request, and thus could not be used to classify anything.
 
x - We already have latitude, and this attribute an approximation of latitude.
 
y - We already have longitude, and this attribute an approximation of longitude.

latitude - This is too speficific for our needs, and we have other location metrics.
 
longitude - same as latitude

## Data Cleaning

Drop all service requests that are not closed

In [4]:
oldNumRows = data.shape[0]
data = data[data.STATUS == 'Closed']
data = data.drop(["STATUS"], axis=1)

print("Number of rows dropped: ", oldNumRows - data.shape[0])

Number of rows dropped:  181


Normalize all counties and drop rows where county is unknown

In [5]:
print('Counties:')
print(data['COUNTY'].value_counts())
print()

oldNumRows = data.shape[0]
data['COUNTY'] = data['COUNTY'].replace(['HARRIS', 'FORT BEND', 'MONTGOMERY'], ['Harris County', 'Fort Bend County', 'Montgomery County'])
data = data[data.COUNTY != 'Unknown']

print('Counties:')
print(data['COUNTY'].value_counts())
print()
print("Number of rows with COUNTY == 'Unknown' dropped: ", oldNumRows - data.shape[0])

Counties:
HARRIS               322110
Harris County         36673
FORT BEND              6868
Unknown                3201
Fort Bend County        379
MONTGOMERY              185
Montgomery County       120
Galveston County         13
Name: COUNTY, dtype: int64

Counties:
Harris County        358783
Fort Bend County       7247
Montgomery County       305
Galveston County         13
Name: COUNTY, dtype: int64

Number of rows with COUNTY == 'Unknown' dropped:  3201


Drop rows where district is unknown

In [6]:
oldNumRows = data.shape[0]
data = data[data.DISTRICT != 'Unknown']

print("Number of rows dropped: ", oldNumRows - data.shape[0])

Number of rows dropped:  552


Normalize all districts

In [7]:
data['MANAGEMENT DISTRICT'] = data['MANAGEMENT DISTRICT'].replace(['HCID #3 TRACT 19 (Upper Kirby)', 'HCID #3 TRACT 47 (Upper Kirby)', 'East End MD', 'Greater Northside', 'Sharpstown'], ['HCID #3 (Upper Kirby)', 'HCID #3 (Upper Kirby)', 'East End', 'Greater Northside MD', 'Sharpstown MD'])
print('Management Districts:')
print(data['MANAGEMENT DISTRICT'].value_counts().sort_index())

Management Districts:
Airline IMPROVEMENT DISTRICT                   23
Airline PID                                     6
Aldin North Expansion Tract2                    6
Aldine North Expansion Tract1                  18
Aldine PID                                    232
Baybrook MD                                   110
East Downtown MD                             1743
East End                                    16767
FB CAD #6                                       6
Five Corners HCID #10B                      11645
Five Corners Improvement Dist(HCID #10B)       59
Greater Greenspoint MD                       1834
Greater Northside MD                        32037
Greater Southeast MD                        12224
HCID #10-A                                  14007
HCID #12                                        6
HCID #16                                        4
HCID #3 (Upper Kirby)                        1743
HCID #4 (Energy Corridor)                     956
HCID #5 (Brays Oaks)        

Drop all rows with unknown channel types

In [8]:
oldNumRows = data.shape[0]
data = data[data['CHANNEL TYPE'] != 'Unknown']

print("Number of rows dropped: ", oldNumRows - data.shape[0])

Number of rows dropped:  20590


Drop all rows with null values

In [9]:
oldNumRows = data.shape[0]
data = data.dropna()

print("Number of rows dropped: ", oldNumRows - data.shape[0])

Number of rows dropped:  226480


## Feature Transformation

Currently, the request date would be draw conclusions from, so we transformed the feature to instead be the month in which the service request was created, to account seasonal differences in service request time.

In [10]:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
data['SR CREATE DATE'] = data['SR CREATE DATE'].apply(lambda x : datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
data['SR CREATE DATE'] = data['SR CREATE DATE'].apply(lambda x : months[x.month-1])

data = data.rename(columns={'SR CREATE DATE': 'SR MONTH'})
data.head()

Unnamed: 0,COUNTY,DISTRICT,NEIGHBORHOOD,MANAGEMENT DISTRICT,DEPARTMENT,DIVISION,SR TYPE,QUEUE,SLA,SR MONTH,OVERDUE,CHANNEL TYPE
10,Harris County,I,GREATER HOBBY AREA,HCID #9,PWE Public Works Engineering,Street and Drainage,Street Hazard,ROWM_StreetMain,10,January,-8.83,Voice In
16,Harris County,D,SUNNYSIDE,Five Corners HCID #10B,NS Neighborhood Services,Investigations,Nuisance On Property,NS_Dispatch,180,January,134.19,Voice In
17,Harris County,A,SPRING BRANCH NORTH,Spring Branch MD,SWM Solid Waste Management,General Support Services,Dumpster Complaint,SWM_Complaints,21,January,-0.7,Voice In
20,Harris County,J,SHARPSTOWN,Sharpstown MD,PWE Public Works Engineering,PU Public Utilities,Sewer Wastewater,PU_WasteWater,1,January,-0.96,Voice In
22,Harris County,F,ALIEF,International MD,SWM Solid Waste Management,Collections,Container Problem,SWM_CollectionsSW,10,January,-7.64,Voice In


Currently, the overdue is continuous, which makes it hard to use in classification algorithms. To combat this, we want to bin the values, so that there are only a few possible values.

The values we want bin into are on time tasks (negative values), tasks done within a week past due date (0 < x < 7), tasks done within a month past due date (7 <= x < 30), tasks done over a month past due date (x >= 30).

In [11]:
def bin_overdue (val):
    val = float(val)
    
    if val < 0:
        return "On Time"
    elif val < 7:
        return "Week"
    elif val < 30:
        return "Month"
    else:
        return "More"
    
data["OVERDUE"] = data["OVERDUE"].apply(bin_overdue)
    
print(data['OVERDUE'].value_counts().sort_index())
data.head()

Month       14171
More         9399
On Time    107013
Week        18366
Name: OVERDUE, dtype: int64


Unnamed: 0,COUNTY,DISTRICT,NEIGHBORHOOD,MANAGEMENT DISTRICT,DEPARTMENT,DIVISION,SR TYPE,QUEUE,SLA,SR MONTH,OVERDUE,CHANNEL TYPE
10,Harris County,I,GREATER HOBBY AREA,HCID #9,PWE Public Works Engineering,Street and Drainage,Street Hazard,ROWM_StreetMain,10,January,On Time,Voice In
16,Harris County,D,SUNNYSIDE,Five Corners HCID #10B,NS Neighborhood Services,Investigations,Nuisance On Property,NS_Dispatch,180,January,More,Voice In
17,Harris County,A,SPRING BRANCH NORTH,Spring Branch MD,SWM Solid Waste Management,General Support Services,Dumpster Complaint,SWM_Complaints,21,January,On Time,Voice In
20,Harris County,J,SHARPSTOWN,Sharpstown MD,PWE Public Works Engineering,PU Public Utilities,Sewer Wastewater,PU_WasteWater,1,January,On Time,Voice In
22,Harris County,F,ALIEF,International MD,SWM Solid Waste Management,Collections,Container Problem,SWM_CollectionsSW,10,January,On Time,Voice In


## Export data to a CSV

In [12]:
data.to_csv('houston-311-sanitized.csv', index=False)

## Read in Sanitized Data

In [13]:
data = pd.read_csv('houston-311-sanitized.csv', sep=",", header='infer')
print(data.shape)
data.head()

(148949, 12)


Unnamed: 0,COUNTY,DISTRICT,NEIGHBORHOOD,MANAGEMENT DISTRICT,DEPARTMENT,DIVISION,SR TYPE,QUEUE,SLA,SR MONTH,OVERDUE,CHANNEL TYPE
0,Harris County,I,GREATER HOBBY AREA,HCID #9,PWE Public Works Engineering,Street and Drainage,Street Hazard,ROWM_StreetMain,10.0,January,On Time,Voice In
1,Harris County,D,SUNNYSIDE,Five Corners HCID #10B,NS Neighborhood Services,Investigations,Nuisance On Property,NS_Dispatch,180.0,January,More,Voice In
2,Harris County,A,SPRING BRANCH NORTH,Spring Branch MD,SWM Solid Waste Management,General Support Services,Dumpster Complaint,SWM_Complaints,21.0,January,On Time,Voice In
3,Harris County,J,SHARPSTOWN,Sharpstown MD,PWE Public Works Engineering,PU Public Utilities,Sewer Wastewater,PU_WasteWater,1.0,January,On Time,Voice In
4,Harris County,F,ALIEF,International MD,SWM Solid Waste Management,Collections,Container Problem,SWM_CollectionsSW,10.0,January,On Time,Voice In


## Prep the data for modeling

Split the data into features and labels. The label is the overdueness of the 311 request, which we binned earlier.

Because of the way that sklearn handles catagorical data, we had to encode all of the features so that they could be processed correctly.

Also, we downsample the data so that the models will run quicker. We use `train_test_split` here to ensure that the downsampling is down without affecting the class balance.

In [14]:
num_data = data.copy()
cat_columns = ["COUNTY", "DISTRICT", "NEIGHBORHOOD", "MANAGEMENT DISTRICT", "DEPARTMENT", "DIVISION",
              "SR TYPE", "QUEUE", "SR MONTH", "CHANNEL TYPE"]

for c in cat_columns:
    prev_column = num_data[c]
    # Transform catagorical data to one that sklearn can understand
    encoder = preprocessing.LabelEncoder()
    num_data[c] = encoder.fit_transform(prev_column)

print(num_data.head())
train, test = train_test_split(num_data, train_size = 0.005, test_size = 0.005)

   COUNTY  DISTRICT  NEIGHBORHOOD  MANAGEMENT DISTRICT  DEPARTMENT  DIVISION  \
0       1         8            28                   16          14        27   
1       1         3            70                    4          11        16   
2       1         0            68                   30          15        13   
3       1         9            61                   29          14        21   
4       1         5             3                   20          15         1   

   SR TYPE  QUEUE    SLA  SR MONTH  OVERDUE  CHANNEL TYPE  
0      115     60   10.0         4  On Time             4  
1       80     35  180.0         4     More             4  
2       26     67   21.0         4  On Time             4  
3      108     53    1.0         4  On Time             4  
4       17     65   10.0         4  On Time             4  


In [15]:
down_data = train

labels = down_data['OVERDUE']
df_features = down_data.drop(['OVERDUE'], axis=1)
print(labels.value_counts().sort_index())

Month       58
More        52
On Time    537
Week        97
Name: OVERDUE, dtype: int64


## Decision Trees

In [16]:
features_train, features_test, labels_train, labels_test = train_test_split(df_features, labels, test_size = 0.2)
print('Training set size:',len(labels_train))
print('Test set size:    ',len(labels_test))
dt = Pipeline(steps=[('smt',SMOTE()),('dt',tree.DecisionTreeClassifier())])
dtt = dt.fit(features_train, labels_train)
predict_labels_test = dtt.predict(features_test)
print('Accuracy of decision tree classifier:',accuracy_score(labels_test, predict_labels_test))

Training set size: 595
Test set size:     149
Accuracy of decision tree classifier: 0.5503355704697986


In [17]:
predicted_labels = cross_val_predict(dt,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 15   9  23  11]
 [ 11  24  15   2]
 [ 37  29 405  66]
 [ 16   3  50  28]]

Classification report:
               precision    recall  f1-score   support

       Month       0.19      0.26      0.22        58
        More       0.37      0.46      0.41        52
     On Time       0.82      0.75      0.79       537
        Week       0.26      0.29      0.27        97

    accuracy                           0.63       744
   macro avg       0.41      0.44      0.42       744
weighted avg       0.67      0.63      0.65       744



## Support Vector Machines (SVM)

In [18]:
svm = Pipeline(steps=[('scaler',StandardScaler()),('smt',SMOTE()),('reduce_dim',PCA()),('clf',SVC(gamma='scale'))])
param_grid = {
    'clf__kernel':('linear','rbf','poly')
}
pipedsvm = GridSearchCV(svm,param_grid,cv=5,scoring='accuracy', iid=True)
accuracies = cross_val_score(pipedsvm.fit(df_features,labels), df_features, labels, cv=5)
print('Official accuracy:',np.mean(accuracies))

Official accuracy: 0.558167648659607


In [19]:
predicted_labels = cross_val_predict(pipedsvm,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 21  14  11  12]
 [ 11  24  10   7]
 [ 76  44 326  91]
 [ 25  10  25  37]]

Classification report:
               precision    recall  f1-score   support

       Month       0.16      0.36      0.22        58
        More       0.26      0.46      0.33        52
     On Time       0.88      0.61      0.72       537
        Week       0.25      0.38      0.30        97

    accuracy                           0.55       744
   macro avg       0.39      0.45      0.39       744
weighted avg       0.70      0.55      0.60       744



## Neural Networks (NN)

In [20]:
from sklearn.neural_network import MLPClassifier
nn = Pipeline(steps=[('scaler',StandardScaler()),('smt',SMOTE()),('clf',MLPClassifier())])
param_grid = {
    'clf__hidden_layer_sizes':((50,),(60,)),
    'clf__activation':('logistic','tanh','relu')
}
pipednn = GridSearchCV(nn,param_grid,cv=5,scoring='accuracy', iid=True)
accuracies = cross_val_score(pipednn.fit(df_features,labels), df_features, labels, cv=5)
print('Official accuracy:',np.mean(accuracies))

Official accuracy: 0.5500043833534847


In [21]:
predicted_labels = cross_val_predict(pipednn,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 16  12  11  19]
 [ 12  23   9   8]
 [ 67  34 340  96]
 [ 21   9  27  40]]

Classification report:
               precision    recall  f1-score   support

       Month       0.14      0.28      0.18        58
        More       0.29      0.44      0.35        52
     On Time       0.88      0.63      0.74       537
        Week       0.25      0.41      0.31        97

    accuracy                           0.56       744
   macro avg       0.39      0.44      0.40       744
weighted avg       0.70      0.56      0.61       744



## k-Nearest Neighbor (KNN)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
    'knn__n_neighbors': list(range(1, 25))
}
knn = Pipeline(steps=[('scaler', StandardScaler()),('smt',SMOTE()),('pca', PCA()), ('knn',KNeighborsClassifier(n_neighbors=7))])
pipedknn = GridSearchCV(knn,param_grid,cv=5,scoring='accuracy', iid=True)
accuracies = cross_val_score(pipedknn.fit(df_features,labels), df_features, labels, cv=5)
print('Average accuracy:',np.mean(accuracies))

Average accuracy: 0.6023977552359767


In [23]:
predicted_labels = cross_val_predict(pipedknn,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 10   8  25  15]
 [ 11  16  21   4]
 [ 41  34 384  78]
 [ 11  10  41  35]]

Classification report:
               precision    recall  f1-score   support

       Month       0.14      0.17      0.15        58
        More       0.24      0.31      0.27        52
     On Time       0.82      0.72      0.76       537
        Week       0.27      0.36      0.31        97

    accuracy                           0.60       744
   macro avg       0.36      0.39      0.37       744
weighted avg       0.65      0.60      0.62       744



## Naive Bayes
Naive Bayes is not a good option for this dataset due to correlation between attributes. For example, requests from the same neighborhood would be from the same county.

In [24]:
from sklearn.naive_bayes import GaussianNB
nb = Pipeline(steps=[('smt',SMOTE()),('nb',GaussianNB())])
accuracies = cross_val_score(nb,df_features,labels,cv=10)
print('Average accuracy:',np.mean(accuracies))

Average accuracy: 0.36450096130737797


In [25]:
predicted_labels = cross_val_predict(nb,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 38   6   2  12]
 [ 26  13   6   7]
 [124  48 187 178]
 [ 53   5   6  33]]

Classification report:
               precision    recall  f1-score   support

       Month       0.16      0.66      0.25        58
        More       0.18      0.25      0.21        52
     On Time       0.93      0.35      0.51       537
        Week       0.14      0.34      0.20        97

    accuracy                           0.36       744
   macro avg       0.35      0.40      0.29       744
weighted avg       0.72      0.36      0.43       744



## AdaBoost

In [26]:
from sklearn.ensemble import AdaBoostClassifier
ab = Pipeline(steps=[('smt',SMOTE()),('ab',AdaBoostClassifier(n_estimators=150))])
accuracies = cross_val_score(ab, df_features, labels, cv=5)
print('Average accuracy:',np.mean(accuracies))

Average accuracy: 0.38206423073972745


In [27]:
predicted_labels = cross_val_predict(ab,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 22   9  14  13]
 [  9  28  10   5]
 [118  26 234 159]
 [ 29   6  25  37]]

Classification report:
               precision    recall  f1-score   support

       Month       0.12      0.38      0.19        58
        More       0.41      0.54      0.46        52
     On Time       0.83      0.44      0.57       537
        Week       0.17      0.38      0.24        97

    accuracy                           0.43       744
   macro avg       0.38      0.43      0.36       744
weighted avg       0.66      0.43      0.49       744



## Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('smt',SMOTE()),('rf',RandomForestClassifier(n_estimators=10))])
accuracies = cross_val_score(rf.fit(df_features,labels), df_features, labels, cv=5)
print('Official accuracy:',np.mean(accuracies))

Official accuracy: 0.6533691550720879


In [47]:
predicted_labels = cross_val_predict(pipedrf,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 15  12  20  11]
 [ 11  27  12   2]
 [ 38  24 428  47]
 [ 18   4  52  23]]

Classification report:
               precision    recall  f1-score   support

       Month       0.18      0.26      0.21        58
        More       0.40      0.52      0.45        52
     On Time       0.84      0.80      0.82       537
        Week       0.28      0.24      0.26        97

    accuracy                           0.66       744
   macro avg       0.42      0.45      0.43       744
weighted avg       0.68      0.66      0.67       744



## Voting Classifier
SVM, Random Forest, AdaBoost, KNN, and Neural Nets are combined in a voting classifier.

In [52]:
votingclf = Pipeline(steps=[('smt',SMOTE()),('vc',VotingClassifier(estimators=[('svc', pipedsvm), ('rf', rf), ('knn', pipedknn), ('nn', pipednn)], weights=[1,2,1,1], voting='hard'))])
accuracies = cross_val_score(votingclf, df_features, labels, cv=5)
print('Average accuracy:',np.mean(accuracies))

Average accuracy: 0.6195258307368052


In [53]:
predicted_labels = cross_val_predict(votingclf,df_features,labels,cv=5)
print('Confusion matrix:\n',confusion_matrix(labels, predicted_labels))
print('\nClassification report:\n',classification_report(labels, predicted_labels))

Confusion matrix:
 [[ 14  11  19  14]
 [ 10  20  18   4]
 [ 48  36 395  58]
 [ 21   5  43  28]]

Classification report:
               precision    recall  f1-score   support

       Month       0.15      0.24      0.19        58
        More       0.28      0.38      0.32        52
     On Time       0.83      0.74      0.78       537
        Week       0.27      0.29      0.28        97

    accuracy                           0.61       744
   macro avg       0.38      0.41      0.39       744
weighted avg       0.67      0.61      0.64       744



In [54]:
filename = 'finalized_model.sav'
pickle.dump(votingclf, open(filename, 'wb'))