# Houston 311 Service Requests Data Mining Project

## Group members
Stephen Huang

Levi Villarreal

Joshua Wong

Andrew Young


### Code Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
import time
import datetime
import zipfile

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import pickle
import warnings
warnings.simplefilter('ignore')

### Import data

In [None]:

# Source of the raw extract: http://www.houstontx.gov/311/
# The file is read with header=None, so the column names are supplied here.
names = [
    "CASE NUMBER",
    "SR LOCATION",
    "COUNTY",
    "DISTRICT",
    "NEIGHBORHOOD",
    "TAX ID",
    "TRASH QUAD",
    "RECYCLE QUAD",
    "TRASH DAY",
    "HEAVY TRASH DAY",
    "RECYCLE DAY",
    "KEY MAP",
    "MANAGEMENT DISTRICT",
    "DEPARTMENT",
    "DIVISION",
    "SR TYPE",
    "QUEUE",
    "SLA",
    "STATUS",
    "SR CREATE DATE",
    "DUE DATE",
    "DATE CLOSED",
    "OVERDUE",
    "TITLE",
    "x",
    "y",
    "LATITUDE",
    "LONGITUDE",
    "CHANNEL TYPE",
]

# Unpack the zipped extract (relative to the working directory) before parsing.
with zipfile.ZipFile("311-Public-Data-Extract-2018-clean.txt.zip", "r") as archive:
    archive.extractall("")

# The extract is pipe-delimited.
data = pd.read_csv(
    '311-Public-Data-Extract-2018-clean.txt',
    sep="|",
    header=None,
    names=names,
)
print(data.shape)
data.head()

## Dimensionality Reduction
Right off the bat, we can see that there are many features that do not matter at all to our project.
We can go ahead and get rid of these to help guard against the curse of dimensionality.

In [None]:
# Columns judged irrelevant to predicting overdueness; removed in one pass.
dropped_columns = [
    'SR LOCATION',
    'CASE NUMBER',
    'TRASH QUAD',
    'RECYCLE QUAD',
    'TRASH DAY',
    'HEAVY TRASH DAY',
    'RECYCLE DAY',
    'TAX ID',
    'KEY MAP',
    'DUE DATE',
    'DATE CLOSED',
    'TITLE',
    'x',
    'y',
    'LONGITUDE',
    'LATITUDE',
]
data = data.drop(columns=dropped_columns)
print(data.shape)

data.head()

### Reasoning behind dropping the attributes we did

SR Location - Same reasoning as case number.

Case number - This was dropped because the case number is a unique identifier assigned to every 311 request, and thus could not be used to classify anything.

Trash Quad - There are requests that are not related to trash collection. Those requests might be misclassified because they are in a different trash quad than other requests in the dataset.

Recycle Quad - Same as Trash Quad

Trash Day - Same as Trash Quad

Recycle day - Same as Trash Quad

Heavy Trash Day - Same as Trash Quad
 
Tax ID - This ID is unique to the property that requested it.
 
Key Map - This attribute is used in the city's internal system to keep track of requests and does not have any actual relation to the request.
 
Due Date - We are interested in predicting overdueness, and that is its own separate column.
 
Date Closed - Same reasoning as due date.
 
Title - This is another unique identifier assigned to every 311 request, and thus could not be used to classify anything.
 
x - We already have latitude, and this attribute is an approximation of latitude.
 
y - We already have longitude, and this attribute is an approximation of longitude.

latitude - This is too specific for our needs, and we have other location metrics.
 
longitude - same as latitude

## Data Cleaning

Drop all service requests that are not closed

In [None]:
# Remember the starting row count so the number of removed rows can be reported.
oldNumRows = len(data)

# Keep only closed requests; STATUS is then constant, so the column is dropped.
data = data.loc[data['STATUS'] == 'Closed']
data = data.drop(columns=['STATUS'])

print("Number of rows dropped: ", oldNumRows - len(data))

Normalize all counties and drop rows where county is unknown

In [None]:
# Show the raw county spellings before normalizing them.
print('Counties:')
print(data['COUNTY'].value_counts())
print()

oldNumRows = len(data)

# Map the upper-case spellings onto the "<Name> County" form, then discard
# rows whose county is the literal 'Unknown'.
county_aliases = {
    'HARRIS': 'Harris County',
    'FORT BEND': 'Fort Bend County',
    'MONTGOMERY': 'Montgomery County',
}
data['COUNTY'] = data['COUNTY'].replace(county_aliases)
data = data.loc[data['COUNTY'] != 'Unknown']

# Show the counts again so the effect of the cleanup is visible.
print('Counties:')
print(data['COUNTY'].value_counts())
print()
print("Number of rows with COUNTY == 'Unknown' dropped: ", oldNumRows - len(data))

Drop rows where district is unknown

In [None]:
# Discard rows whose DISTRICT is the literal 'Unknown' and report how many went.
oldNumRows = len(data)
has_known_district = data['DISTRICT'] != 'Unknown'
data = data.loc[has_known_district]

print("Number of rows dropped: ", oldNumRows - len(data))

Normalize all management districts

In [None]:
# Collapse variant spellings of the same management district onto one label.
management_district_aliases = {
    'HCID #3 TRACT 19 (Upper Kirby)': 'HCID #3 (Upper Kirby)',
    'HCID #3 TRACT 47 (Upper Kirby)': 'HCID #3 (Upper Kirby)',
    'East End MD': 'East End',
    'Greater Northside': 'Greater Northside MD',
    'Sharpstown': 'Sharpstown MD',
}
data['MANAGEMENT DISTRICT'] = data['MANAGEMENT DISTRICT'].replace(management_district_aliases)

# Sorted by name so near-duplicate spellings appear next to each other.
print('Management Districts:')
district_counts = data['MANAGEMENT DISTRICT'].value_counts()
print(district_counts.sort_index())

Drop all rows with unknown channel types

In [None]:
# Discard rows whose CHANNEL TYPE is the literal 'Unknown' and report how many went.
oldNumRows = len(data)
has_known_channel = data['CHANNEL TYPE'] != 'Unknown'
data = data.loc[has_known_channel]

print("Number of rows dropped: ", oldNumRows - len(data))

Drop all rows with null values

In [None]:
# Remove every row that still has a missing value in any remaining column.
oldNumRows = len(data)
data = data.dropna(axis=0, how='any')

print("Number of rows dropped: ", oldNumRows - len(data))

## Feature Transformation

Currently, the request creation date is too fine-grained to draw conclusions from, so we transformed the feature to instead be the month in which the service request was created, to account for seasonal differences in service request time.

In [None]:
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]


def _month_name(raw_stamp):
    """Return the month name for a 'YYYY-MM-DD HH:MM:SS' timestamp string."""
    parsed = datetime.datetime.strptime(raw_stamp, '%Y-%m-%d %H:%M:%S')
    # datetime months are 1-based; the list above is 0-based.
    return months[parsed.month - 1]


# Replace each creation timestamp with the name of its month, then rename the
# column to reflect what it now holds.
data['SR CREATE DATE'] = data['SR CREATE DATE'].apply(_month_name)
data = data.rename(columns={'SR CREATE DATE': 'SR MONTH'})
data.head()

Currently, the overdue value is continuous, which makes it hard to use in classification algorithms. To combat this, we want to bin the values, so that there are only a few possible values.

The values we want to bin into are on-time tasks (negative values), tasks done within a week past the due date (0 <= x < 7), tasks done within a month past the due date (7 <= x < 30), and tasks done over a month past the due date (x >= 30).

In [None]:
def bin_overdue(val):
    """Map a numeric overdue value onto one of four lateness labels.

    The value is converted with ``float`` first, so numeric strings are
    accepted. Bins (lower bound inclusive, upper bound exclusive):

    * below 0      -> "On Time"
    * 0 up to 7    -> "Week"
    * 7 up to 30   -> "Month"
    * 30 and above -> "More"

    A value that compares false against every bound (e.g. NaN) also ends up
    in "More".
    """
    amount = float(val)

    # Bounds are checked in ascending order; the first one the value falls
    # under decides the label.
    for upper_bound, label in ((0, "On Time"), (7, "Week"), (30, "Month")):
        if amount < upper_bound:
            return label
    return "More"
    
# Replace the continuous overdue values with their lateness label.
data["OVERDUE"] = data["OVERDUE"].map(bin_overdue)

# Class distribution after binning, ordered by label name.
overdue_counts = data['OVERDUE'].value_counts()
print(overdue_counts.sort_index())
data.head()

## Export data to a CSV

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
sanitized_csv_path = 'houston-311-sanitized.csv'
data.to_csv(sanitized_csv_path, index=False)

## Read in Sanitized Data

In [None]:
# Reload the cleaned data written by the export step; the first row of the
# file is used as the column names.
data = pd.read_csv(
    'houston-311-sanitized.csv',
    sep=",",
    header='infer',
)
print(data.shape)
data.head()

## Prep the data for modeling

Split the data into features and labels. The label is the overdueness of the 311 request, which we binned earlier.

Because of the way that sklearn handles categorical data, we had to encode all of the features so that they could be processed correctly.

Also, we downsample the data so that the models will run quicker. We use `train_test_split` here to ensure that the downsampling is done without affecting the class balance.

In [None]:
# Work on a copy so the string-valued frame in `data` is left untouched.
num_data = data.copy()
cat_columns = ["COUNTY", "DISTRICT", "NEIGHBORHOOD", "MANAGEMENT DISTRICT", "DEPARTMENT", "DIVISION",
              "SR TYPE", "QUEUE", "SR MONTH", "CHANNEL TYPE"]

for c in cat_columns:
    # Transform categorical data into integer codes that sklearn can consume.
    # NOTE(review): LabelEncoder assigns arbitrary integer codes, which the
    # distance/scale-based models (SVM, KNN, NN) will read as an ordering;
    # one-hot encoding may be a better fit -- confirm before changing.
    encoder = preprocessing.LabelEncoder()
    num_data[c] = encoder.fit_transform(num_data[c])

print(num_data.head())

# Downsample with train_test_split. Stratifying on the label is what actually
# keeps the class proportions of the sample equal to those of the full data;
# without it the split is a plain random sample.
train, test = train_test_split(
    num_data,
    train_size=0.95,
    test_size=0.05,
    stratify=num_data['OVERDUE'],
)

In [None]:
# The downsampled portion is what all of the models below are trained on.
down_data = train

# OVERDUE is the target; every other column is a feature.
df_features = down_data.drop(columns=['OVERDUE'])
labels = down_data['OVERDUE']

label_counts = labels.value_counts()
print(label_counts.sort_index())

## Decision Trees

In [None]:
# Hold out 20% of the downsampled data to score the tree.
features_train, features_test, labels_train, labels_test = train_test_split(
    df_features,
    labels,
    test_size=0.2,
)
print('Training set size:', len(labels_train))
print('Test set size:    ', len(labels_test))

# Fit a decision tree with default settings and score it on the held-out part.
dt = tree.DecisionTreeClassifier().fit(features_train, labels_train)
predict_labels_test = dt.predict(features_test)
dt_accuracy = accuracy_score(labels_test, predict_labels_test)
print('Accuracy of decision tree classifier:', dt_accuracy)

## Support Vector Machines (SVM)

In [None]:
# Scale -> PCA -> SVC pipeline; the grid search tunes the number of PCA
# components and the SVC kernel.
svm = Pipeline(steps=[('scaler', StandardScaler()), ('reduce_dim', PCA()), ('clf', SVC(gamma='scale'))])
param_grid = {
    'reduce_dim__n_components': list(range(5, 11)),
    'clf__kernel': ('linear', 'rbf', 'poly')
}
# `iid` is no longer passed: the argument was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
pipedsvm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')

# Fit once on the full sample so the search results can be inspected afterwards.
pipedsvm.fit(df_features, labels)

# cross_val_score refits the whole grid search inside each outer fold
# (nested cross-validation).
accuracies = cross_val_score(pipedsvm, df_features, labels, cv=10)
print('Official accuracy:', np.mean(accuracies))

## Neural Networks (NN)

In [None]:
from sklearn.neural_network import MLPClassifier

# Scale -> MLP pipeline; the grid search tunes the single hidden layer's width
# and the activation function.
nn = Pipeline(steps=[('scaler', StandardScaler()), ('clf', MLPClassifier())])
param_grid = {
    'clf__hidden_layer_sizes': ((30,), (40,), (50,), (60,)),
    'clf__activation': ('logistic', 'tanh', 'relu')
}
# `iid` is no longer passed: the argument was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
pipednn = GridSearchCV(nn, param_grid, cv=5, scoring='accuracy')

# Fit once on the full sample so the search results can be inspected afterwards.
pipednn.fit(df_features, labels)

# cross_val_score refits the whole grid search inside each outer fold
# (nested cross-validation).
accuracies = cross_val_score(pipednn, df_features, labels, cv=5)
print('Official accuracy:', np.mean(accuracies))

## k-Nearest Neighbor (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# The grid search tries every neighbour count from 1 to 24.
param_grid = {
    'knn__n_neighbors': list(range(1, 25))
}
# Scale -> PCA -> KNN pipeline. The n_neighbors=7 given here is only the
# starting value; the grid search overrides it for every candidate.
knn = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()), ('knn', KNeighborsClassifier(n_neighbors=7))])
# `iid` is no longer passed: the argument was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
pipedknn = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Fit once on the full sample so the search results can be inspected afterwards.
pipedknn.fit(df_features, labels)

# cross_val_score refits the whole grid search inside each outer fold
# (nested cross-validation).
accuracies = cross_val_score(pipedknn, df_features, labels, cv=5)
print('Average accuracy:', np.mean(accuracies))

## Naive Bayes
Naive Bayes is not a good option for this dataset due to correlation between attributes. For example, requests from the same neighborhood would be from the same county.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, scored by 10-fold cross-validation.
nb = GaussianNB()
accuracies = cross_val_score(nb, df_features, labels, cv=10)
mean_accuracy = np.mean(accuracies)
print('Average accuracy:', mean_accuracy)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 150 boosting rounds, scored by 5-fold cross-validation.
ab = AdaBoostClassifier(n_estimators=150)
accuracies = cross_val_score(ab, df_features, labels, cv=5)
mean_accuracy = np.mean(accuracies)
print('Average accuracy:', mean_accuracy)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small forest (10 trees); the grid search tunes tree depth, minimum leaf size
# and the number of features considered per split.
rf = RandomForestClassifier(n_estimators=10)
param_grid = {
    'max_depth': list(range(35, 55)),
    'min_samples_leaf': (8, 10, 12),
    'max_features': ('sqrt', 'log2')
}
# `iid` is no longer passed: the argument was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError.
pipedrf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')

# Fit once on the full sample so the search results can be inspected afterwards.
pipedrf.fit(df_features, labels)

# cross_val_score refits the whole grid search inside each outer fold
# (nested cross-validation).
accuracies = cross_val_score(pipedrf, df_features, labels, cv=5)
print('Official accuracy:', np.mean(accuracies))

## Voting Classifier
SVM, Random Forest, AdaBoost, KNN, and Neural Nets are combined in a voting classifier.

In [None]:
# Members of the majority-vote ensemble.
# NOTE(review): 'rf' is the plain RandomForestClassifier, not the grid-searched
# `pipedrf`, while SVM/KNN/NN use their grid-searched versions -- confirm this
# is intentional.
ensemble_members = [
    ('svc', pipedsvm),
    ('rf', rf),
    ('abdt', ab),
    ('knn', pipedknn),
    ('nn', pipednn),
]
votingclf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Score the ensemble with 5-fold cross-validation.
accuracies = cross_val_score(votingclf, df_features, labels, cv=5)
mean_accuracy = np.mean(accuracies)
print('Average accuracy:', mean_accuracy)

In [None]:
# Save the final model to disk.
filename = 'finalized_model.sav'

# The original cell pickled an undefined name `model`. cross_val_score only
# fits clones, so `votingclf` itself is still unfitted at this point; fit it
# on the downsampled data before saving.
# NOTE(review): assumes the voting classifier is the intended final model --
# confirm.
model = votingclf.fit(df_features, labels)

# The context manager closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)