In [1]:
%matplotlib inline
import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder
import numpy as np
import altair as alt
import seaborn as sns
import math

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn import metrics

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.linear_model import LinearRegression
from sklearn import svm


plt.style.use('ggplot')

# Crime Classification in San Francisco
## Multi-class classification
https://www.kaggle.com/c/sf-crime

"From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred."


![image.png](attachment:image.png)

In [2]:
# 878049 rows x 9 columns
data = pd.read_csv('crimetrain.csv', parse_dates=['Dates'])
# train = pd.read_csv('crimetrain.csv')
data.head()

FileNotFoundError: File b'crimetrain.csv' does not exist

In [None]:
data.info()

In [None]:
print("Num of Categories: ", data['Category'].nunique())
print("Num of Descripts: ", data['Descript'].nunique())

In [None]:
data.Resolution.unique()

- TREA = Trespassing or loitering near posted industrial property

# Exploratory Data Analysis

In [None]:
encodeddata = pd.read_csv('crimetrain.csv')
labelencoder = LabelEncoder()
for col in encodeddata.columns:
    encodeddata[col] = labelencoder.fit_transform(encodeddata[col])

In [None]:
plt.figure(figsize = (16,5))
ax = sns.heatmap(encodeddata.corr(), annot=True)

## What is the most popular crime?

In [None]:
data = pd.read_csv('crimetrain.csv', parse_dates=['Dates'])

In [None]:
popcrime = data.groupby('Category').count().reset_index()
popcrime = popcrime.drop(['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X'], axis=1)
popcrime = popcrime.rename(columns={'Y':'count'}).sort_values(by='count', ascending=False)
popcrime.plot.bar(x='Category', y='count', figsize=(15, 8))
plt.ylabel("count")

## What district is the most dangerous?

In [None]:
dangerous = data.groupby('PdDistrict').count().reset_index()
dangerous = dangerous.drop(['Dates', 'Category', 'Descript', 'DayOfWeek', 'Resolution', 'Address', 'X'], axis=1)
dangerous = dangerous.rename(columns={'Y':'num_of_crimes'}).sort_values(by='num_of_crimes', ascending=False)
dangerous.plot.bar(x='PdDistrict', y='num_of_crimes', figsize=(10, 5))
plt.ylabel('Number of Crimes')

## PdDistrict and Category

In [None]:
cpddist = data.groupby(['Category', 'PdDistrict']).count().reset_index()
for category in cpddist['Category'].unique():
    ddata = cpddist[cpddist['Category'] == category]
    ddata.plot.bar(x='PdDistrict', y='Dates') # doesn't matter which, just looking at count
    plt.xlabel(category)
    plt.ylabel('Count')

## Percentage of crime by PdDistrict

In [None]:
data.head()

In [None]:
# adding a total column
perct = pd.crosstab([data.Category], data.PdDistrict).reset_index()
perct['total'] = perct.sum(axis=1)

# calculating percent for each row        
for district in train.PdDistrict.unique():
    perct[district+'%'] = perct.apply(lambda perct: perct[district]/perct.total*100, axis=1)

# dropping unncessary columns
perct = perct.drop(['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'total'], axis=1)
perct.head()

In [None]:
perct.plot.bar(x='Category', y=['NORTHERN%', 'PARK%', 'INGLESIDE%', 'BAYVIEW%', 'RICHMOND%', 'CENTRAL%', 'TARAVAL%', 'TENDERLOIN%', 'MISSION%', 'SOUTHERN%'], stacked=True, figsize=(21,10))
plt.ylabel('% of PdDistrict')

In [None]:
perct.plot.bar(x='Category', y=['NORTHERN%', 'PARK%', 'INGLESIDE%', 'BAYVIEW%', 'RICHMOND%', 'CENTRAL%', 'TARAVAL%', 'TENDERLOIN%', 'MISSION%', 'SOUTHERN%'], figsize=(21,10))
plt.ylabel('% of PdDistrict')

In [None]:
# what district is the hightest for prostitution? --it's hard to tell looking at the graph so let's pull up the table
p = data[data['Category'] == 'PROSTITUTION'].groupby(['Category', 'PdDistrict']).count().reset_index().sort_values(by='Dates', ascending=False)
p = p.drop(['Dates', 'Descript', 'DayOfWeek', 'Resolution', 'Address', 'X'], axis=1)
p = p.rename(columns={'Y':'count'}).sort_values(by='count', ascending=False)
p

### Takeaways
- PdDistrict may not be the best indicator of Category, but there are definitely some PdDistricts where certain crimes occur more so than others

## Category and Descript

In [None]:
descript = data.groupby(['Category', 'Descript']).count().reset_index()
descript = descript.drop(['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X'], axis=1)
descript = descript.rename(columns={'Y':'count'}).sort_values(by='count', ascending=False)
descript.head(n=10)

## Time Series  

In [None]:
timedata = pd.read_csv('crimetrain.csv', parse_dates=['Dates'])

In [None]:
timedata['Dates_year'] = timedata['Dates'].dt.year
timedata['Dates_month'] = timedata['Dates'].dt.month
timedata['Dates_day'] = timedata['Dates'].dt.day
timedata['Dates_hour'] = timedata['Dates'].dt.hour

fig, ((axis1,axis2),(axis3,axis4)) = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(18,6)

sns.countplot(data=timedata, x='Dates_year', ax=axis1)
sns.countplot(data=timedata, x='Dates_month', ax=axis2)
sns.countplot(data=timedata, x='Dates_day', ax=axis3)
sns.countplot(data=timedata, x='Dates_hour', ax=axis4)

### Takeaways
- Why are the number of crimes for 2015 so low?
- Number of crimes relatively the same from day to day except for the first of the month and the 31 (because not all months have 31 days)
- Crimes by hour pretty interesting. Descreases from midnight to 5:00am and steadily increases from 8:00am to 6:00pm with a spike at noon

In [None]:
cpddist = data.groupby(['Category', 'DayOfWeek']).count().reset_index()
for category in cpddist['Category'].unique():
    ddata = cpddist[cpddist['Category'] == category]
    ddata.plot.bar(x='DayOfWeek', y='Dates')
    plt.xlabel(category)
    plt.ylabel('Count')

## Number of cases bi-weekly

In [None]:
timedata['Dates_week'] = timedata['Dates'].dt.week
weekly = timedata[['Dates_week', 'Dates_year']]
weekly = pd.crosstab([weekly.Dates_week], weekly.Dates_year).reset_index()
grab_dates = weekly.iloc[:, 1:]
weekly.plot(x='Dates_week', y=grab_dates.columns, figsize=(20, 8))

### Takeaways
- The dataset does not contain data past June 2015
- Crime seems to decrease just a bit in the beginning of summer

## Number of cases hourly by PdDistrict

In [None]:
hourly = timedata[['Dates_hour', 'PdDistrict']]
hourly = pd.crosstab([hourly.Dates_hour], hourly.PdDistrict).reset_index()
grab_dists = hourly.iloc[:, 1:]
hourly.plot(x='Dates_hour', y=grab_dists.columns, figsize=(20, 8))
# plt.xticks(np.arange(min(x), max(x)+1, 1.0))

In [None]:
hourly = timedata[['Dates_hour', 'PdDistrict']]
pd.crosstab([hourly.Dates_hour], hourly.PdDistrict).reset_index().sort_values(by='SOUTHERN', ascending=False).head()

### Takeaways
- The number of crimes fall between midnight and 5:00am for each district
- Number of crimes increases from 5:00am to 8:00am, level off for a little bit until noon
- The number of crimes jumps up around noon
- The greatest number of crimes occur around 6:00pm
- Ingleside seems to be the only (sort of) anomaly 

## Top 10 address with the most crimes across the years

In [None]:
addryear = timedata[(timedata.Address == '800 Block of BRYANT ST') | (timedata.Address == '800 Block of MARKET ST') | 
             (timedata.Address == '2000 Block of MISSION ST') | (timedata.Address == '1000 Block of POTRERO AV') | 
             (timedata.Address == '900 Block of MARKET ST') | (timedata.Address == '0 Block of TURK ST') |
             (timedata.Address == '0 Block of 6TH ST') | (timedata.Address == '300 Block of ELLIS ST') |
             (timedata.Address == '400 Block of ELLIS ST') | (timedata.Address == '16TH ST / MISSION ST')]
addressyear = addryear[['Dates_year', 'Address']]
addressyear = pd.crosstab([addryear.Dates_year], addryear.Address).reset_index()
grab_addresses = addressyear.iloc[:, 1:]
addressyear.plot(x='Dates_year', y=grab_addresses.columns, figsize=(20, 9))
plt.ylabel('Number of Crimes')

In [None]:
timedata.groupby(['Category', 'PdDistrict', 'Address']).count().reset_index().sort_values(by='Dates', ascending=False)

### Takeaways
- The number of crimes by address stays relatively the same (at least for the top 10 most crime ridden areas) over the years except for 800 Block of BRYANT ST and 800 Block of MARKET ST whicy appear to have a lot of peeks and valleys. This may have to do with the fact that both of these addresses are part of the Southern district--the distict known for having the most criminal actitivity.

# Testing

In [None]:
datareorder = data[['Category', 'Descript', 'Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']]
datareorder.head(n=1)

In [None]:
labelencoder = LabelEncoder()
for col in datareorder.columns:
    datareorder[col] = labelencoder.fit_transform(datareorder[col])
datareorder.head()

## GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
model_naive = GaussianNB()

In [None]:
X = datareorder.iloc[:, 1:] 
y = datareorder.iloc[:, 0]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
model_naive.fit(X_train, y_train)
nb_pred = model_naive.predict(X_test)

In [None]:
accuracy_score(y_test, nb_pred)

In [None]:
confusion_matrix(y_test, nb_pred)
# true positive, false negative
# false positive, true negative

In [None]:
print(classification_report(y_test, nb_pred, target_names = data['Category'].unique()))

## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier(max_features="auto")

In [None]:
model_tree.fit(X_train, y_train)
tree_pred = model_tree.predict(X_test)

In [None]:
accuracy_score(y_test, tree_pred)

In [None]:
confusion_matrix(y_test, tree_pred)

In [None]:
print(classification_report(y_test, tree_pred, target_names=train['Category'].unique()))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model_forest = RandomForestClassifier(max_features="auto") # max_features=20

In [None]:
model_forest.fit(X_train, y_train)
forest_pred = model_forest.predict(X_test)

In [None]:
accuracy_score(y_test, forest_pred)

In [None]:
x = confusion_matrix(y_test, forest_pred)
for item in x:
    print(x)

In [None]:
print(classification_report(y_test, forest_pred, target_names=train['Category'].unique()))

## Does one-hot encoding improve results? --> not sure yet

In [None]:
# onehot = train
# onehot = onehot.drop(['Descript', 'X', 'Y', 'Resolution'], axis=1)
# for col_name in onehot.columns:
#     encoded = pd.get_dummies(onehot[col_name])
#     new_labels = [col_name + str(num) for num in encoded.columns]
#     encoded.columns = new_labels
#     onehot = onehot.join(encoded)

In [None]:
dask = dd.read_csv("crimetrain.csv", parse_dates=['Dates'])
dask.categorize(columns=['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address'])

In [None]:
enc = DummyEncoder(["Category","Descript","DayOfWeek","PdDistrict", "Resolution", "Address", "Dates", "X", "Y"])

In [None]:
enc

In [None]:
# dask = enc.fit_transform(dask)

## Just for fun; how does dropping the Descript column (the column having the highest correlation with Category) compare to the previous Random Forest model?

In [None]:
dropDes = trainreorder.drop(["Descript"], axis=1)
dropDes.head()

In [None]:
XdropDes = dropDes.iloc[:, 1:] 
ydropDes = dropDes.iloc[:, 0]

In [None]:
XdropDes_train, XdropDes_test, ydropRes_train, ydropDes_test = train_test_split(XdropDes, ydropDes, test_size=0.2)

In [None]:
model_forestDropDes = RandomForestClassifier(max_features="auto") # max_features=20
model_forestDropDes.fit(XdropDes, ydropDes)
forest_predDropDes = model_forestDropDes.predict(XdropDes_test)

In [None]:
accuracy_score(ydropDes_test, forest_predDropDes)

In [None]:
confusion_matrix(ydropDes_test, forest_predDropDes)

In [None]:
print(classification_report(ydropDes_test, forest_predDropDes, target_names=train['Category'].unique()))

## Summary

Out of the GaussianNB, Decision Tree, and Random Forest classifiers, the Random Forest Classifier performed the best concerning accuracy with a score of .92 (remarkably better than the Gaussian's and Decision Tree's .44 and .77 respectively), and most likely the confusion matrix as well, though it is difficult to be certain as the confusion matrix is so big that it is hard to print out in a readable format. Additionally, after removing the Descript column, the Random Forest classifier did noteably worse (accuracy score of .84) as expected given that Descript had the highest correlation with the Category variable that we were trying to predict. One hot encoding everything still needs to be desired as well as figuring out the issues I was encountering with the SVM classifier.