# **Predicting “Successful” Criminal Activity in New York City**

Classification model that predicts whether a crime in NYC will be “successful” or “unsuccessful.”

## **1. Load dataset**

In [34]:
import pandas as pd

# Show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

# Read the complaint records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='NYPD_Complaint_Data_Current_YTD.csv')

## **2. Data preprocessing**

### Remove unnecessary columns

In [35]:
# Columns that are discarded before the feature matrix is built
columns_remove = [
    'CMPLNT_NUM',
    'ADDR_PCT_CD',
    'BORO_NM',
    'CMPLNT_FR_DT',
    'CMPLNT_FR_TM',
    'CMPLNT_TO_DT',
    'CMPLNT_TO_TM',
    'HADEVELOPT',
    'HOUSING_PSA',
    'JURISDICTION_CODE',
    'JURIS_DESC',
    'KY_CD',
    'LOC_OF_OCCUR_DESC',
    'PARKS_NM',
    'PATROL_BORO',
    'PD_CD',
    'RPT_DT',
    'STATION_NAME',
    'TRANSIT_DISTRICT',
    'X_COORD_CD',
    'Y_COORD_CD',
    'Latitude',
    'Longitude',
    'Lat_Lon',
]

# Drop by column label; the result replaces the original frame
df = df.drop(columns=columns_remove)

# Preview the first rows of the remaining columns
df.head()

Unnamed: 0,CRM_ATPT_CPTD_CD,LAW_CAT_CD,OFNS_DESC,PD_DESC,PREM_TYP_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,COMPLETED,MISDEMEANOR,DANGEROUS WEAPONS,"WEAPONS, POSSESSION, ETC",STREET,,,,UNKNOWN,UNKNOWN,E
1,COMPLETED,FELONY,RAPE,RAPE 2,RESIDENCE - PUBLIC HOUSING,18-24,UNKNOWN,M,<18,BLACK,F
2,COMPLETED,MISDEMEANOR,OFF. AGNST PUB ORD SENSBLTY &,AGGRAVATED HARASSMENT 2,RESIDENCE-HOUSE,25-44,BLACK,M,18-24,BLACK,F
3,COMPLETED,FELONY,ROBBERY,"ROBBERY,DELIVERY PERSON",RESIDENCE - APT. HOUSE,UNKNOWN,WHITE HISPANIC,M,25-44,WHITE HISPANIC,M
4,COMPLETED,VIOLATION,HARRASSMENT 2,"HARASSMENT,SUBD 3,4,5",RESIDENCE - PUBLIC HOUSING,45-64,WHITE HISPANIC,F,25-44,BLACK,F


### Remove rows with null values

In [36]:
import numpy as np

# Values that are treated as missing: 'UNKNOWN' plus the single-letter codes
# 'E', 'D' and 'U'. Only cells whose whole value equals one of these markers
# are replaced.
missing_markers = ['UNKNOWN', 'E', 'D', 'U']

# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
# A single non-mutating replace keeps the cell idempotent on re-run.
df = df.replace(missing_markers, np.nan)

print('Number of rows before removing rows with missing values: ' + str(df.shape[0]))

# Remove every row that has at least one null value
df = df.dropna(axis=0)

print('Number of rows after removing rows with missing values: ' + str(df.shape[0]))

Number of rows before removing rows with missing values: 109543
Number of rows after removing rows with missing values: 31597


## **3. Get the feature and target vectors**

In [37]:
# Feature matrix: every column except the target
X = df.drop(columns=['CRM_ATPT_CPTD_CD'])

# Target vector: the CRM_ATPT_CPTD_CD column
y = df.loc[:, 'CRM_ATPT_CPTD_CD']

# Report both shapes, one per line
print(X.shape, y.shape, sep='\n')

(31597, 10)
(31597,)


## **4. Encoding**

### Encode the target vector

In [38]:
from sklearn.preprocessing import LabelEncoder

# Learn the class labels from y, then map y to their integer codes
le = LabelEncoder().fit(y)
y = le.transform(y)

### Define Multiple Columns Encoder

In [39]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with its own fresh
    ``LabelEncoder``. The class keeps no fitted state: ``fit`` learns nothing
    and each ``transform`` call re-fits the per-column encoders on the data it
    receives, so the produced codes are only consistent within one call.
    """

    def __init__(self, columns=None):
        """Store the column selection.

        Parameters
        ----------
        columns : iterable of column labels, optional
            Columns to encode. ``None`` (the default) means every column of
            the frame passed to ``transform``.
        """
        self.columns = columns

    def fit(self, X):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is copied and never modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which each selected column has been replaced by
            the output of ``LabelEncoder().fit_transform`` on that column.
        """
        output = X.copy()
        # Resolve the selection once so both cases share one loop. The old
        # fallback iterated with DataFrame.iteritems(), which pandas 2.0
        # removed; walking the column labels avoids that API altogether.
        if self.columns is None:
            selected = list(output.columns)
        else:
            selected = self.columns
        for col in selected:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X):
        """Equivalent to ``fit(X)`` followed by ``transform(X)``."""
        return self.fit(X).transform(X)

### Encode the feature vector

In [40]:
# No column subset is given, so every feature column is label-encoded
X = MultiColumnLabelEncoder(columns=None).fit(X).transform(X)

## **5. Divide data into training and testing sets**

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# Show the class labels and their counts for the training and testing targets
print(
    np.unique(y_train, return_counts=True),
    np.unique(y_test, return_counts=True),
    sep='\n',
)

(array([0, 1]), array([  345, 21772]))
(array([0, 1]), array([ 148, 9332]))


## **6. Model Selection**

### 6.1. SVC model and hyperparameter tuning

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Number of cross-validation folds used to score each setting
N_FOLDS = 3

# The list of values for hyperparameter C (penalty parameter)
Cs = [1, 10, 100]

# The list of choices for hyperparameter kernel
kernels = ['linear', 'rbf', 'sigmoid']

# The list of [score, setting], where score is the mean cross-validated score
# of the classifier and setting is a pair of (C, kernel)
score_settings = []

# Try every combination of C and kernel
for C in Cs:
    for kernel in kernels:
        # Declare the classifier with hyperparameter C, kernel, class_weight, and random_state
        clf = SVC(C=C, kernel=kernel, class_weight='balanced', random_state=0)

        # The pipeline, with StandardScaler and clf defined above
        pipe_clf = Pipeline([('StandardScaler', StandardScaler()), ('clf', clf)])

        # Score the pipeline by cross-validation on the TRAINING data only.
        # Choosing hyperparameters by their score on X_test would leak the test
        # set into model selection and make the later test scores optimistic.
        # With no `scoring` argument the estimator's own score method is used.
        cv_scores = cross_val_score(pipe_clf, X_train, y_train, cv=N_FOLDS)

        # Mean score over the folds (rounding to two decimal places)
        score = round(float(cv_scores.mean()), 2)

        # Get the setting, which is a pair of (C, kernel)
        setting = [C, kernel]

        # Append [score, setting] to score_settings
        score_settings.append([score, setting])

# Sort score_settings in descending order of score
# (the sort is stable, so ties keep the order in which settings were tried)
score_settings = sorted(score_settings, key=lambda x: x[0], reverse=True)

# Print score_settings
print('The list of [score, setting] is:')
for score_setting in score_settings:
    print(score_setting)
print()

# Print the best setting
print('The best setting is:')
print('C: ' + str(score_settings[0][1][0]))
print('kernel: ' + score_settings[0][1][1])

### 6.2. Random Forest Classifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier

# Declare the model
# - max_features='sqrt' replaces 'auto', which scikit-learn deprecated in 1.1
#   and removed in 1.3; for classifiers 'auto' meant 'sqrt'.
# - random_state is fixed so the reported score is reproducible.
# - All arguments that were spelled out at their default value are omitted.
rf = RandomForestClassifier(n_estimators=10, max_features='sqrt', n_jobs=3,
                            random_state=0, class_weight='balanced')

# Train the model
rf.fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred_rf = rf.predict(X_test)

# Print the classification accuracy
# (micro-averaged precision; arguments are passed as y_true, y_pred)
print(precision_recall_fscore_support(y_test, y_pred_rf, average='micro')[0])

### 6.3. SVC Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Declare the model: an SVC behind a StandardScaler, the same pipeline layout
# that is used when the (C, kernel) settings are tuned in section 6.1, so the
# classifier is trained on features scaled the same way
clf = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('clf', SVC(C=100, kernel='rbf', class_weight='balanced', random_state=0)),
])

# Train the model
clf.fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred_clf = clf.predict(X_test)

# Print the classification accuracy
# (micro-averaged precision; arguments are passed as y_true, y_pred)
print(precision_recall_fscore_support(y_test, y_pred_clf, average='micro')[0])

### 6.4. MLP Classifier 

In [None]:
from sklearn.neural_network import MLPClassifier
# Declare the model: one hidden layer of 100 logistic units trained with adam.
# random_state is fixed so the weight initialisation, and therefore the
# reported score, is reproducible between runs.
mlp = MLPClassifier(activation='logistic', hidden_layer_sizes=(100,), solver='adam',
                    learning_rate='constant', random_state=0)

# Train the model
mlp.fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred_mlp = mlp.predict(X_test)

# Print the classification accuracy
# (micro-averaged precision; arguments are passed as y_true, y_pred)
print(precision_recall_fscore_support(y_test, y_pred_mlp, average='micro')[0])

### 6.5. DecisionTree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Declare the model
dt = DecisionTreeClassifier(random_state=0, class_weight='balanced', min_samples_split=2, min_samples_leaf=1)

# Train the model
dt.fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred_dt = dt.predict(X_test)

# Print the classification accuracy
# (micro-averaged precision; arguments are passed as y_true, y_pred —
# the previous reversed order only gave the right number because
# micro-averaging treats both arguments symmetrically)
print(precision_recall_fscore_support(y_test, y_pred_dt, average='micro')[0])

### 6.6. Deep Neural Network with Keras

In [None]:
from keras.layers.core import Dense, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
import numpy
# Fix NumPy's random seed for reproducibility
# NOTE(review): this seeds NumPy only; the Keras backend may keep its own
# random generator — confirm whether it has to be seeded separately.
numpy.random.seed(7)

# Create the model: two hidden ReLU layers (12 and 8 units) followed by a
# single sigmoid output unit for the binary target.
# The input width is read from the training data instead of being hard-coded
# to 10, so the cell keeps working if the set of feature columns changes.
model = Sequential()
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss and accuracy as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
model.fit(X_train, y_train, epochs=5, batch_size=10)

# Evaluate the model on the held-out test set; scores[0] is the loss and
# scores[1] the metric requested in compile()
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))