In [1]:
import csv
import numpy as np

def get_features(row):
    features = [3, 9, 10, 12, 15, 19, 20]
    for feature in features:
        if row[feature] == '': return None
    
    return np.array(row)[features]

def trim_data():
    data = []
    with open('FL_cleaned.csv', 'r') as csvfile:
        trafficreader = csv.reader(csvfile, delimiter=',')

        num_rows = 0
        for i, row in enumerate(trafficreader):
            if i % 50 == 1:
#             if i != 0:
                short_row = get_features(row)
                if short_row is not None:
                    data.append(short_row)

    return np.array(data)

data = trim_data()

In [33]:
races = list(np.unique(data[:, 3]))
violations = ['dui', 'speeding', 'license', 'moving violation', 'lights', 'seat belt', 'other']
outcomes = list(np.unique(data[:, 6]))

races, violations, outcomes

(['A', 'A H', 'B', 'B H', 'H', 'H H', 'I', 'I H', 'O', 'O H', 'W', 'W H'],
 ['dui',
  'speeding',
  'license',
  'moving violation',
  'lights',
  'seat belt',
  'other'],
 ['Citation',
  'Faulty Equipment Notice',
  'Felony Arrest',
  'Misdemeanor Arrest',

In [34]:
def preprocess_row(row):
    label = outcomes.index(row[6])
    
    features= []
    features.append(int(row[0].split(':')[0]))
    features.append(0 if row[1] == 'F' else 1)
    features.append(int(row[2]))
    features.append(0 if row[5] == 'FALSE' else 1)
    features.extend([1 if i == races.index(row[3]) else 0 for i in range(len(races))])
    features.extend([1 if violation in row[4].lower() else 0 for violation in violations])
    
#     label = [1 if i == outcomes.index(row[6]) else 0 for i in range(len(outcomes))]
    
    return features, label

def preprocess_data(data):
    X, y = [], []
    for row in data:
        features, label = preprocess_row(row)
        if features == None:
            continue
        X.append(features)
        y.append(label)
        
    return np.array(X), np.array(y)

X, y = preprocess_data(data)

In [36]:
from collections import defaultdict

def filter_data(cond):
    outcomes_buckets = defaultdict(int)
    total = 0.0
    for i in range(X.shape[0]):
        if cond(X[i]): 
            outcomes_buckets[y[i]] += 1
            total += 1
    print("Total ", total)
    results = sorted(outcomes_buckets.items())
    print(results)
    percents = [result[1] / total for result in results]
    print(percents)

print(outcomes)  
print("White outcomes")
filter_data(lambda row: row[4 + races.index('W')] == 1)
print("Black outcomes")
filter_data(lambda row: row[4 + races.index('B')] == 1)

print("dui outcomes")
filter_data(lambda row: row[16 + violations.index('dui')] == 1)

print("speeding outcomes")
filter_data(lambda row: row[16 + violations.index('speeding')] == 1)



White outcomes
Total  41032.0
[(0, 29556), (1, 1039), (2, 57), (3, 839), (4, 9541)]
[0.7203158510430884, 0.025321700136478845, 0.0013891596802495614, 0.020447455644375123, 0.23252583349580816]
Black outcomes
Total  13514.0
[(0, 9947), (1, 320), (2, 67), (3, 625), (4, 2555)]
[0.7360515021459227, 0.023679147550688177, 0.004957821518425337, 0.04624833505993784, 0.1890631937250259]
dui outcomes
Total  339.0
[(0, 84), (2, 12), (3, 243)]
[0.24778761061946902, 0.035398230088495575, 0.7168141592920354]
speeding outcomes
Total  43320.0
[(0, 33777), (2, 50), (3, 768), (4, 8725)]
[0.7797091412742382, 0.0011542012927054479, 0.01772853185595568, 0.20140812557710064]


In [5]:
X.shape, y.shape

((73123, 23), (73123,))

In [6]:
len(np.where(y == 0)[0]), len(np.where(y == 1)[0]), len(np.where(y == 2)[0]), len(np.where(y == 3)[0]), len(np.where(y == 4)[0])

(54268, 1707, 148, 2199, 14801)

In [7]:
# X_final, y_final = [], []

# for i in range(5):
#     indexes = np.where(y == i)[0][:7304]
#     X_final.extend(X[indexes])
#     y_final.extend(y[indexes])
    
# X, y = X_final, y_final

In [8]:
X, y = np.array(X), np.array(y)

# indeces = np.random.choice(len(X), len(X), replace=False)
# X, y = X[indeces], y[indeces]

In [9]:
len(np.where(y == 0)[0]), len(np.where(y == 1)[0]), len(np.where(y == 2)[0]), len(np.where(y == 3)[0]), len(np.where(y == 4)[0])

(54268, 1707, 148, 2199, 14801)

In [10]:
X_train, X_val, X_test = X[:36000], X[36000:36400], X[36400:]
y_train, y_val, y_test = y[:36000], y[36000:36400], y[36000:]

#X_train, X_val, X_test = X[:1400], X[1400:1700], X[1700:]
#y_train, y_val, y_test = y[:1400], y[1400:1700], y[1700:]

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn import svm

clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10,), max_iter=200, verbose=True, tol=1e-7)
#clf = svm.SVC(decision_function_shape='ovo')

clf.fit(X_train, y_train)

Iteration 1, loss = 2.07575396
Iteration 2, loss = 0.82263919
Iteration 3, loss = 0.77501165
Iteration 4, loss = 0.74926941
Iteration 5, loss = 0.73236175
Iteration 6, loss = 0.71429667
Iteration 7, loss = 0.69743616
Iteration 8, loss = 0.68636161
Iteration 9, loss = 0.67870579
Iteration 10, loss = 0.67330078
Iteration 11, loss = 0.66850644
Iteration 12, loss = 0.66481920
Iteration 13, loss = 0.66100693
Iteration 14, loss = 0.65842105
Iteration 15, loss = 0.65693538
Iteration 16, loss = 0.65414367
Iteration 17, loss = 0.65110966
Iteration 18, loss = 0.64943766
Iteration 19, loss = 0.64788475
Iteration 20, loss = 0.64585579
Iteration 21, loss = 0.64517039
Iteration 22, loss = 0.64383610
Iteration 23, loss = 0.64264901
Iteration 24, loss = 0.64181562
Iteration 25, loss = 0.64139238
Iteration 26, loss = 0.64003372
Iteration 27, loss = 0.63940266
Iteration 28, loss = 0.63886289
Iteration 29, loss = 0.63848206
Iteration 30, loss = 0.63916417
Iteration 31, loss = 0.63814655
Iteration 32, los

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=1e-07, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [17]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, clf.predict(X_val))

0.755

In [13]:
clf.predict(X_val)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,
       0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
y_val

array([0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 4, 4, 4, 0, 4, 4, 0, 0, 4, 0, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 4, 0,
       4, 0, 4, 0, 4, 0, 0, 0, 4, 0, 4, 0, 4, 0, 0, 4, 0, 4, 4, 4, 4, 0,
       0, 4, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0,
       0, 0, 0, 1, 0, 2, 4, 0, 0, 0, 4, 0, 0, 4, 4, 0, 4, 0, 4, 0, 0, 0,
       0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 4,
       4, 0, 4, 0, 0, 4, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4, 0, 4, 0, 0, 0, 0, 0,
       0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 3,
       0, 1, 0, 4, 0, 0, 3, 0, 0, 0, 0, 4, 4, 0, 0, 0, 4, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
[coef.shape for coef in clf.coefs_]

[(23, 5)]