In [1]:
import csv
import numpy as np

def get_features(row):
    features = [3, 9, 10, 12, 15, 19, 20]
    for feature in features:
        if row[feature] == '': return None
    
    return np.array(row)[features]

def trim_data():
    data = []
    with open('CT-clean.csv', 'r') as csvfile:
        trafficreader = csv.reader(csvfile, delimiter=',')

        num_rows = 0
        for i, row in enumerate(trafficreader):
            #if i % 50 == 1:
            if i != 0:
                short_row = get_features(row)
                if short_row is not None:
                    data.append(short_row)

    return np.array(data)

data = trim_data()

In [2]:
races = list(np.unique(data[:, 3]))
violations = ['cell phone', 'speeding', 'license', 'moving violation', 'lights', 'seat belt', 'other']
outcomes = list(np.unique(data[:, 6]))

races, violations, outcomes

(['Asian', 'Black', 'Hispanic', 'Native American', 'White'],
 ['cell phone',
  'speeding',
  'license',
  'moving violation',
  'lights',
  'seat belt',
  'other'],

In [3]:
def preprocess_row(row):
    label = outcomes.index(row[6])
    
    features= []
    #features.append(int(row[0].split(':')[0]))
    #features.append(0 if row[1] == 'F' else 1)
    #features.append(int(row[2]))
    features.append(0 if row[5] == 'FALSE' else 1)
    features.extend([1 if i == races.index(row[3]) else 0 for i in range(len(races))])
    features.extend([1 if violation in row[4].lower() else 0 for violation in violations])
    
    #label = [1 if i == outcomes.index(row[6]) else 0 for i in range(len(outcomes))]
    
    return features, label

def preprocess_data(data):
    X, y = [], []
    for row in data:
        features, label = preprocess_row(row)
        if features == None:
            continue
        X.append(features)
        y.append(label)
        
    return np.array(X), np.array(y)

X, y = preprocess_data(data)

In [4]:
X.shape, y.shape

((313093, 13), (313093,))

In [5]:
len(np.where(y == 0)[0]), len(np.where(y == 1)[0]), len(np.where(y == 2)[0]), len(np.where(y == 3)[0]), len(np.where(y == 4)[0])

(7304, 12196, 218822, 47714, 27057)

In [6]:
X_final, y_final = [], []

for i in range(5):
    indexes = np.where(y == i)[0][:7304]
    X_final.extend(X[indexes])
    y_final.extend(y[indexes])
    
X, y = X_final, y_final

In [7]:
X, y = np.array(X), np.array(y)

indeces = np.random.choice(len(X), len(X), replace=False)
X, y = X[indeces], y[indeces]

In [8]:
len(np.where(y == 0)[0]), len(np.where(y == 1)[0]), len(np.where(y == 2)[0]), len(np.where(y == 3)[0]), len(np.where(y == 4)[0])

(7304, 7304, 7304, 7304, 7304)

In [9]:
X_train, X_val, X_test = X[:36000], X[36000:36400], X[36400:]
y_train, y_val, y_test = y[:36000], y[36000:36400], y[36400:]

#X_train, X_val, X_test = X[:1400], X[1400:1700], X[1700:]
#y_train, y_val, y_test = y[:1400], y[1400:1700], y[1700:]

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn import svm

clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10, 8), tol = 1e-15, early_stopping=False, verbose=True)
#clf = svm.SVC(decision_function_shape='ovo')

clf.fit(X_train, y_train)

Iteration 1, loss = 1.62875740
Iteration 2, loss = 1.53464856
Iteration 3, loss = 1.47110844
Iteration 4, loss = 1.43512137
Iteration 5, loss = 1.42211816
Iteration 6, loss = 1.41661137
Iteration 7, loss = 1.41333219
Iteration 8, loss = 1.41216223
Iteration 9, loss = 1.41090567
Iteration 10, loss = 1.41032645
Iteration 11, loss = 1.41016889
Iteration 12, loss = 1.40988783
Iteration 13, loss = 1.40976623
Iteration 14, loss = 1.40949594
Iteration 15, loss = 1.40930449
Iteration 16, loss = 1.40938649
Iteration 17, loss = 1.40914358
Iteration 18, loss = 1.40924766
Iteration 19, loss = 1.40917023
Iteration 20, loss = 1.40912994
Iteration 21, loss = 1.40919281
Iteration 22, loss = 1.40883548
Iteration 23, loss = 1.40941214
Iteration 24, loss = 1.40858730
Iteration 25, loss = 1.40859552
Iteration 26, loss = 1.40878087
Iteration 27, loss = 1.40861044
Training loss did not improve more than tol=0.000000 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=1e-15, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [11]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, clf.predict(X_val))

0.365

In [12]:
clf.predict(X_val)

array([0, 0, 2, 4, 2, 0, 4, 1, 0, 0, 4, 0, 3, 4, 4, 0, 0, 4, 0, 1, 4, 0,
       1, 3, 4, 3, 1, 0, 4, 3, 3, 4, 0, 0, 1, 4, 0, 4, 4, 2, 4, 4, 2, 0,
       3, 2, 4, 4, 4, 1, 4, 4, 4, 4, 1, 2, 4, 4, 3, 0, 2, 2, 3, 1, 3, 4,
       2, 1, 4, 3, 4, 1, 2, 0, 4, 4, 0, 4, 4, 2, 3, 0, 0, 3, 4, 2, 4, 4,
       4, 0, 4, 4, 1, 0, 3, 4, 4, 4, 1, 1, 4, 0, 4, 4, 0, 1, 1, 0, 4, 0,
       1, 3, 2, 3, 2, 4, 0, 4, 3, 4, 1, 1, 0, 4, 0, 0, 0, 2, 0, 1, 2, 0,
       2, 3, 0, 1, 4, 4, 0, 0, 4, 0, 4, 2, 2, 0, 4, 4, 4, 3, 4, 4, 4, 1,
       0, 4, 1, 4, 0, 0, 0, 4, 4, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 1, 3,
       2, 4, 0, 0, 0, 3, 1, 2, 2, 1, 2, 4, 0, 4, 4, 4, 3, 2, 4, 3, 1, 2,
       4, 4, 4, 3, 3, 4, 4, 2, 0, 4, 2, 4, 3, 0, 4, 4, 1, 0, 2, 4, 4, 4,
       3, 1, 2, 1, 1, 3, 1, 0, 4, 0, 0, 0, 3, 4, 4, 4, 4, 3, 0, 4, 2, 2,
       3, 2, 0, 1, 0, 0, 1, 4, 2, 1, 3, 4, 0, 4, 4, 4, 4, 3, 0, 0, 1, 4,
       0, 1, 0, 4, 1, 4, 0, 4, 4, 2, 4, 4, 0, 3, 0, 4, 1, 3, 0, 0, 4, 2,
       4, 2, 0, 4, 0, 1, 3, 2, 2, 4, 1, 0, 0, 0, 4,

In [13]:
y_val

array([4, 0, 2, 3, 1, 3, 3, 0, 1, 2, 3, 4, 0, 4, 4, 1, 0, 4, 0, 1, 2, 2,
       0, 0, 4, 3, 1, 3, 3, 4, 3, 2, 1, 2, 0, 4, 0, 3, 3, 2, 2, 3, 4, 0,
       4, 4, 4, 2, 3, 0, 0, 4, 4, 4, 2, 2, 2, 4, 0, 4, 1, 1, 1, 1, 4, 3,
       2, 1, 3, 3, 4, 1, 4, 1, 2, 4, 2, 4, 3, 4, 2, 1, 0, 0, 4, 3, 3, 0,
       4, 1, 1, 4, 0, 1, 3, 3, 2, 3, 1, 3, 3, 2, 3, 4, 3, 0, 1, 1, 0, 1,
       1, 0, 1, 3, 4, 2, 1, 2, 0, 4, 2, 3, 0, 4, 0, 2, 2, 1, 0, 4, 1, 0,
       3, 4, 4, 1, 4, 3, 0, 4, 3, 2, 0, 0, 2, 0, 4, 4, 3, 4, 2, 4, 4, 1,
       1, 4, 0, 3, 0, 0, 1, 3, 3, 2, 0, 4, 1, 1, 2, 3, 3, 1, 0, 2, 0, 4,
       3, 4, 0, 2, 2, 0, 0, 3, 1, 1, 2, 1, 1, 1, 4, 4, 0, 2, 0, 1, 1, 2,
       4, 1, 0, 0, 0, 4, 3, 1, 0, 0, 4, 4, 3, 0, 2, 2, 1, 0, 4, 0, 2, 3,
       3, 1, 4, 1, 0, 3, 1, 2, 4, 1, 0, 2, 4, 3, 3, 0, 3, 3, 2, 3, 2, 2,
       4, 4, 0, 0, 4, 2, 0, 0, 0, 1, 3, 2, 3, 1, 4, 4, 2, 1, 1, 0, 1, 3,
       0, 2, 0, 2, 0, 0, 1, 4, 3, 0, 2, 4, 3, 0, 0, 3, 1, 3, 1, 0, 4, 3,
       3, 3, 0, 4, 2, 3, 0, 0, 2, 4, 1, 4, 2, 2, 1,

In [14]:
[coef.shape for coef in clf.coefs_]

[(13, 10), (10, 8), (8, 5)]