In [1]:
import numpy as np
import csv
import random
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

In [2]:
# Open the vectorised dataset and prepare one row bucket per class label.
# The handle is consumed (and closed) by the CSV-parsing cell that follows,
# so it is intentionally left open here.
# newline='' is what the csv module documentation requires for file objects
# handed to csv.reader, so the reader does its own newline handling.
f = open('./vectorized_large.csv', newline='')
data_0, data_1 = [], []

In [3]:
# Parse every CSV row into floats and route it by its first column:
# rows whose first value is 0 go to data_0, all other rows go to data_1.
csvreader = csv.reader(f)
try:
    for row in csvreader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0] below.
            continue
        row = [float(r) for r in row]
        if row[0] == 0:
            data_0.append(row)
        else:
            data_1.append(row)
finally:
    # Release the handle even if a cell fails to convert to float.
    f.close()

In [4]:
# Sanity check: both row buckets must hold the same number of rows.
# The size of the first bucket is printed for reference.
class_0_count = len(data_0)
assert class_0_count == len(data_1)
print(class_0_count)

6400


In [5]:
# Number of rows taken from the front of EACH class bucket for training;
# whatever remains in each bucket forms the dev split.
N_TRAIN_PER_CLASS = 5000
assert len(data_0) > N_TRAIN_PER_CLASS or len(data_1) > N_TRAIN_PER_CLASS, (
    "no rows left for the dev split: neither class has more than "
    f"{N_TRAIN_PER_CLASS} rows"
)

# Rows are taken in file order (no shuffling): class-0 rows first, then class-1.
training_set = np.array(data_0[:N_TRAIN_PER_CLASS] + data_1[:N_TRAIN_PER_CLASS])
dev_set = np.array(data_0[N_TRAIN_PER_CLASS:] + data_1[N_TRAIN_PER_CLASS:])

# Column 0 is the label (the value the rows were bucketed on); the remaining
# columns are the features.
training_labels = training_set[:, 0].astype(int)
training_data = training_set[:, 1:].astype('float32')
dev_labels = dev_set[:, 0].astype(int)
dev_data = dev_set[:, 1:].astype('float32')

In [6]:
# normalize features
# Per-feature mean/std are computed on the training split only and then
# applied unchanged to the dev split.
# NOTE: this cell rebinds training_data/dev_data to their standardised
# versions, so running it twice standardises already-standardised data;
# re-run the split cell first.
_mean = np.mean(training_data, axis=0)
_std = np.std(training_data, axis=0)

# A constant feature has std == 0 and would turn the whole column into
# NaN/inf; divide such columns by 1 instead (they become all zeros on the
# training split). _std itself is left untouched so the printed value is the
# true statistic.
_scale = _std.copy()
_scale[_scale == 0] = 1

training_data = (training_data - _mean) / _scale
dev_data = (dev_data - _mean) / _scale
print(_mean.tolist())
print(_std.tolist())
print(np.amin(training_data, axis=0).tolist())
print(np.amax(dev_data, axis=0).tolist())

[35.91339874267578, 3.4475998878479004, 67.10360717773438, 0.6936891674995422, 0.6221207976341248, 0.9910552501678467]
[96.64471435546875, 14.680747985839844, 138.3404998779297, 0.8288413286209106, 0.35737043619155884, 0.0016996299382299185]
[-0.37160229682922363, -0.23483815789222717, -0.5985492467880249, -0.8369384407997131, -1.7408289909362793, -10.385951042175293]
[20.219280242919922, 12.230466842651367, 16.901025772094727, 8.815089225769043, 1.0573879480361938, 1.324949026107788]


In [8]:
# Baseline: logistic regression with library defaults, fitted on the
# standardised training split. fit() returns the estimator itself, so the
# construction and fitting are chained.
clf = LogisticRegression().fit(training_data, training_labels)
# Last expression of the cell: score on the dev split.
clf.score(dev_data, dev_labels)

0.7389285714285714

In [9]:
# AdaBoost with library defaults. random_state is pinned so that a
# Restart & Run All reproduces the same fitted model and score.
clf = AdaBoostClassifier(random_state=42)
clf.fit(training_data, training_labels)
# Last expression of the cell: score on the dev split.
clf.score(dev_data, dev_labels)

0.8835714285714286

In [10]:
# Gradient-boosted trees via CatBoost: a small ensemble of deep trees with
# an aggressive learning rate.
clf = CatBoostClassifier(
    iterations=10,
    learning_rate=1,
    depth=10,
)
# verbose=False silences the per-iteration training log.
clf.fit(training_data, training_labels, verbose=False)
# Last expression of the cell: score on the dev split.
clf.score(dev_data, dev_labels)

0.9553571428571429

In [11]:
# Multi-layer perceptron with library defaults. Weight initialisation and
# batch shuffling are random, so random_state is pinned to make the fitted
# model and its score reproducible across runs.
clf = MLPClassifier(random_state=42)
clf.fit(training_data, training_labels)
# Last expression of the cell: score on the dev split.
clf.score(dev_data, dev_labels)

0.9221428571428572

In [12]:
# Random forest with library defaults. Bootstrap sampling and feature
# sub-sampling are random, so random_state is pinned: this estimator is the
# last one bound to `clf` before the pickling cell, and the saved model
# should be reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(training_data, training_labels)
# Last expression of the cell: score on the dev split.
clf.score(dev_data, dev_labels)

0.9721428571428572

In [13]:
# Persist whichever estimator is currently bound to `clf`.
# NOTE(review): `clf` is reassigned by every model cell above, so the saved
# model is the one from the most recently executed cell.
# The context manager closes the file even if pickling raises.
with open("final_model.pickle", 'wb') as outf:
    pickle.dump(clf, outf)