In [44]:
import json
import os
import random

import numpy as np
import pickle

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

## Get Training Data

In [2]:
def get_inputs(data_dir, train_fraction=0.8):
    """Load pickled (features, labels) files from a directory and split them.

    Every regular file directly inside ``data_dir`` is unpickled and must
    contain a ``(features, labels)`` pair. Files are read in sorted filename
    order because ``os.listdir`` returns entries in arbitrary order; without
    sorting, the concatenation order (and therefore the train/dev split,
    which is positional and unshuffled) could differ between runs.

    Args:
        data_dir: Directory holding the pickled data files.
        train_fraction: Fraction of the rows, between 0 and 1, assigned to
            the training set. The remaining rows form the dev set.
            Defaults to 0.8.

    Returns:
        A tuple ``(train_features, train_labels, dev_features, dev_labels)``.
        Both label arrays are NumPy arrays of dtype int32.

    Raises:
        ValueError: If ``train_fraction`` is outside [0, 1], if ``data_dir``
            contains no data files, or if the number of feature rows differs
            from the number of labels.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            'train_fraction must be between 0 and 1, got {}'.format(train_fraction))

    # Sort for a deterministic order and skip sub-directories, which cannot
    # be opened as data files.
    data_files = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
    )
    if not data_files:
        raise ValueError('no data files found in {}'.format(data_dir))

    # open pre-embedded data
    feature_list = []
    label_list = []
    for data_file in data_files:
        with open(os.path.join(data_dir, data_file), 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # function at files from a trusted source.
            features, labels = pickle.load(f)
        feature_list.append(features)
        label_list.append(labels)
    features = np.concatenate(feature_list)
    labels = [label for file_labels in label_list for label in file_labels]

    if len(features) != len(labels):
        raise ValueError(
            'got {} feature rows but {} labels'.format(len(features), len(labels)))

    # split into train and dev set (positional split, no shuffling)
    split = int(train_fraction * len(features))
    train_features = features[:split]
    dev_features = features[split:]
    train_labels = np.array(labels[:split]).astype('int32')
    dev_labels = np.array(labels[split:]).astype('int32')

    print('{} train data points'.format(len(train_features)))
    print('{} dev data points'.format(len(dev_features)))
    return (train_features, train_labels, dev_features, dev_labels)

In [56]:
# Name of the cached-data sub-directory to load; it is substituted into the
# directory path passed to get_inputs below.
bert_model = 'train_uncased_large_max200'
# NOTE(review): the directory is an absolute, machine-specific path; adjust it
# when running this notebook elsewhere.
(train_features,
 train_labels,
 dev_features,
 dev_labels) = get_inputs(
    '/home/eugenet/final_project/cached_data/{}/'.format(bert_model)
)

20000 train data points
5000 dev data points


# Classify

## Linear SVM

In [57]:
%%time
svc = LinearSVC(random_state=42, max_iter=2000)
svc.fit(train_features, train_labels)
print(svc.score(dev_features, dev_labels))

0.8816
CPU times: user 1min 51s, sys: 484 ms, total: 1min 51s
Wall time: 1min 51s




## Logistic Regression

In [58]:
%%time
lc = LogisticRegression(random_state=42)
lc.fit(train_features, train_labels)
print(lc.score(dev_features, dev_labels))



0.8864
CPU times: user 19.4 s, sys: 456 ms, total: 19.8 s
Wall time: 19.4 s
