In [1]:
import numpy as np
from scipy.special import expit
from scipy import optimize, sparse
from scipy.stats import logistic
from sklearn import cross_validation, datasets

In [2]:
boston = datasets.load_boston()
X, y = boston.data, np.round(boston.target)
idx = np.argsort(y)
X = X[idx]
y = y[idx]

In [3]:
import mord

In [4]:
c = mord.LogisticIT() #Default parameters: alpha=1.0, verbose=0, maxiter=10000
c.fit(np.array([[0,0,0,1],[0,1,0,0],[1,0,0,0]]), np.array([1,2,3]))
c.predict(np.array([0,0,0,1]))
c.predict(np.array([0,1,0,0]))
c.predict(np.array([1,0,0,0]))

array([3])

In [5]:
word_vector_path = 'wordvectors.txt'
train_reviews_path = 'training_reviews.txt'
train_labels_path = 'training_labels.txt'
test_reviews_path = 'test_reviews.txt'
test_labels_path = 'test_labels.txt'

In [6]:
def get_vectors(path):
    word_vectors = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.strip().split()
            word, vector = line[0], np.array([float(x) for x in line[1:]])
            word_vectors[word] = vector
    return word_vectors

In [7]:
def sum_word_vectors(words, word_vectors):
    vectors = []
    for word in words:
        try:
            vectors.append(word_vectors[word])
        except KeyError:
            pass
    vectors = np.array(vectors)
    return vectors.sum(axis=0)

In [8]:
def prod_word_vectors(words, word_vectors):
    vector = np.ones(word_vectors.values()[0].size)
    for word in words:
        try:
            np.multiply(vector, word_vectors[word])
        except KeyError:
            pass
    return vector

In [9]:
vectors = get_vectors(word_vector_path)

In [10]:
X = []
y = []
samples = 0
with open(train_reviews_path, 'r') as f, open(train_labels_path, 'r') as g:
    for line, label in zip(f, g):
        line = line.split()
        X.append(prod_word_vectors(line, vectors))
        y.append(int(label.strip()))
        samples += 1
        
    X = np.array(X)
    y = np.array(y)
    print X.shape
    print samples
    print y.shape

(5000, 128)
5000
(5000,)


In [11]:
X_test = []
y_test = []
samples = 0
with open(test_reviews_path, 'r') as f, open(test_labels_path, 'r') as g:
    for line, label in zip(f, g):
        line = line.split()

        X_test.append(sum_word_vectors(line, vectors))
        y_test.append(int(label.strip()))
        samples += 1

    X_test = np.array(X_test)
    y_test = np.array(y_test)
    print X_test.shape
    print samples
    print y_test.shape

(500, 128)
500
(500,)


In [12]:
c = mord.LogisticIT() #Default parameters: alpha=1.0, verbose=0, maxiter=10000
c.fit(X, y) 

LogisticIT(alpha=1.0, max_iter=1000, verbose=0)

In [13]:
data = []
for label, truth in zip(c.predict(X), y):
    data.append(label == truth)
print sum(data) / float(len(data))

0.2


In [14]:
import ologr

In [15]:
d = ologr.OrdinalLogisticRegressionAT()
w, theta = d.train(X, y)

In [16]:
atdata = []
#print w
#print theta
labels = np.unique(y)
print labels
for row, truth in zip(X, y):
    label = labels[d.predict(w, theta, row)]
    atdata.append(label == truth)
print sum(atdata) / float(len(atdata))

[1 2 3 4 5]
0.2


In [20]:
e = ologr.OrdinalLogisticRegressionIT()
wit, thetait = e.train(X, y)

In [23]:
itdata = []
#print w
#print theta
labels = np.unique(y)
print labels
for row, truth in zip(X, y):
    label = labels[e.predict(wit, thetait, row)]
    #print label, truth
    itdata.append(label == truth)
print sum(itdata) / float(len(itdata))

[1 2 3 4 5]
0.2
