In [403]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from nltk.corpus import stopwords
from stemming.porter2 import stem
from random import shuffle
from re import sub
from numpy.random import rand
from numpy import shape, exp, log, array

In [29]:
def _read_labeled(path, label):
    """Read ``path`` line by line and return each line as ``"<label> <text>"``.

    Trailing whitespace (including the newline) is stripped from every line.
    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read.
    """
    with open(path) as handle:
        return ["%s %s" % (label, line.rstrip()) for line in handle]


# Negative reviews are tagged "-1", positive reviews "+1".
neg_l = _read_labeled('../../data/rt-polaritydata/rt-polarity.neg', '-1')
pos_l = _read_labeled('../../data/rt-polaritydata/rt-polarity.pos', '+1')

sen_l = neg_l + pos_l

# In-place shuffle so the two classes are interleaved.
shuffle(sen_l)

# Sanity check: number of positive, then negative, examples.
print(sum(1 for l in sen_l if l[0:2] == '+1'))
print(sum(1 for l in sen_l if l[0:2] == '-1'))

5330
5331


In [57]:
# NLTK's English stop-word list, extended with the comma and period tokens.
stop_words = stopwords.words('english') + [',', '.']


def check(x):
    """Return True when ``x`` is not one of the entries in ``stop_words``."""
    return x not in stop_words

In [90]:
# Every entry of sen_l is "<2-char label> <sentence>": the first two characters
# are the label and everything after the separating space is the text.
y = [line[0:2] for line in sen_l]
features = [line[3:] for line in sen_l]

cv = CountVectorizer()  # NOTE(review): not referenced again in the visible cells

# Number of distinct tokens across all sentences. A set comprehension avoids
# the quadratic list concatenation that sum(list_of_lists, []) performs.
# NOTE(review): `tokenize` is not defined in any visible cell; confirm that it
# is defined before this cell runs on a fresh kernel.
vocab_size = len({token for text in features for token in tokenize(text)})

In [149]:
def _stem_tokens(text):
    """Replace every non-letter with a space, split on whitespace, stem each piece."""
    letters_only = sub("[^a-zA-Z]", " ", text)
    return [stem(word) for word in letters_only.split()]


vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=_stem_tokens,
    lowercase=True,
    stop_words='english',
    # Keep half of the unique-token count; int() turns the product into an integer.
    max_features=int(vocab_size * 0.5),
)

In [212]:
# Learn the vocabulary and build the document-term counts, then densify them.
# NOTE(review): the later hand-written functions call .dot()/.T on this dense
# result directly; confirm before switching it to another container type.
term_counts = vectorizer.fit_transform(features)
X = term_counts.todense()

# Hold out 33% of the rows for evaluation; the split itself is seeded.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

# scikit-learn baseline with default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

In [153]:
# Score the fitted scikit-learn LogisticRegression on the held-out split.
model.score(X_test, y_test)

0.74367718101733449

In [402]:
# objective: log-likelihood (not negated, so larger is better)
def J(w, X, y):
    """Log-likelihood of the 0/1 labels ``y`` under weights ``w``.

    Computes ``y^T log(p) + (1 - y)^T log(1 - p)`` with ``p = h(w, X)``.
    The value is at most 0 and moves toward 0 as the fit improves.

    The probabilities are clipped slightly away from 0 and 1 before taking
    logarithms, so a saturated sigmoid cannot turn the result into
    ``-inf`` or ``nan``.
    """
    eps = 1e-12
    # Evaluate the sigmoid once and reuse it for both terms.
    p = h(w, X).clip(eps, 1 - eps)
    return y.T.dot(log(p)) + (1 - y).T.dot(log(1 - p))


# logistic (sigmoid) hypothesis
def h(w, X):
    """Apply the elementwise sigmoid ``1 / (1 + exp(-z))`` to ``z = X.dot(w)``."""
    return 1 / (1 + exp(-X.dot(w)))


# gradient used by the weight update
def grad(w, X, y):
    """Return ``X^T (h(w, X) - y)``: the prediction residual projected onto the features."""
    residual = h(w, X) - y
    return X.T.dot(residual)


# convergence measure
def diff(w, w_tmp, X, y):
    """Absolute change in ``J`` between weights ``w`` and ``w_tmp`` on the same data."""
    current = J(w, X, y)
    previous = J(w_tmp, X, y)
    return abs(current - previous)


def train(X, y, n_iter=100, lr=0.01, tol=0.1):
    """Fit logistic-regression weights with fixed-step gradient updates.

    Parameters
    ----------
    X : feature matrix with one row per example.
    y : 0/1 labels as a column with one row per example.
    n_iter : maximum number of update steps (default 100).
    lr : step size multiplied into the gradient (default 0.01).
    tol : stop early once the absolute change in ``J`` between two
        consecutive weight vectors drops below this value (default 0.1).

    Returns
    -------
    The weight column after the last update, one entry per column of ``X``.

    Prints the iteration index and the current ``J`` every 10 iterations.
    Weights start from uniform random values drawn with ``rand``, so results
    vary between runs unless NumPy's global random state is seeded first.
    """
    w = rand(X.shape[1], 1)

    for t in range(n_iter):
        if t % 10 == 0:
            print(t)
            print(J(w, X, y))

        w_prev = w
        # Step against grad(); rebinding w leaves w_prev untouched for diff().
        w = w_prev - lr * grad(w_prev, X, y)
        if diff(w, w_prev, X, y) < tol:
            break
    return w


def predict(w, X, threshold=0.7):
    """Return 0/1 predictions as an ``(X.shape[0], 1)`` integer array.

    A row is labelled 1 when ``h(w, X)`` for that row is strictly greater
    than ``threshold`` and 0 otherwise.
    """
    # array(...) yields a plain ndarray of booleans whatever type h() returns;
    # the vectorized cast avoids calling int() on size-1 arrays row by row.
    above = array(h(w, X) > threshold)
    return above.astype(int).reshape(X.shape[0], 1)


def score(w, X, y):
    """Return the fraction of rows where ``predict(w, X)`` equals ``y``.

    ``y`` may be a flat sequence or a column of 0/1 labels. It is reshaped to
    the prediction shape before comparing, so a 1-D ``y`` cannot silently
    broadcast into an (n, n) comparison.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one label per row of ``X``.
    """
    res = predict(w, X)
    labels = array(y).reshape(res.shape)
    num_correct = int((res == labels).sum())
    return num_correct / res.shape[0]

In [398]:
# Encode the string labels as 0/1 column vectors: '+1' -> 1, anything else -> 0.
# reshape(-1, 1) infers the row count, so neither name refers to itself before
# it has been assigned.
y_train_i = array([int(label == '+1') for label in y_train]).reshape(-1, 1)
y_test_i = array([int(label == '+1') for label in y_test]).reshape(-1, 1)

In [399]:
# Fit the hand-written logistic regression on the training split; train()
# prints the iteration index and J every 10 iterations.
w = train(X_train, y_train_i)

0
[[-17496.12237696]]
10
[[-11927.2257922]]
20
[[-10247.21586559]]
30
[[-8593.88915274]]
40
[[-7392.19875577]]
50
[[-6458.30053609]]
60
[[-5708.0345958]]
70
[[-5089.88234699]]
80
[[-4567.26959835]]
90
[[-4115.43266873]]


In [400]:
# Fraction of held-out rows the hand-written model predicts correctly
# (predict() is called with its default threshold).
score(w, X_test, y_test_i)

0.6882637112816141