In [2]:
import nltk
import csv


# Load the tab-separated train/test files into lists of rows. The first row of
# each file is skipped (presumably a header line -- TODO confirm).
# `with` closes each handle as soon as its rows are read, and newline='' is the
# mode the csv module asks for so that it can handle line endings itself.
with open('train.data', newline='') as train_fh:
    train_file = csv.reader(train_fh, delimiter='\t')
    next(train_file)
    train_set = list(train_file)

with open('test.data', newline='') as test_fh:
    test_file = csv.reader(test_fh, delimiter='\t')
    next(test_file)
    test_set = list(test_file)

In [3]:
# Column 2 of each training row is the review text (lower-cased here);
# column 1 is the label that goes with it.
train_data = [row[2].lower() for row in train_set]
train_label = [row[1] for row in train_set]

# Подготовка данных
Каждое ревью с помощью токенизации, стемминга и удаления стоп-слов преобразуем в последовательность слов.
Кроме того, формируем отдельный список sents, содержащий в себе все предложения.

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


# Shared NLP tools; the test reviews are preprocessed with the same objects later.
stemmer = PorterStemmer()
sw = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[\w\']+')

# `sents` collects every processed sentence (a list of stems) over all reviews.
# Each train_data entry is replaced in place by the flat list of that review's
# stems, in sentence order.
sents = []
for i, review in enumerate(train_data):
    processed = []
    for sentence in nltk.sent_tokenize(review):
        tokens = tokenizer.tokenize(sentence)
        # Stop words are dropped before stemming, so the filter sees raw tokens.
        processed.append([stemmer.stem(w) for w in tokens if w not in sw])

    sents.extend(processed)
    train_data[i] = [w for sentence_stems in processed for w in sentence_stems]

In [5]:
from collections import Counter


def create_bow_with_freq(data):
    """Count word occurrences over a collection of word sequences.

    data -- iterable of word sequences (e.g. tokenized sentences).

    Returns a list of (word, count) pairs in order of first appearance.
    """
    counts = Counter()
    for sentence in data:
        counts.update(sentence)
    return [(word, freq) for word, freq in counts.items()]

Считаем частоту каждого слова в обучающей выборке

In [6]:
# (word, count) pairs over all processed training sentences collected in `sents`.
train_bow = create_bow_with_freq(sents)

Выбираем 100 наиболее частотных слов

In [7]:
# Keep the 100 (word, count) pairs with the highest counts. Sorting on the
# negated count is a stable descending sort, so ties keep their original order.
most_frequent_word = sorted(train_bow, key=lambda pair: -pair[1])[:100]
most_frequent_word[:10]

[('n', 251915),
 ('place', 99857),
 ('good', 90131),
 ('like', 84485),
 ('food', 80425),
 ('go', 70479),
 ('get', 68699),
 ('time', 67857),
 ('one', 63277),
 ('great', 58061)]

In [8]:
# Frequent words in rank order, their rank (used as a feature-slot index),
# and a set for fast membership tests.
w_lst = [word for word, _count in most_frequent_word]
mfw_id = {word: rank for rank, word in enumerate(w_lst)}
mfw_set = set(w_lst)

Считаем idf для всех слов обучающей выборки; в признаках далее используются значения только для выделенных частотных слов

In [9]:
from math import log


# Document frequency: in how many reviews each word occurs at least once.
doc_freq = {}
for rev in train_data:
    for w in set(rev):
        doc_freq[w] = doc_freq.get(w, 0) + 1

# idf(w) = log(number of reviews / document frequency of w), for every word seen.
n_reviews = len(train_data)
mfw_idf = {w: log(n_reviews / freq) for w, freq in doc_freq.items()}

Разбиваем всю обучающую выборку на train и validate

In [10]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The old module
# is kept only as a fallback for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out half of the training reviews for validation; the fixed random_state
# makes the split reproducible.
x_train, x_validate, y_train, y_validate = train_test_split(
    train_data, train_label, test_size=0.5, random_state=42)



In [11]:
def evaluate(y_true, y_pred):
    """Return the percentage (0-100) of positions where y_pred equals y_true.

    Pairs are formed with zip, so surplus items in the longer sequence are
    ignored, while the denominator is always len(y_true).
    """
    hits = 0
    for expected, predicted in zip(y_true, y_pred):
        if expected == predicted:
            hits += 1
    return hits * 100 / len(y_true)

In [12]:
from collections import defaultdict
import itertools
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.metrics.scores import precision, recall


def evaluate_classifier(features_extractor, train, train_l, test, test_l):
    """Train a Naive Bayes classifier and score it on a second sample.

    features_extractor -- callable that turns one review into a feature dict.
    train, test        -- sequences of reviews.
    train_l, test_l    -- labels aligned with train / test.

    Prints the classifier's most informative features and returns
    evaluate(test_l, predictions).
    """
    def featurize(reviews, labels):
        # Pair each review's extracted features with its label.
        return [(features_extractor(review), label)
                for review, label in zip(reviews, labels)]

    train_feats = featurize(train, train_l)
    test_feats = featurize(test, test_l)

    classifier = NaiveBayesClassifier.train(train_feats)
    classifier.show_most_informative_features()

    predictions = [classifier.classify(feats) for feats, _label in test_feats]
    return evaluate(test_l, predictions)

# Модель 1
Пробуем Naive Bayes. Так как каждое ревью — это просто список слов, признаками будут слова, входящие в этот список.
### Результат на validate: 
39.20

In [13]:
def word_feats(words):
    """Build a bag-of-words feature dict: every distinct word maps to True."""
    return {word: True for word in words}

# Model 1: Naive Bayes on word-presence features, trained on the train split
# and scored on the validation split; prints the accuracy in percent.
val = evaluate_classifier(word_feats, x_train, y_train, x_validate, y_validate)
print (val)

Most Informative Features
            unprofession = True                1 : 5      =    160.6 : 1.0
                incompet = True                1 : 5      =    106.7 : 1.0
              disrespect = True                1 : 4      =    100.7 : 1.0
                unaccept = True                1 : 5      =     64.0 : 1.0
                    ined = True                1 : 5      =     61.5 : 1.0
                   appal = True                1 : 5      =     61.1 : 1.0
                   crook = True                1 : 4      =     52.5 : 1.0
                 unappet = True                1 : 5      =     47.8 : 1.0
                  navoid = True                1 : 5      =     47.8 : 1.0
               dishonest = True                1 : 4      =     46.8 : 1.0
39.2026837260103


# Модель 2
Попробуем Naive Bayes, но теперь к признакам добавим ещё и популярные биграммы.
### Результат на validate:
34.78

In [14]:
from nltk.collocations import BigramCollocationFinder


def freq_scorer(n_ii, n_ix_xi_tuple, n_xx):
    """Bigram scoring function: returns n_ii divided by n_xx.

    The signature follows the three-argument form that nbest() is given in
    bigram_word_feats; presumably n_ii is the bigram's count and n_xx the
    total number of bigrams (NLTK scorer convention -- confirm).
    n_ix_xi_tuple is accepted but not used.
    """
    joint_count, total = n_ii, n_xx
    return joint_count / total

def bigram_word_feats(words, score_fn=freq_scorer, n=50):
    """Build a feature dict from a review's words plus its best bigrams.

    words    -- list of words of one review.
    score_fn -- scoring function handed to the collocation finder's nbest().
    n        -- how many top-scoring bigrams to add.

    Returns a dict mapping every word and every selected bigram tuple to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)

    feats = {}
    for feature in itertools.chain(words, top_bigrams):
        feats[feature] = True
    return feats

# Model 2: Naive Bayes on word + top-bigram presence features, trained on the
# train split and scored on the validation split; prints the accuracy in percent.
val = evaluate_classifier(bigram_word_feats, x_train, y_train, x_validate, y_validate)
print (val)

Most Informative Features
          ('food', 'ok') = True                2 : 5      =    161.4 : 1.0
            unprofession = True                1 : 5      =    160.6 : 1.0
                incompet = True                1 : 5      =    106.7 : 1.0
              disrespect = True                1 : 4      =    100.7 : 1.0
           ('give', '2') = True                2 : 5      =     97.2 : 1.0
     ('food', 'mediocr') = True                2 : 5      =     80.2 : 1.0
      ('food', 'poison') = True                1 : 5      =     74.8 : 1.0
        ('even', 'wors') = True                1 : 5      =     72.4 : 1.0
        ('high', 'hope') = True                2 : 5      =     65.1 : 1.0
                unaccept = True                1 : 5      =     64.0 : 1.0
34.77726634420346


# W2V
Обучим Word2Vec на предложениях из обучающей выборки

In [15]:
import gensim


# Train Word2Vec on the processed training sentences, leaving every
# hyper-parameter at the library default.
# NOTE(review): no seed or worker count is pinned here, so the vectors (and
# every score derived from them) may differ between runs -- confirm.
model = gensim.models.Word2Vec(sents)

In [16]:
import numpy as np

def rev2vec(rev, vec_len, model):
    """Turn one review (a sequence of words) into a fixed-length feature vector.

    Layout of the returned 1-D float array of length vec_len + 100:
      * [0, vec_len)             -- mean of the first vec_len components of
                                    model[w] over the review words found in
                                    `model` (a repeated word counts each time);
      * [vec_len, vec_len + 100) -- for each review word that is in mfw_set,
                                    slot mfw_id[word] holds mfw_idf[word];
                                    all other slots stay 0.

    rev     -- iterable of words.
    vec_len -- number of leading word-vector components to average.
    model   -- object supporting `word in model` and `model[word]`.

    A review with no word known to `model` gets an all-zero centroid; the
    previous implementation raised ZeroDivisionError in that case.
    """
    # The 100 trailing slots must cover every index stored in mfw_id.
    vec = np.zeros(vec_len + 100)

    known = 0
    for w in rev:
        if w in model:
            # Accumulate the word vector into the centroid part in one step.
            vec[:vec_len] += np.asarray(model[w])[:vec_len]
            known += 1
        if w in mfw_set:
            vec[mfw_id[w] + vec_len] = mfw_idf[w]

    # Guard the mean: only divide when at least one word vector was added.
    if known:
        vec[:vec_len] /= known

    return vec

def prepare2(data, model, vec_len=100):
    """Vectorise every review in `data` with rev2vec and stack the results.

    data    -- iterable of reviews (each a sequence of words).
    model   -- word-vector lookup passed through to rev2vec.
    vec_len -- centroid length passed through to rev2vec.

    Returns a numpy array holding one rev2vec vector per review.
    """
    return np.array([rev2vec(review, vec_len, model) for review in data])

## Построение обучающей выборки
Воспользуемся векторами из Word2Vec.

Каждому ревью ставится в соответствие вектор длины 200, где первые 100 элементов - это центроид векторов слов, входящих в данное ревью, оставшиеся 100 - idf для слов, которые входят в ревью и в 100 наиболее частотных слов.

In [17]:
# Feature matrix for the train split: one rev2vec vector per review
# (prepare2's default vec_len of 100 is used).
train_vecs = prepare2(x_train, model=model)

# Модели 3,4
Попробуем на построенных векторах обучить SVM (Random Forest).
### Результаты на validate:
<b>SVM</b>: 52.02<br/>
<b>RF</b>: 41.39
### Модификации
Была предпринята попытка использовать предобученные векторы Word2Vec, а также частоту слова в ревью вместо idf.
К сожалению, ни по отдельности, ни вместе данные модификации не привели к улучшению результата.

In [18]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier


# Models 3/4: fit a classifier on the review vectors of the train split.
# The active line is an SVM with a one-vs-one decision function shape; the
# commented line below is the Random Forest alternative mentioned in the text.
clf = svm.SVC(decision_function_shape='ovo')
# clf = RandomForestClassifier(n_estimators=10)
clf.fit(train_vecs,  y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
# Vectorise the whole validation split once (the same way train_vecs was
# built) and classify it with a single predict() call instead of one call
# per review.
validate_vecs = prepare2(x_validate, model=model)
predictions = list(clf.predict(validate_vecs))

# Validation accuracy in percent.
print (evaluate(y_validate, predictions))

51.9815883913247


# Построение обучающей выборки
Сделаем так, чтобы все ревью имели одинаковую длину (количество слов). И каждое слово в ревью представим в виде соответствующего ему вектора из Word2Vec

In [20]:
import numpy as np


# Number of word vectors kept per review; shorter reviews are zero-padded to this length.
rev_len = 80
# Length of each zero padding vector and feature size of the RNN input
# (assumed to equal the Word2Vec vector size -- TODO confirm).
vec_len = 100

def prepare(X, model):
    """Convert reviews into equally long sequences of word vectors.

    For every review, the vectors of its first rev_len words that are present
    in `model` are kept, in order; shorter results are padded at the end with
    zero vectors of length vec_len.

    X     -- sequence of reviews (each an iterable of words). It is NOT
             modified; the previous implementation overwrote its items in
             place, which broke any second call on the same list.
    model -- object supporting `word in model` and `model[word]`.

    Returns np.array of the per-review arrays, i.e. shape
    (len(X), rev_len, vec_len) when the word vectors have length vec_len.
    """
    batch = []
    for review in X:
        vectors = []
        for w in review:
            # Once rev_len vectors are collected the rest of the review is ignored.
            if len(vectors) == rev_len:
                break
            if w in model:
                vectors.append(np.array(model[w]))

        # Zero-pad short reviews up to rev_len entries.
        vectors.extend(np.zeros(vec_len) for _ in range(rev_len - len(vectors)))
        batch.append(np.array(vectors))
    return np.array(batch)

In [21]:
# Number of classes: length of the one-hot label vectors and size of the softmax output layer.
n_outs = 5
# Number of units in each LSTM layer.
n_hidden = 80
# How many leading training reviews/labels are used to train the RNN.
data_len = 50000

In [23]:
# Rebinds train_data from per-review word lists to the padded array of word
# vectors for the first data_len reviews.
# NOTE(review): re-running this cell would feed that array back into
# prepare(), so it is meant to run once per kernel session.
# NOTE(review): x_validate was split off the same train_data earlier, so these
# reviews presumably overlap the validation split -- confirm before comparing scores.
train_data = prepare(train_data[:data_len], model)

In [24]:
def prepare_labels(Y):
    """One-hot encode class labels.

    Y -- sequence of labels convertible with int(), each in 1..n_outs.
         It is NOT modified; the previous implementation overwrote its
         items in place.

    Returns an integer array of shape (len(Y), n_outs) in which row i has a
    single 1 at column int(Y[i]) - 1.

    Raises ValueError for a label outside 1..n_outs. Without this check a
    label of 0 (or a negative one) would silently wrap around to another
    class through negative indexing.
    """
    classes = np.array([int(label) - 1 for label in Y], dtype=int)
    if classes.size and (classes.min() < 0 or classes.max() >= n_outs):
        raise ValueError('labels must be in the range 1..%d' % n_outs)

    encoded = np.zeros((len(classes), n_outs), dtype=int)
    # Set one cell per row: (row index, class index).
    encoded[np.arange(len(classes)), classes] = 1
    return encoded

Выполним one-hot encoding для классов (оценок ревью)

In [25]:
# Rebinds train_label to the one-hot matrix for the first data_len labels,
# matching the reviews selected for train_data.
train_label = prepare_labels(train_label[:data_len])

# Модель 5
Воспользуемся RNN с тремя LSTM-слоями.
### Результаты: 
55.20
### Результаты на kaggle:
53.3

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, Flatten, Reshape
from keras.optimizers import SGD
from keras.layers import Masking, Dense, Dropout, Activation
from keras.layers import LSTM, SimpleRNN, GRU


def compile_model(model, learning_rate=0.01, d=1e-6):
    """Compile `model` for multi-class training and return the same object.

    model         -- Keras model to compile.
    learning_rate -- learning rate passed to the SGD optimizer.
    d             -- decay passed to the SGD optimizer.

    The loss is categorical cross-entropy and the reported metric is
    categorical accuracy.
    """
    optimizer = SGD(lr=learning_rate, decay=d)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return model

def train_model(model, train_dataset, train_labels, bs=80, epoch=20):
    """Fit `model` on the given samples and labels; returns None.

    bs    -- forwarded to fit() as batch_size.
    epoch -- forwarded to fit() as nb_epoch.
    """
    fit_options = {'batch_size': bs, 'nb_epoch': epoch}
    model.fit(train_dataset, train_labels, **fit_options)

def rnn_model(seq_len):
    """Build the (uncompiled) recurrent classifier.

    seq_len -- number of timesteps per input sequence; each timestep is a
               vector of length vec_len.

    Layers, in order: a Masking layer with mask value 0, three LSTM layers of
    n_hidden units (the first two return full sequences), and a softmax Dense
    layer with n_outs outputs.
    """
    input_shape = (seq_len, vec_len)

    net = Sequential()
    net.add(Masking(mask_value=0., input_shape=input_shape))
    net.add(LSTM(n_hidden, input_shape=input_shape, return_sequences=True))
    net.add(LSTM(n_hidden, return_sequences=True))
    net.add(LSTM(n_hidden))
    net.add(Dense(n_outs, activation='softmax'))
    return net

Using TensorFlow backend.


In [27]:
# Model 5: build the LSTM network for sequences of rev_len word vectors and
# compile it with compile_model's default learning rate and decay.
model2 = compile_model(rnn_model(rev_len))

In [28]:
# Train the RNN on the prepared word-vector sequences and one-hot labels,
# using train_model's default batch size and epoch count.
train_model(model2, train_data, train_label)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
# Preprocess every test review the same way as the training reviews:
# lower-case, split into sentences, tokenize, drop stop words, stem, and
# flatten into one word list stored back into test_set[i][1].
# The text is lower-cased BEFORE stop-word filtering, as was done for the
# training text; otherwise capitalised stop words (e.g. "The") are not matched
# by the stop-word set (assumed to be all lower-case -- confirm) and stay in.
for i in range(len(test_set)):
    text = test_set[i][1].lower()

    words = []
    for sentence in nltk.sent_tokenize(text):
        tokens = tokenizer.tokenize(sentence)
        words.extend(stemmer.stem(w) for w in tokens if w not in sw)
    test_set[i][1] = words

In [30]:
# Lower-case every token of every test review; each word list is updated in place.
for row in test_set:
    row_words = row[1]
    row_words[:] = [token.lower() for token in row_words]

In [31]:
def transform(s):
    """Convert one review into a (rev_len, vec_len)-shaped array of word vectors.

    s -- iterable of words.

    The vectors of the first rev_len words found in the global `model` are
    kept in order; if fewer are found, zero vectors of length vec_len are
    appended until rev_len entries exist.
    """
    kept = []
    for w in s:
        if w in model and len(kept) < rev_len:
            kept.append(np.array(model[w]))

    for _ in range(rev_len - len(kept)):
        kept.append(np.zeros(vec_len))

    return np.array(kept)

In [32]:
def predict(s):
    """Predict the class of one preprocessed review with model2.

    s -- iterable of words, converted with transform().

    Returns the index of the highest-scoring output plus one, i.e. a value
    on the same 1-based scale that prepare_labels decodes from.
    """
    batch = np.array([transform(s)])
    class_scores = model2.predict(batch)[0]
    return np.argmax(class_scores) + 1

In [33]:
# Write the submission file: a header row followed by one (ID, predicted
# sentiment) row per test review.
# `with` guarantees the file is flushed and closed even if a prediction
# fails, and newline='' lets the csv writer control line endings itself.
with open('test.out', 'w', newline='') as test_out:
    writer = csv.writer(test_out)
    writer.writerow(['ID', 'Sentiment'])
    for i, s in test_set:
        writer.writerow([i, predict(s)])