In [25]:
import numpy as np
from pymystem3 import Mystem
import csv
from collections import defaultdict

In [26]:
# Single shared analyzer instance; tokenize() below calls its lemmatize() method.
mystem = Mystem()

# Characters deleted from the raw text before it is handed to the lemmatizer.
_STRIPPED_CHARS = str.maketrans('', '', ',.!?\n)(:;')


def tokenize(line):
    """Turn one raw line of text into a list of lemmas.

    The characters , . ! ? ) ( : ; and newlines are deleted, the text is
    lower-cased, 'ё' is replaced by 'е', and the result is passed to
    ``mystem.lemmatize``. Tokens returned by the lemmatizer that consist
    only of spaces, newlines and hyphens are dropped.

    Args:
        line: raw text (str).

    Returns:
        list of the remaining tokens, in the order the lemmatizer produced them.
    """
    line = line.translate(_STRIPPED_CHARS).lower().replace('ё', 'е')
    words = mystem.lemmatize(line)
    # A substring test such as `x not in ' \n-'` only rejects tokens that are
    # substrings of that exact string, so separators like '  ' or ' - ' would
    # leak through as "words". Stripping the separator characters and keeping
    # the tokens that are still non-empty rejects every separator-only token.
    return [word for word in words if word.strip(' \n-')]

In [27]:
def load_data(path='train.txt'):
    """Read the training texts and tokenize them line by line.

    Args:
        path: file to read; defaults to 'train.txt' in the working directory.

    Returns:
        list with one ``tokenize(line)`` result per line of the file.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm it matches how the file was saved.
    with open(path) as data:
        # Iterating the handle streams the lines instead of first building the
        # full list with readlines().
        return [tokenize(line) for line in data]
    
def load_test(path='test.txt'):
    """Read the test texts and tokenize them line by line.

    Args:
        path: file to read; defaults to 'test.txt' in the working directory.

    Returns:
        list with one ``tokenize(line)`` result per line of the file.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm it matches how the file was saved.
    with open(path) as data:
        # Iterating the handle streams the lines instead of first building the
        # full list with readlines().
        return [tokenize(line) for line in data]

def load_labels(path='labels.txt'):
    """Read one integer rating per line.

    Args:
        path: file to read; defaults to 'labels.txt' in the working directory.

    Returns:
        list of ints, in file order.

    Raises:
        ValueError: if a line cannot be parsed by ``int`` (this includes a
            blank line).
    """
    with open(path) as data:
        # int() tolerates the trailing newline and surrounding whitespace.
        return [int(line) for line in data]

In [7]:
# Tokenized training texts: one list of tokens per line of the training file.
train = load_data()
# Ratings wrapped in a NumPy array in one step, so `labels` never holds a plain list.
labels = np.array(load_labels())

In [51]:
# Tokenized test texts in file order; the prediction cells below print one rating per entry.
test_data = load_test()

In [9]:
# Constant baseline: the mean of the training labels, rounded to an integer.
mean_rate = int(round(np.mean(labels)))

# Baseline submission: the same mean rating for every test example.
def mean_rating():
    """Print ``mean_rate`` on its own line once per element of ``test_data``.

    Nothing is returned; the output goes to stdout only.
    """
    for _example in test_data:
        print(mean_rate)

# Extracting word sentiment features

In [10]:
# Smoke test of the tokenizer on a sentence with a hyphen, a comma, a capital letter and 'ё'.
tokenize('так-себе дядя, ведь не даром Москва спалённая пожаром французу отдана?')

['так',
 'себе',
 'дядя',
 'ведь',
 'не',
 'даром',
 'москва',
 'спалить',
 'пожар',
 'француз',
 'отдавать']

In [31]:
# Word -> sentiment mark; words that are not listed read as 0.
word_tone = defaultdict(int)

# newline='' is what the csv module asks for when it is handed a file object,
# so that line endings are interpreted by the reader itself.
with open('sentiment_words.csv', encoding='cp1251', newline='') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=';')
    next(csv_reader, None)  # skip the first row (presumably the header)
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line; row[3] would raise IndexError.
            continue
        # Column 0 is used as the key, column 3 is parsed as the integer mark.
        mark = int(row[3])
        if mark != 0:
            # Zero marks are not stored: the defaultdict already yields 0 for them.
            word_tone[row[0]] = mark

In [37]:
# Manual entries written on top of whatever the CSV lexicon provided.
word_tone.update({
    'хрень': -4,
    'хреновый': -4,
    'отстой': -4,
    'бред': -4,
    'бредятина': -4,
    'топ': 4,
    'топовый': 4,
})

In [44]:
def word_tone_features(data):
    """Build seven sentiment features for every tokenized example.

    Each word's tone is looked up in ``word_tone`` (0 when absent) and its
    sign is flipped when 'не' is one of the two preceding tokens. Positive
    tones are summed into ``positive`` and the magnitudes of the remaining
    tones into ``negative``. Both sums start at 1, so ``negative`` is never
    zero and the ratio feature stays finite. Both are then divided by the
    example length.

    Args:
        data: iterable of examples, each a list of tokens.

    Returns:
        list with one tuple per example:
        ``(length, positive, negative, positive / negative,
        positive * negative, positive * positive, negative * negative)``.
        An empty example yields ``(0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)``.
    """
    features = []
    for example in data:
        positive = 1
        negative = 1

        for index, word in enumerate(example):
            # .get() instead of indexing: indexing a defaultdict stores a new
            # entry for every unknown word, so each call would grow the lexicon.
            tone = word_tone.get(word, 0)
            # Negation window: the (at most) two tokens right before this one.
            if 'не' in example[max(0, index - 2):index]:
                tone = -tone

            if tone > 0:
                positive += tone
            else:
                negative -= tone

        length = len(example)
        # An empty example (e.g. a blank input line) would otherwise raise
        # ZeroDivisionError; dividing by 1 leaves the initial values unchanged.
        scale = length or 1
        positive /= scale
        negative /= scale
        features.append((
            length,
            positive,
            negative,
            positive / negative,
            positive * negative,
            positive * positive,
            negative * negative,
        ))

    return features

In [45]:
# Training design matrix: one row per training example, one column per feature.
X = np.array(word_tone_features(train))
print(X.shape)

(20000, 7)


In [52]:
from sklearn.linear_model import LinearRegression, Ridge, LassoLars

# Linear-regression baseline fitted on the sentiment features.
reg = LinearRegression().fit(X, labels)

In [53]:
# Score the test set with the linear model and print one integer rating per line.
test_X = word_tone_features(test_data)
result = reg.predict(test_X)

# Round to whole ratings and clamp both ends of the scale. Enforcing only the
# upper bound (10) would let an unbounded linear prediction below the smallest
# training label (labels.min() prints 1 in this notebook) come out as 0 or a
# negative rating.
result = np.clip(np.round(result), 1, 10)

for x in result:
    print(int(x))

8
8
8
7
8
8
8
8
8
8
8
8
8
8
9
8
10
7
8
7
7
10
8
7
7
8
7
7
8
8
5
8
8
9
7
8
8
8
9
8
8
8
8
7
7
8
7
8
7
9
8
9
8
8
8
8
6
8
8
8
8
8
7
9
9
9
8
8
8
9
9
7
9
8
7
8
7
8
8
8
7
8
9
9
9
8
7
10
10
8
9
7
8
8
8
7
9
8
8
8
8
7
8
8
8
7
8
7
8
6
8
9
8
8
8
8
8
9
8
7
8
7
8
8
7
8
7
7
8
8
7
7
7
8
10
8
7
8
9
7
8
8
8
8
8
9
7
8
7
8
9
8
8
8
8
9
8
8
7
8
7
9
8
7
8
7
8
8
7
8
7
8
8
8
8
9
7
8
8
8
8
8
9
8
8
8
9
8
8
8
8
9
7
6
7
8
7
7
8
8
5
8
7
8
8
8
8
8
9
8
8
9
8
8
7
8
8
7
8
8
7
8
8
9
8
8
8
7
8
9
8
8
7
9
10
8
10
7
7
7
9
8
8
8
6
7
8
8
8
8
8
7
8
8
8
9
7
8
7
8
8
8
8
7
10
8
9
8
7
8
8
9
8
8
8
8
9
8
8
7
9
8
8
8
8
8
7
8
8
8
6
7
8
8
8
8
5
7
7
8
7
8
8
9
8
9
8
8
8
10
8
9
7
9
8
7
8
9
8
7
8
9
7
8
8
7
8
9
10
9
7
9
9
8
7
9
8
8
8
8
7
8
8
7
8
8
8
8
8
8
9
9
7
8
8
8
8
8
8
8
7
8
10
9
7
8
8
8
8
8
8
7
8
7
8
10
7
8
8
3
8
7
8
7
5
8
8
8
8
8
9
8
9
7
8
9
9
7
8
8
8
8
8
8
10
8
8
8
8
7
8
8
7
8
8
8
7
8
7
8
8
8
7
8
9
7
9
7
9
7
8
8
8
8
8
7
8
7
8
8
8
8
9
9
9
8
8
7
9
7
8
8
8
8
8
8
8
8
7
7
10
8
8
7
8
8
7
8
7
7
8
8
8
7
8
9
8
8
8
8
8
6
8
8
8
7
8
8
7
8
8
9
8


In [55]:
from sklearn.svm import SVR
# NOTE(review): despite the name `ridge_reg`, this fits an SVR model, not a Ridge regression.
ridge_reg = SVR(gamma='scale', C=1.0, epsilon=0.2).fit(X, labels)

In [54]:
# Score the test set with the SVR model and print one integer rating per line.
# NOTE(review): the ValueError recorded under this cell reports a model fitted
# on 3 features being given 7, i.e. the model was stale when the cell last ran;
# the SVR fitting cell has to run before this one.
test_X = word_tone_features(test_data)
result = ridge_reg.predict(test_X)

# Round to whole ratings and clamp to the 1..10 scale so an out-of-range
# regression output is never printed as a rating.
result = np.clip(np.round(result), 1, 10)

for x in result:
    print(int(x))

ValueError: X.shape[1] = 7 should be equal to 3, the number of features at training time

In [20]:
from sklearn.neural_network import MLPRegressor


# Multi-layer perceptron regressor with two hidden layers of 16 units each.
model = MLPRegressor(
    hidden_layer_sizes=(16, 16),
    activation='relu',
    solver='adam',
    # NOTE(review): per the scikit-learn docs `learning_rate` is only used with
    # solver='sgd', so 'adaptive' appears to have no effect here -- confirm.
    learning_rate='adaptive',
    max_iter=1000,
    learning_rate_init=0.01,
    alpha=0.01,
    # Fixed seed: training involves random initialisation, so without it each
    # run of the notebook prints different predictions.
    random_state=0,
)

In [29]:
# Fit the perceptron on the training features, then score the test set and
# print one integer rating per line.
nn = model.fit(X, labels)
test_X = word_tone_features(test_data)
result = nn.predict(test_X)

# Round to whole ratings and clamp to the 1..10 scale so an out-of-range
# regression output is never printed as a rating.
result = np.clip(np.round(result), 1, 10)

for x in result:
    print(int(x))

9
8
8
8
7
7
6
7
7
7
8
9
7
7
8
8
8
8
8
8
6
7
9
8
8
8
9
5
8
8
8
9
7
8
9
7
8
7
8
8
9
9
9
8
9
9
7
8
8
8
7
8
8
8
8
8
7
7
9
9
9
8
9
8
9
8
8
8
7
7
8
7
8
7
9
8
8
9
9
8
8
8
8
9
8
9
8
8
7
9
8
8
8
8
7
8
10
8
8
9
8
8
9
8
8
9
8
8
8
7
7
6
9
8
7
8
8
8
8
8
8
8
7
8
8
7
9
8
9
8
8
8
8
9
8
8
8
9
7
8
8
9
8
9
7
7
8
8
8
7
10
8
8
9
8
8
8
8
7
8
9
9
8
8
7
8
8
9
8
8
8
8
8
9
8
8
7
7
7
8
8
8
9
9
8
9
8
8
7
9
8
7
8
8
9
8
8
6
9
7
7
7
9
7
9
5
9
9
9
9
7
8
8
8
7
7
9
6
7
7
7
8
8
9
8
9
9
7
7
9
9
9
5
9
8
8
8
9
7
8
9
8
8
8
8
9
8
8
9
8
8
8
9
8
8
7
7
7
7
9
8
8
7
7
9
8
9
8
9
8
9
8
8
7
8
8
8
9
7
8
8
7
8
9
9
9
9
8
7
9
8
8
7
9
7
8
8
7
8
8
7
8
7
8
9
8
8
7
8
8
7
8
7
8
7
8
9
8
7
8
9
8
7
7
7
7
9
8
9
8
8
7
7
9
8
9
8
8
7
7
8
8
8
8
8
8
8
7
8
7
8
8
6
8
9
9
9
8
8
9
8
8
9
7
8
8
8
8
8
8
7
9
9
8
8
9
6
8
8
8
9
8
8
8
8
9
8
7
8
9
9
8
7
7
8
8
8
8
9
8
7
6
8
9
8
8
7
8
7
8
9
9
8
8
8
8
7
8
8
8
8
7
8
8
8
7
8
8
9
8
7
7
7
9
9
8
8
8
8
7
8
8
8
8
9
8
7
9
7
8
9
8
8
9
8
8
8
8
8
8
7
8
8
8
7
8
8
8
8
7
7
8
8
8
8
7
8
8
8
7
8
7
8
7
8
7
7
8
8
8
7
8
8
8
8
8
7
9
8


In [30]:
# Smallest rating present in the training labels.
print(labels.min())

1
