# Imports

In [10]:
import pandas as pd
import numpy as np
import re

import torch
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import TweetTokenizer

# Constants

In [11]:
POSITIVE_TWEETS_CSV = 'positive.csv'
NEGATIVE_TWEETS_CSV = 'negative.csv'

VOCAB_SIZE = 5000

# Load Data

In [12]:
tweets_col_number = 3

negative_tweets = pd.read_csv(
    'negative.csv', header=None, delimiter=';')[[tweets_col_number]]
positive_tweets = pd.read_csv(
    'positive.csv', header=None, delimiter=';')[[tweets_col_number]]

# Stemmer

In [13]:
stemer = RussianStemmer()
regex = re.compile('[^а-яА-Я ]')
stem_cache = {}

def get_stem(token):
    stem = stem_cache.get(token, None)
    if stem:
        return stem
    token = regex.sub('', token).lower()
    stem = stemer.stem(token)
    stem_cache[token] = stem
    return stem

# Create Vocabulary 

In [14]:
stem_count = Counter()
tokenizer = TweetTokenizer()

def count_unique_tokens_in_tweets(tweets):
    for _, tweet_series in tweets.iterrows():
        tweet = tweet_series[3]
        tokens = tokenizer.tokenize(tweet)
        for token in tokens:
            stem = get_stem(token)
            stem_count[stem] += 1

count_unique_tokens_in_tweets(negative_tweets)
count_unique_tokens_in_tweets(positive_tweets)

In [15]:
print("Total unique stems found: ", len(stem_count))

Total unique stems found:  91780


In [16]:
vocab = sorted(stem_count, key=stem_count.get, reverse=True)[:VOCAB_SIZE]
print(vocab[:100])

['', 'не', 'я', 'и', 'в', 'на', 'а', 'что', 'так', 'с', 'эт', 'как', 'у', 'мен', 'мне', 'все', 'но', 'он', 'ты', 'теб', 'ну', 'мо', 'то', 'уж', 'по', 'был', 'ещ', 'за', 'да', 'вот', 'же', 'тольк', 'нет', 'сегодн', 'о', 'прост', 'бы', 'над', 'когд', 'хоч', 'очен', 'к', 'сам', 'ден', 'будет', 'мы', 'от', 'хорош', 'из', 'есл', 'тепер', 'тож', 'буд', 'сво', 'год', 'даж', 'завтр', 'нов', 'дом', 'до', 'там', 'ест', 'вообщ', 'ег', 'вс', 'дела', 'пот', 'одн', 'для', 'больш', 'хот', 'спасиб', 'мог', 'сейчас', 'е', 'себ', 'нас', 'блин', 'раз', 'кто', 'дума', 'утр', 'котор', 'любл', 'поч', 'зна', 'говор', 'лучш', 'нич', 'без', 'ил', 'вы', 'друг', 'тут', 'чтоб', 'всем', 'бол', 'люд', 'сдела', 'сказа']


In [20]:
idx = 2
print("stem: {}, count: {}"
      .format(vocab[idx], stem_count.get(vocab[idx])))

stem: я, count: 66045


In [21]:
token_2_idx = {vocab[i] : i for i in range(VOCAB_SIZE)}
len(token_2_idx)

5000

# Convert text to vectors

In [22]:
def tweet_to_vector(tweet, show_unknowns=False):
    vector = np.zeros(VOCAB_SIZE, dtype=np.int_)
    for token in tokenizer.tokenize(tweet):
        stem = get_stem(token)
        idx = token_2_idx.get(stem, None)
        if idx is not None:
            vector[idx] = 1
        elif show_unknowns:
            print("Unknown token: {}".format(token))
    return vector

In [23]:
tweet = negative_tweets.iloc[1][3]
print("tweet: {}".format(tweet))
print("vector: {}".format(tweet_to_vector(tweet)[:10]))
print(vocab[5])

tweet: Коллеги сидят рубятся в Urban terror, а я из-за долбанной винды не могу :(
vector: [1 1 1 0 1 0 1 0 0 0]
на


In [24]:
tweet_vectors = np.zeros(
    (len(negative_tweets) + len(positive_tweets), VOCAB_SIZE), 
    dtype=np.float32)
tweets = []
for ii, (_, tweet) in enumerate(negative_tweets.iterrows()):
    tweets.append(tweet[3])
    tweet_vectors[ii] = tweet_to_vector(tweet[3])
for ii, (_, tweet) in enumerate(positive_tweets.iterrows()):
    tweets.append(tweet[3])
    tweet_vectors[ii + len(negative_tweets)] = tweet_to_vector(tweet[3])

# Preparing labels

In [25]:
labels = np.append(
    np.zeros(len(negative_tweets), dtype=np.int64), 
    np.ones(len(positive_tweets), dtype=np.int64))

# Preparing the data for the training

In [26]:
X = torch.from_numpy(tweet_vectors)
y = torch.from_numpy(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Creating a neural network architecture

In [27]:
class senNet(torch.nn.Module):
    def __init__(self):
        super(senNet,self).__init__()
        self.fc1=torch.nn.Linear(VOCAB_SIZE,125)
        self.act1=torch.nn.ReLU()
        self.fc2=torch.nn.Linear(125,25)
        self.act2=torch.nn.ReLU()
        self.fc3=torch.nn.Linear(25,2)
        
    
    def forward(self,x):
        x=self.fc1(x)
        x=self.act1(x)
        x=self.fc2(x)
        x=self.act2(x)
        x=self.fc3(x)
        return x

sen_net = senNet()

In [28]:
loss=torch.nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(sen_net.parameters(), lr=1.0e-3)

# Training the neural network

In [30]:
batch_size = 100

test_accuracy_history = []
test_loss_history = []

# X_test = X_test.to(device)
# y_test = y_test.to(device)

for epoch in range(5):
    order = np.random.permutation(len(X_train))
    
    for start_index in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        
        batch_indexes = order[start_index:start_index+batch_size]
        
        X_batch = X_train[batch_indexes] #.to(device)
        y_batch = y_train[batch_indexes] #.to(device)
        
        preds = sen_net.forward(X_batch) 
        
        loss_value = loss(preds, y_batch)
        loss_value.backward()
        
        optimizer.step()
    
    
    test_preds = sen_net.forward(X_test)
    test_loss_history.append(loss(test_preds, y_test))
    
    accuracy = (test_preds.argmax(dim=1) == y_test).float().mean()
    test_accuracy_history.append(accuracy)
    print(accuracy)

tensor(0.7280)
tensor(0.7372)
tensor(0.7388)
tensor(0.7336)
tensor(0.7262)


# Testing our neural network

In [31]:
def test_tweet(tweet):
    tweet_vector = tweet_to_vector(tweet, True)
    tweet_vector = torch.from_numpy(tweet_vector).float()
    print(tweet_vector)
    
    print('forw:',sen_net.forward(tweet_vector))
    positive_prob = sen_net.forward(tweet_vector)[1]
    print('Original tweet: {}'.format(tweet))
    print('P(positive) = {:.5f}. Result: '.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [32]:
tweets_for_testing = [
    "Ужасное приложение",
    "Я ухожу отсюда, сплошные мошенники",
    "Отвратительная поддержка",
    "Прекрасное приложение!",
    "Лучшее приложение в маркете!",
    "Не могу связаться с поддержкой банка.",
    "Приложение не открывается после обновления!!!!"
]
for tweet in tweets_for_testing:
    test_tweet(tweet) 
    print("---------")

tensor([0., 0., 0.,  ..., 0., 0., 0.])
forw: tensor([ 2.0739, -1.4104], grad_fn=<AddBackward0>)
Original tweet: Ужасное приложение
P(positive) = -1.41044. Result:  Negative
---------
Unknown token: мошенники
tensor([1., 0., 1.,  ..., 0., 0., 0.])
forw: tensor([ 1.3401, -0.8680], grad_fn=<AddBackward0>)
Original tweet: Я ухожу отсюда, сплошные мошенники
P(positive) = -0.86801. Result:  Negative
---------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
forw: tensor([ 4.4666, -3.8291], grad_fn=<AddBackward0>)
Original tweet: Отвратительная поддержка
P(positive) = -3.82913. Result:  Negative
---------
tensor([1., 0., 0.,  ..., 0., 0., 0.])
forw: tensor([-1.7567,  1.8760], grad_fn=<AddBackward0>)
Original tweet: Прекрасное приложение!
P(positive) = 1.87600. Result:  Positive
---------
tensor([1., 0., 0.,  ..., 0., 0., 0.])
forw: tensor([-2.0574,  2.1914], grad_fn=<AddBackward0>)
Original tweet: Лучшее приложение в маркете!
P(positive) = 2.19135. Result:  Positive
---------
tensor([1., 1., 0.,  ..., 

# Converting the model to Pickle format

In [33]:
import dill as pickle
filename = 'model_v1.pk'
with open('./models/'+filename, 'wb') as file:
    pickle.dump(sen_net,file)

# Trying to load the model back

In [34]:
with open('./models/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)

## Testing again

In [35]:
def test_tweet(tweet):
    tweet_vector = tweet_to_vector(tweet, True)
    tweet_vector = torch.from_numpy(tweet_vector).float()
    print(tweet_vector)
    
    print('forw:',type(loaded_model))
    positive_prob = loaded_model.forward(tweet_vector)[1]
    print('Original tweet: {}'.format(tweet))
    print('P(positive) = {:.5f}. Result: '.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [37]:
tweets_for_testing = [
    "Ужасное приложение",
    "Я ухожу отсюда, сплошные мошенники",
    "Отвратительная поддержка",
    "Прекрасное приложение!",
    "Лучшее приложение в маркете!",
    "Не могу связаться с поддержкой банка.",
    "Приложение не открывается после обновления!!!"
]
for tweet in tweets_for_testing:
    test_tweet(tweet) 
    print("---------")

tensor([0., 0., 0.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Ужасное приложение
P(positive) = -1.41044. Result:  Negative
---------
Unknown token: мошенники
tensor([1., 0., 1.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Я ухожу отсюда, сплошные мошенники
P(positive) = -0.86801. Result:  Negative
---------
tensor([0., 0., 0.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Отвратительная поддержка
P(positive) = -3.82913. Result:  Negative
---------
tensor([1., 0., 0.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Прекрасное приложение!
P(positive) = 1.87600. Result:  Positive
---------
tensor([1., 0., 0.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Лучшее приложение в маркете!
P(positive) = 2.19135. Result:  Positive
---------
tensor([1., 1., 0.,  ..., 0., 0., 0.])
forw: <class '__main__.senNet'>
Original tweet: Не могу связаться с поддержкой банка.
P(positive) = 0.19291. Res

# Thank you for your attention!