In [1]:
import re
from collections import Counter

import nltk.classify.util
import pandas as pd
from nltk.classify import NaiveBayesClassifier
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import TweetTokenizer


In [2]:
# Paths to the labelled tweet corpora (read below as ';'-delimited CSV files
# without a header row). Each constant now names the file it actually holds.
POSITIVE_TWEETS_CSV = 'datasets/positive.csv'
NEGATIVE_TWEETS_CSV = 'datasets/negative.csv'

# Number of most frequent stems kept when the vocabulary is built.
VOCAB_SIZE = 5000

# Label of the CSV column that is selected as the tweet text.
tweets_col_number = 3

# Double brackets keep each result a single-column DataFrame.
negative_tweets = pd.read_csv(
    NEGATIVE_TWEETS_CSV, header=None, delimiter=';')[[tweets_col_number]]
positive_tweets = pd.read_csv(
    POSITIVE_TWEETS_CSV, header=None, delimiter=';')[[tweets_col_number]]

stemer = RussianStemmer()
# Matches every character that is not in а-я, А-Я or a space.
# NOTE(review): 'ё'/'Ё' lie outside those ranges, so they are stripped as well.
regex = re.compile('[^а-яА-Я ]')
# Memoisation table used by get_stem().
stem_cache = {}


def get_stem(token):
    """Return the stem of ``token``, memoised in ``stem_cache``.

    The token is cleaned with ``regex`` (every matched character is removed),
    lower-cased and passed to ``stemer.stem``. The result may be an empty
    string when nothing is left after cleaning.

    The cache is keyed by the *raw* token, i.e. the same value that is used
    for the lookup, so repeated tokens actually hit the cache. Cached empty
    stems count as hits as well (``is not None`` instead of a truthiness test).

    Args:
        token: a single token string as produced by the tokenizer.

    Returns:
        The stem string for the cleaned token.
    """
    cached = stem_cache.get(token)
    if cached is not None:
        return cached
    cleaned = regex.sub('', token).lower()
    stem = stemer.stem(cleaned)
    stem_cache[token] = stem
    return stem


# Running frequency of every stem; incremented by count_unique_tokens_in_tweets().
stem_count = Counter()
# Shared tokenizer instance used to split tweet text into tokens.
tokenizer = TweetTokenizer()

In [3]:
def count_unique_tokens_in_tweets(tweets):
    """Add the stem frequencies of every tweet in ``tweets`` to ``stem_count``.

    Each tweet is tokenized with ``tokenizer`` and every token is reduced to
    its stem with ``get_stem``; the module-level ``stem_count`` counter is
    incremented once per token. Empty stems are counted too (under the key
    ``''``).

    Args:
        tweets: DataFrame holding the tweet text in the column labelled
            ``tweets_col_number``.
    """
    # Iterate the text column directly instead of materialising a Series per
    # row with iterrows(); the visited values and their order are the same.
    for tweet in tweets[tweets_col_number]:
        stem_count.update(get_stem(token) for token in tokenizer.tokenize(tweet))


# Accumulate stem frequencies of the negative tweets into stem_count.
count_unique_tokens_in_tweets(negative_tweets)

In [4]:
# Accumulate stem frequencies of the positive tweets into the same counter.
count_unique_tokens_in_tweets(positive_tweets)

In [5]:
# Number of distinct keys in stem_count after both corpora have been counted.
print("Total unique stems found: ", len(stem_count))

Total unique stems found:  91780


In [6]:
# Keep the VOCAB_SIZE most frequent stems, most frequent first; stems with
# equal counts stay in the order in which they were first counted.
vocab = [stem for stem, _ in stem_count.most_common(VOCAB_SIZE)]
print(vocab[:100])

['', 'не', 'я', 'и', 'в', 'на', 'а', 'что', 'так', 'с', 'эт', 'как', 'у', 'мен', 'мне', 'все', 'но', 'он', 'ты', 'теб', 'ну', 'мо', 'то', 'уж', 'по', 'был', 'ещ', 'за', 'да', 'вот', 'же', 'тольк', 'нет', 'сегодн', 'о', 'прост', 'бы', 'над', 'когд', 'хоч', 'очен', 'к', 'сам', 'ден', 'будет', 'мы', 'от', 'хорош', 'из', 'есл', 'тепер', 'тож', 'буд', 'сво', 'год', 'даж', 'завтр', 'нов', 'дом', 'до', 'там', 'ест', 'вообщ', 'ег', 'вс', 'дела', 'пот', 'одн', 'для', 'больш', 'хот', 'спасиб', 'мог', 'сейчас', 'е', 'себ', 'нас', 'блин', 'раз', 'кто', 'дума', 'утр', 'котор', 'любл', 'поч', 'зна', 'говор', 'лучш', 'нич', 'без', 'ил', 'вы', 'друг', 'тут', 'чтоб', 'всем', 'бол', 'люд', 'сдела', 'сказа']


In [7]:
# Spot-check one vocabulary entry together with its frequency.
idx = 2
print("stem: {}, count: {}"
      .format(vocab[idx], stem_count.get(vocab[idx])))

# Map every vocabulary stem to its rank in vocab. enumerate() walks exactly
# the stems that exist, so this no longer raises IndexError when fewer than
# VOCAB_SIZE distinct stems were found.
token_2_idx = {stem: position for position, stem in enumerate(vocab)}

print(token_2_idx['сказа'])


stem: я, count: 66045
99


In [8]:
def tweet_to_feachure(tweet, show_unknowns=False):
    """Turn a tweet into a ``{'contains-word(<stem>)': True}`` feature dict.

    The tweet is tokenized with ``tokenizer`` and each token is stemmed with
    ``get_stem``. Tokens whose stem is empty contribute no feature.

    Args:
        tweet: the tweet text.
        show_unknowns: when true, print every token whose stem is empty.

    Returns:
        A dict with one ``True`` entry per distinct non-empty stem.
    """
    features = {}
    for raw_token in tokenizer.tokenize(tweet):
        stemmed = get_stem(raw_token)
        if not stemmed:
            # Nothing usable is left of this token after stemming.
            if show_unknowns:
                print("Unknown token: {}".format(raw_token))
            continue
        features['contains-word(%s)' % stemmed] = True
    return features


# Demonstrate the feature extraction on the second negative tweet. The text
# column is addressed through tweets_col_number instead of a literal 3.
tweet = negative_tweets[tweets_col_number].iloc[1]
print("tweet: {}".format(tweet))
print("tweet_to_feachure: {}".format(tweet_to_feachure(tweet)))
# Sixth most frequent stem of the vocabulary.
print(vocab[5])

tweet: Коллеги сидят рубятся в Urban terror, а я из-за долбанной винды не могу :(
tweet_to_feachure: {'contains-word(коллег)': True, 'contains-word(сид)': True, 'contains-word(руб)': True, 'contains-word(в)': True, 'contains-word(а)': True, 'contains-word(я)': True, 'contains-word(изз)': True, 'contains-word(долба)': True, 'contains-word(винд)': True, 'contains-word(не)': True, 'contains-word(мог)': True}
на


In [9]:
# Raw text of every tweet: all negatives first, then all positives.
neg_texts = list(negative_tweets[tweets_col_number])
pos_texts = list(positive_tweets[tweets_col_number])
tweets = neg_texts + pos_texts

# Labelled examples as (feature dict, label) pairs, one per tweet.
neg_tweets = [(tweet_to_feachure(text), 'neg') for text in neg_texts]
pos_tweets = [(tweet_to_feachure(text), 'pos') for text in pos_texts]

In [10]:
# Show the second positive example as a (feature dict, label) pair.
print(pos_tweets[1])

In [11]:
# Split position inside each class list at three quarters of its length.
# Floor division keeps the cutoffs integral, so they are valid slice bounds
# as they are (wrapping them in int() remains harmless).
negcutoff = len(neg_tweets) * 3 // 4
poscutoff = len(pos_tweets) * 3 // 4

In [12]:
# Convert the cutoffs to slice bounds once and reuse them for both splits.
neg_split = int(negcutoff)
pos_split = int(poscutoff)

# Training set: leading part of each class; test set: the remainder.
# NOTE(review): the lists are split in their existing order, without shuffling.
trainfeats = neg_tweets[:neg_split] + pos_tweets[:pos_split]
testfeats = neg_tweets[neg_split:] + pos_tweets[pos_split:]

In [13]:
# Report how many labelled examples ended up in each split.
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 170125 instances, test on 56709 instances


In [14]:
# Fit a Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(trainfeats)

In [15]:
# Score the freshly trained classifier on the held-out examples, then list
# the features it considers most informative.
test_accuracy = nltk.classify.util.accuracy(classifier, testfeats)
print('accuracy:', test_accuracy)
classifier.show_most_informative_features()

accuracy: 0.7102047294080305
Most Informative Features


  contains-word(царевич) = True              pos : neg    =     78.9 : 1.0
  contains-word(шумахер) = True              neg : pos    =     35.2 : 1.0
   contains-word(погибл) = True              neg : pos    =     28.4 : 1.0
 contains-word(калашник) = True              neg : pos    =     28.4 : 1.0
  contains-word(позитив) = True              pos : neg    =     26.8 : 1.0
contains-word(соболезнован) = True              neg : pos    =     26.1 : 1.0
   contains-word(сконча) = True              neg : pos    =     22.2 : 1.0
   contains-word(сметан) = True              pos : neg    =     21.1 : 1.0
   contains-word(ублюдк) = True              neg : pos    =     19.5 : 1.0
  contains-word(пичальк) = True              neg : pos    =     19.5 : 1.0


In [19]:
# Display the first test example: a (feature dict, label) pair.
testfeats[0]

({'contains-word(плох)': True,
  'contains-word(у)': True,
  'contains-word(мен)': True,
  'contains-word(сейчас)': True,
  'contains-word(по)': True,
  'contains-word(втор)': True,
  'contains-word(круг)': True,
  'contains-word(так)': True,
  'contains-word(как)': True,
  'contains-word(не)': True,
  'contains-word(долеч)': True,
  'contains-word(в)': True,
  'contains-word(прошл)': True,
  'contains-word(раз)': True,
  'contains-word(аккуратн)': True},
 'neg')

In [20]:
import pickle

# Persist the train/test feature sets. The context managers close the files
# even if pickle.dump raises.
with open('models/tweet_model_train_features.pickle', 'wb') as model_train:
    pickle.dump(trainfeats, model_train)
with open('models/tweet_model_test_features.pickle', 'wb') as model_test:
    pickle.dump(testfeats, model_test)

# Persist the trained classifier too: 'models/tweet_model.pickle' is read back
# later in this notebook, so it has to be written on a fresh top-to-bottom run.
with open('models/tweet_model.pickle', 'wb') as model:
    pickle.dump(classifier, model)

In [25]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open('models/tweet_model.pickle', 'rb') as saved_model:
    loaded_classifier = pickle.load(saved_model)

# Evaluate and inspect the *loaded* classifier (not the in-memory one), so
# this cell really verifies what was read back from disk.
print('accuracy:', nltk.classify.util.accuracy(loaded_classifier, testfeats))
loaded_classifier.show_most_informative_features()

accuracy: 0.7102047294080305
Most Informative Features
  contains-word(царевич) = True              pos : neg    =     78.9 : 1.0
  contains-word(шумахер) = True              neg : pos    =     35.2 : 1.0
 contains-word(калашник) = True              neg : pos    =     28.4 : 1.0
   contains-word(погибл) = True              neg : pos    =     28.4 : 1.0
  contains-word(позитив) = True              pos : neg    =     26.8 : 1.0
contains-word(соболезнован) = True              neg : pos    =     26.1 : 1.0
   contains-word(сконча) = True              neg : pos    =     22.2 : 1.0
   contains-word(сметан) = True              pos : neg    =     21.1 : 1.0
   contains-word(ублюдк) = True              neg : pos    =     19.5 : 1.0
  contains-word(почемуу) = True              neg : pos    =     19.5 : 1.0
