In [1]:
import pickle

# Load the object pickled in models/tweet_model.pickle; later cells use it as a
# classifier. The context manager closes the file even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('models/tweet_model.pickle', 'rb') as saved_model:
    loaded_classifier = pickle.load(saved_model)

In [2]:
import re
from collections import Counter

import nltk.classify.util
import pandas as pd
from nltk.classify import NaiveBayesClassifier
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import TweetTokenizer

# Paths of the two labelled comment files. Each constant's name matches the
# file it points to: the *_neg file feeds negative_comments and the *_pos file
# feeds positive_comments.
NEGATIVE_COMMENTS_CSV = 'datasets/clean_comments_neg.csv'
POSITIVE_COMMENTS_CSV = 'datasets/clean_comments_pos.csv'

# Number of most frequent stems kept when the vocabulary is built.
VOCAB_SIZE = 5000

# Label of the column that holds the comment text (the files have no header
# row, so columns are labelled by position).
tweets_col_number = 6

# Read each tab-separated file and keep only the text column, as a
# single-column DataFrame.
negative_comments = pd.read_csv(
    NEGATIVE_COMMENTS_CSV, header=None, dialect='excel-tab')[[tweets_col_number]]
positive_comments = pd.read_csv(
    POSITIVE_COMMENTS_CSV, header=None, dialect='excel-tab')[[tweets_col_number]]


  parser = TextFileReader(filepath_or_buffer, **kwds)


In [3]:
# Snowball stemmer for Russian, shared by every call to get_stem.
stemer = RussianStemmer()
# Matches every character that is NOT in а-я, А-Я or a space; get_stem
# substitutes matches with '' to strip digits, Latin letters and punctuation.
# NOTE(review): 'ё' and 'Ё' lie outside the а-я / А-Я code-point ranges, so
# they are stripped too — confirm this is intended.
regex = re.compile('[^а-яА-Я ]')
# Memo of stems computed so far; filled by get_stem.
stem_cache = {}


def get_stem(token):
    """Return the Russian Snowball stem of ``token``.

    The token is first normalised: every character matched by ``regex``
    (anything other than а-я, А-Я and space) is removed and the result is
    lower-cased. The stem of the normalised form is memoised in
    ``stem_cache`` so each distinct normalised token is stemmed only once.

    Args:
        token: A single token as produced by the tokenizer.

    Returns:
        The stem as a string; ``''`` when nothing is left after normalisation.
    """
    cleaned = regex.sub('', token).lower()
    # Look up and store under the SAME key. Reading with the raw token while
    # writing under the cleaned one would make the cache miss for any token
    # that normalisation changes. A membership test (rather than truthiness)
    # lets an empty stem count as a hit.
    if cleaned not in stem_cache:
        stem_cache[cleaned] = stemer.stem(cleaned)
    return stem_cache[cleaned]


# Running tally of how many times each stem has been seen; updated in place by
# count_unique_tokens_in_tweets.
stem_count = Counter()
# NLTK tweet tokenizer used to split every comment into tokens.
tokenizer = TweetTokenizer()

In [4]:
def count_unique_tokens_in_tweets(tweets, column=6):
    """Add the stem occurrences found in ``tweets`` to the global ``stem_count``.

    Despite the name, every occurrence is counted, not only the first one:
    each token of each text increments the counter of its stem.

    Args:
        tweets: DataFrame whose ``column`` holds the texts to tokenize.
        column: Label of the text column. Defaults to 6, the label the
            notebook selects when it loads the CSV files.

    Returns:
        None. ``stem_count`` is updated in place, so calling this twice on the
        same frame counts its stems twice.
    """
    # Iterate over the text column directly instead of building a row Series
    # for every record with iterrows().
    for text in tweets[column]:
        stem_count.update(get_stem(token) for token in tokenizer.tokenize(text))


# Add the stems of the negative comments to the global stem_count.
# Re-running this cell adds the same counts again: stem_count is not reset here.
count_unique_tokens_in_tweets(negative_comments)

In [5]:
# Add the stems of the positive comments to the same global stem_count.
# Re-running this cell adds the same counts again: stem_count is not reset here.
count_unique_tokens_in_tweets(positive_comments)

In [6]:
# Keep the VOCAB_SIZE most frequent stems, most frequent first, and show the
# top hundred as a sanity check.
vocab = [stem for stem, _count in stem_count.most_common(VOCAB_SIZE)]
print(vocab[:100])

['в', 'и', 'не', 'эт', 'на', 'что', 'с', 'а', 'так', 'как', 'для', 'я', 'то', 'есл', 'по', 'но', 'котор', 'курс', 'все', 'из', 'к', 'он', 'у', 'задач', 'был', 'можн', 'вы', 'ил', 'нужн', 'бы', 'прост', 'сам', 'решен', 'за', 'будет', 'ест', 'от', 'же', 'мы', 'тольк', 'ответ', 'дан', 'задан', 'при', 'числ', 'строк', 'очен', 'одн', 'функц', 'может', 'уж', 'перв', 'значен', 'код', 'чтоб', 'над', 'друг', 'ещ', 'больш', 'сво', 'мен', 'вот', 'пот', 'о', 'ег', 'нет', 'раз', 'быт', 'работа', 'мне', 'получ', 'вопрос', 'пример', 'без', 'тест', 'посл', 'поня', 'перемен', 'реш', 'тут', 'когд', 'до', 'правильн', 'чем', 'услов', 'сдела', 'случа', 'спасиб', 'дела', 'тем', 'программ', 'кажд', 'их', 'ваш', 'вам', 'должн', 'хот', 'метод', 'про', 'поч']


In [7]:
def comment_to_feachure(tweet, show_unknowns=False):
    """Convert a text into an NLTK-style feature dictionary.

    Each token is stemmed with ``get_stem``; every non-empty stem ``s`` yields
    the entry ``'contains-word(s)': True``. Tokens whose stem is empty are
    skipped and, when ``show_unknowns`` is true, reported on stdout.

    Args:
        tweet: The text to convert.
        show_unknowns: Print each token that produced an empty stem.

    Returns:
        dict mapping ``'contains-word(<stem>)'`` to ``True``.
    """
    features = {}
    for raw_token in tokenizer.tokenize(tweet):
        stemmed = get_stem(raw_token)
        if not stemmed:
            # Nothing usable left after stemming: optionally report and move on.
            if show_unknowns:
                print("Unknown token: {}".format(raw_token))
            continue
        features['contains-word(%s)' % stemmed] = True
    return features


# Sanity check on one example: positional row 1 of the negative set, column
# label 6 (the same value as tweets_col_number).
tweet = negative_comments.iloc[1][6]
print("comment: {}".format(tweet))
print("comment_to_feachure: {}".format(comment_to_feachure(tweet)))
# Entry at index 5 of the frequency-ranked vocabulary, printed as a spot check.
print(vocab[5])

comment: финансовые ресурсы на проект
comment_to_feachure: {'contains-word(финансов)': True, 'contains-word(ресурс)': True, 'contains-word(на)': True, 'contains-word(проект)': True}
что


In [8]:
# Label of the column that holds the comment text (same value as
# tweets_col_number). Kept under its original name for any later cell.
pos = 6

# (feature dict, label) pairs for each class, in file order. Iterating over
# the text column avoids building a row Series per record, and the
# comprehensions do not overwrite the notebook-level `tweet` variable.
neg_comments = [(comment_to_feachure(text), 'neg') for text in negative_comments[pos]]
pos_comments = [(comment_to_feachure(text), 'pos') for text in positive_comments[pos]]

# Raw comment texts: all negatives first, then all positives.
comments = list(negative_comments[pos]) + list(positive_comments[pos])

In [9]:
# Cut-off position for a 3/4 train, 1/4 test split of each class.
# `/` is true division, so both values are floats; they are truncated with
# int() where they are used as slice bounds.
negcutoff = len(neg_comments) * 3 / 4
poscutoff = len(pos_comments) * 3 / 4

In [10]:
# The first three quarters of each class go to training, the rest to testing.
# NOTE(review): the lists are sliced in file order without shuffling — confirm
# the CSV rows are not ordered in a way that biases the split.
trainfeats = neg_comments[:int(negcutoff)] + pos_comments[:int(poscutoff)]
testfeats = neg_comments[int(negcutoff):] + pos_comments[int(poscutoff):]

In [11]:
# Report how many labelled examples ended up in each split.
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 13953 instances, test on 4651 instances


In [12]:
# Train NLTK's Naive Bayes classifier on the training (features, label) pairs.
comments_classifier = NaiveBayesClassifier.train(trainfeats)

In [13]:
# Accuracy on the held-out split, then the classifier's own report of its most
# informative features.
print('accuracy:', nltk.classify.util.accuracy(comments_classifier, testfeats))
comments_classifier.show_most_informative_features()

accuracy: 0.8750806278219738
Most Informative Features


contains-word(медиаграмотн) = True              neg : pos    =    116.5 : 1.0
   contains-word(послан) = True              neg : pos    =     90.6 : 1.0
contains-word(журналистик) = True              neg : pos    =     90.6 : 1.0
    contains-word(вернм) = True              neg : pos    =     90.6 : 1.0
     contains-word(нест) = True              neg : pos    =     74.1 : 1.0
      contains-word(тих) = True              neg : pos    =     56.7 : 1.0
     contains-word(фигн) = True              neg : pos    =     45.7 : 1.0
    contains-word(мужик) = True              neg : pos    =     39.7 : 1.0
   contains-word(диалог) = True              neg : pos    =     35.4 : 1.0
      contains-word(мим) = True              neg : pos    =     34.0 : 1.0


In [14]:
from nltk.classify import SklearnClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision tree wrapped in NLTK's scikit-learn adapter. The tree permutes
# features randomly at each split, so a fixed random_state is needed for the
# reported accuracy to be reproducible from one run to the next.
dts_classifier = SklearnClassifier(DecisionTreeClassifier(random_state=0)).train(trainfeats)

print('accuracy:', nltk.classify.util.accuracy(dts_classifier, testfeats))

accuracy: 0.8176736185766502


In [15]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default parameters, wrapped in
# NLTK's adapter and trained on the same features as the other models.
svm_classifier = SklearnClassifier(SVC()).train(trainfeats)


# Accuracy on the held-out comment split.
print('accuracy:', nltk.classify.util.accuracy(svm_classifier, testfeats))



accuracy: 0.8976564179746291


In [16]:
# Evaluate the classifier unpickled from models/tweet_model.pickle on the
# comment test split, then print its most informative features.
print('accuracy:', nltk.classify.util.accuracy(loaded_classifier, testfeats))
loaded_classifier.show_most_informative_features()

accuracy: 0.67598365942808
Most Informative Features
  contains-word(царевич) = True              pos : neg    =     78.9 : 1.0
  contains-word(шумахер) = True              neg : pos    =     35.2 : 1.0
 contains-word(калашник) = True              neg : pos    =     28.4 : 1.0
   contains-word(погибл) = True              neg : pos    =     28.4 : 1.0
  contains-word(позитив) = True              pos : neg    =     26.8 : 1.0
contains-word(соболезнован) = True              neg : pos    =     26.1 : 1.0
   contains-word(сконча) = True              neg : pos    =     22.2 : 1.0
   contains-word(сметан) = True              pos : neg    =     21.1 : 1.0
   contains-word(ублюдк) = True              neg : pos    =     19.5 : 1.0
  contains-word(почемуу) = True              neg : pos    =     19.5 : 1.0


In [17]:
# NOTE(review): this cell repeats the previous cell verbatim and produces the
# same output; it looks like a leftover and could be removed.
print('accuracy:', nltk.classify.util.accuracy(loaded_classifier, testfeats))
loaded_classifier.show_most_informative_features()

accuracy: 0.67598365942808
Most Informative Features
  contains-word(царевич) = True              pos : neg    =     78.9 : 1.0
  contains-word(шумахер) = True              neg : pos    =     35.2 : 1.0
 contains-word(калашник) = True              neg : pos    =     28.4 : 1.0
   contains-word(погибл) = True              neg : pos    =     28.4 : 1.0
  contains-word(позитив) = True              pos : neg    =     26.8 : 1.0
contains-word(соболезнован) = True              neg : pos    =     26.1 : 1.0
   contains-word(сконча) = True              neg : pos    =     22.2 : 1.0
   contains-word(сметан) = True              pos : neg    =     21.1 : 1.0
   contains-word(ублюдк) = True              neg : pos    =     19.5 : 1.0
  contains-word(почемуу) = True              neg : pos    =     19.5 : 1.0


In [18]:
# Persist the Naive Bayes classifier trained on the comments. The context
# manager closes (and flushes) the file even if pickling raises.
with open('models/comment_model.pickle', 'wb') as model:
    pickle.dump(comments_classifier, model)

In [19]:
# Inspect one labelled example: a (feature dict, label) pair.
neg_comments[1]

({'contains-word(финансов)': True,
  'contains-word(ресурс)': True,
  'contains-word(на)': True,
  'contains-word(проект)': True},
 'neg')

In [20]:
# Spot check: classify a single-word comment with the freshly trained model.
comments_classifier.classify(comment_to_feachure('хорошо'))

'pos'

In [21]:
# Persist the tokenizer instance. The context manager closes (and flushes) the
# file even if pickling raises.
with open('models/tokenizer_comment.pickle', 'wb') as tokenizer_model:
    pickle.dump(tokenizer, tokenizer_model)

In [22]:
# Persist the stem memo built by get_stem. The context manager closes (and
# flushes) the file even if pickling raises.
with open('models/stem_cache_comment.pickle', 'wb') as stem_model:
    pickle.dump(stem_cache, stem_model)

In [23]:
# Persist the stem frequency counter. The context manager closes (and flushes)
# the file even if pickling raises.
with open('models/stem_count_comment.pickle', 'wb') as stem_count_model:
    pickle.dump(stem_count, stem_count_model)


In [24]:
# Load the pickled test features stored alongside the tweet model and score
# every classifier on them. The context manager closes the file even if
# unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('models/tweet_model_test_features.pickle', 'rb') as model_tweet_test:
    test_tweet = pickle.load(model_tweet_test)

print('accuracy tweet NaiveBayes:', nltk.classify.util.accuracy(loaded_classifier, test_tweet))
print('accuracy Stepic NaiveBayes:', nltk.classify.util.accuracy(comments_classifier, test_tweet))
print('accuracy svm:', nltk.classify.util.accuracy(svm_classifier, test_tweet))
print('accuracy dts:', nltk.classify.util.accuracy(dts_classifier, test_tweet))


accuracy tweet NaiveBayes: 0.7102047294080305


accuracy Stepic NaiveBayes: 0.5124936077165883


accuracy svm: 0.5065862561498174


accuracy dts: 0.5019485443227706
