In [1]:
import pandas as pd
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import emoji
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from pymystem3 import Mystem

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/maxd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/maxd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train_pd = pd.read_csv('data/rusentitweet_train.csv')
test_pd = pd.read_csv('data/rusentitweet_test.csv')

In [3]:
options = ['positive', 'negative']
train_pd_f = train_pd[train_pd['label'].isin(options)]
test_pd_f = test_pd[test_pd['label'].isin(options)]

In [4]:
def filter_regex():
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    emoji_pattern = ''.join(re.escape(u) for u in emojis)
    pattern = '[^\w\s!?' + emoji_pattern + ']'
    return re.compile(pattern)

def preprocess(text):
    # удаление тегов
    text = re.sub(r"@\w+", "", text)

    # удаление ссылок
    text = re.sub(r"(https|http):\/\/\S+", "", text)

    # удаление всего, кроме слов, эмоджи и знаков '!', '?'
    filter = filter_regex()
    text = filter.sub(repl='', string=text)

    text = text.lower()
    text = text.replace("ё", "e")
    return text

In [5]:
train_pd_f['text'] = train_pd_f['text'].apply(preprocess)
test_pd_f['text'] = test_pd_f['text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_pd_f['text'] = train_pd_f['text'].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_pd_f['text'] = test_pd_f['text'].apply(preprocess)


In [6]:
def stemm(text):
    stemmer = SnowballStemmer("russian")
    stop_words = stopwords.words("russian")
    emojies = emoji.distinct_emoji_list(text)
    text = emoji.replace_emoji(text)
    tokens = word_tokenize(text)
    stemmed_tokens = []
    for token in tokens:
        if token not in stop_words:
            stemmed_tokens.append(stemmer.stem(token))
    stemmed_tokens += emojies
    return ' '.join(stemmed_tokens)

In [7]:
stemmed_train = train_pd_f.copy()
stemmed_train['text'] = stemmed_train['text'].apply(stemm)

stemmend_test = test_pd_f.copy()
stemmend_test['text'] = stemmend_test['text'].apply(stemm)

In [8]:
vectorizer = CountVectorizer()
x_train_stemmed = vectorizer.fit_transform(stemmed_train['text'])
x_test_stemmed = vectorizer.transform(stemmend_test['text'])

In [9]:
tfidf_transformer = TfidfTransformer()
x_train_stemmend_tfidf = tfidf_transformer.fit_transform(x_train_stemmed)
x_test_stemmend_tfidf = tfidf_transformer.transform(x_test_stemmed)

In [10]:
label_mapper = {'positive' : 1, 'negative' : 0}

y_train = train_pd_f['label'].map(label_mapper)
y_test = test_pd_f['label'].map(label_mapper)

In [11]:
lr_stemmed = LogisticRegression()
lr_stemmed.fit(x_train_stemmend_tfidf, y_train)
y_predicted_lr_stemmed = lr_stemmed.predict(x_test_stemmend_tfidf)

In [12]:
forest_stemmed = RandomForestClassifier()
forest_stemmed.fit(x_train_stemmend_tfidf, y_train)
y_predicted_forest_stemmed = forest_stemmed.predict(x_test_stemmend_tfidf)

In [13]:
words = vectorizer.get_feature_names_out()

In [14]:
print('Логистическая регрессия:')
print(classification_report(y_test, y_predicted_lr_stemmed))
print('ROC-AUC:', roc_auc_score(y_test, y_predicted_lr_stemmed))
print()

coefs = lr_stemmed.coef_[0]
indexes_lr = coefs.argsort()

top_bad_words_lr = []
for i in indexes_lr[:5]:
    top_bad_words_lr.append((words[i], coefs[i]))
    

top_good_words_lr = []
for i in indexes_lr[-5:]:
    top_bad_words_lr.append((words[i], coefs[i]))

print("Пять слов с набольшим позитивным окрасом:")
print(top_good_words_lr)
print("Пять слов с набольшим негативным окрасом:")
print(top_bad_words_lr[::-1])

Логистическая регрессия:
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       660
           1       0.84      0.53      0.65       483

    accuracy                           0.76      1143
   macro avg       0.78      0.73      0.73      1143
weighted avg       0.77      0.76      0.74      1143

ROC-AUC: 0.726853943158291

Пять слов с набольшим позитивным окрасом:
[]
Пять слов с набольшим негативным окрасом:
[('любл', 3.522681141616364), ('красив', 3.462515274065095), ('прекрасн', 2.802872553173106), ('мил', 2.791670074398225), ('лучш', 2.7767779187864705), ('вообщ', -1.9195604110562385), ('сук', -2.311246049541277), ('нах', -2.4755237119964275), ('пиздец', -2.7423784209095072), ('блят', -3.3267499784489982)]


In [15]:
print('Случайный лес:')
print(classification_report(y_test, y_predicted_forest_stemmed))
print('ROC-AUC:', roc_auc_score(y_test, y_predicted_forest_stemmed))
print()

coefs = forest_stemmed.feature_importances_
indexes_lr = coefs.argsort()

top_words_forest = []
for i in indexes_lr[-10:]:
    top_words_forest.append(words[i])

print("Десять слов с набольшим окрасом:")
print(top_words_forest)

Случайный лес:
              precision    recall  f1-score   support

           0       0.78      0.71      0.74       660
           1       0.65      0.72      0.68       483

    accuracy                           0.71      1143
   macro avg       0.71      0.72      0.71      1143
weighted avg       0.72      0.71      0.72      1143

ROC-AUC: 0.715829098437794

Десять слов с набольшим окрасом:
['пиздец', 'классн', 'мил', 'крут', 'эт', 'прекрасн', 'лучш', 'блят', 'красив', 'любл']


In [16]:
mystem = Mystem()
def lemmatize(text):
    lemmas = mystem.lemmatize(text)
    stop_words = stopwords.words("russian")
    lemmas = [lemma for lemma in lemmas if lemma not in stop_words and lemma.isalnum() or emoji.is_emoji(lemma) or lemma == '!' or lemma == '?']
    lemmatized_text = ' '.join(lemmas)

    return lemmatized_text

In [17]:
lemmatized_train = train_pd_f.copy()
lemmatized_train['text'] = lemmatized_train['text'].apply(lemmatize)

lemmatized_test = test_pd_f.copy()
lemmatized_test['text'] = lemmatized_test['text'].apply(lemmatize)

In [18]:
vectorizer = CountVectorizer()
x_train_lemmatized = vectorizer.fit_transform(lemmatized_train['text'])
x_test_lemmatized = vectorizer.transform(lemmatized_test['text'])

In [19]:
tfidf_transformer = TfidfTransformer()
x_train_lemmatized_tfidf = tfidf_transformer.fit_transform(x_train_lemmatized)
x_test_lemmatized_tfidf = tfidf_transformer.transform(x_test_lemmatized)

In [20]:
lr_lemmatized = LogisticRegression()
lr_lemmatized.fit(x_train_lemmatized_tfidf, y_train)
y_predicted_lr_lemmatized = lr_lemmatized.predict(x_test_lemmatized_tfidf)

In [21]:
forest_lemmatized = RandomForestClassifier()
forest_lemmatized.fit(x_train_lemmatized_tfidf, y_train)
y_predicted_forest_lemmatize = forest_lemmatized.predict(x_test_lemmatized_tfidf)

In [22]:
words = vectorizer.get_feature_names_out()

In [23]:
print('Логистическая регрессия:')
print(classification_report(y_test, y_predicted_lr_lemmatized))
print('ROC-AUC:', roc_auc_score(y_test, y_predicted_lr_lemmatized))
print()

coefs = lr_lemmatized.coef_[0]
indexes_lr = coefs.argsort()

top_bad_words_lr = []
for i in indexes_lr[:5]:
    top_bad_words_lr.append(words[i])

top_good_words_lr = []
for i in indexes_lr[-5:]:
    top_good_words_lr.append(words[i])

print("Пять слов с набольшим позитивным окрасом:")
print(top_good_words_lr)
print("Пять слов с набольшим негативным окрасом:")
print(top_bad_words_lr[::-1])

Логистическая регрессия:
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       660
           1       0.84      0.55      0.66       483

    accuracy                           0.76      1143
   macro avg       0.79      0.74      0.74      1143
weighted avg       0.78      0.76      0.75      1143

ROC-AUC: 0.7356907585168455

Пять слов с набольшим позитивным окрасом:
['милый', 'прекрасный', 'красивый', 'хороший', 'любить']
Пять слов с набольшим негативным окрасом:
['сука', 'умирать', 'нахуй', 'пиздец', 'блять']


In [24]:
print('Случайный лес:')
print(classification_report(y_test, y_predicted_forest_lemmatize))
print('ROC-AUC:', roc_auc_score(y_test, y_predicted_forest_lemmatize))
print()

coefs = forest_lemmatized.feature_importances_
indexes_lr = coefs.argsort()

top_words_forest = []
for i in indexes_lr[-10:]:
    top_words_forest.append(words[i])

print("Десять слов с набольшим окрасом:")
print(top_words_forest)

Случайный лес:
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       660
           1       0.65      0.75      0.69       483

    accuracy                           0.72      1143
   macro avg       0.72      0.73      0.72      1143
weighted avg       0.73      0.72      0.72      1143

ROC-AUC: 0.7257011104837191

Десять слов с набольшим окрасом:
['прекрасный', 'вау', 'любовь', 'нравиться', 'пиздец', 'это', 'красивый', 'блять', 'хороший', 'любить']
