# **1. Загрузка датасета в библиотеку Pandas.**



In [1]:
import pandas as pd
import numpy as np

In [2]:
neg_tweets = pd.read_csv("/content/drive/MyDrive/negative.csv")

In [3]:
pos_tweets = pd.read_csv("/content/drive/MyDrive/positive.csv")

In [4]:
pos_tweets = pos_tweets[['tweet', 'class']]
neg_tweets = neg_tweets[['tweet', 'class']]
print(f"Positive tweets shape: {pos_tweets.shape}")
print(f"Negative tweets shape: {neg_tweets.shape}")

Positive tweets shape: (114910, 2)
Negative tweets shape: (111922, 2)


# **2. Лемматизация и удаление стоп-слов.**


   Лемматизация - процесс приведения словоформы к лемме — её нормальной (словарной) форме. Алгоритмы лемматизации на языке python реализованы, например, в библиотеке NLTK.

In [5]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m984.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=940cf0bbb435cfd3a43b9d8990f092dd12f8619c72b0b1d21b108fdd863368d8
  Stored in directory: /root

In [7]:
import re

from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords

patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
stopwords_ru = stopwords.words('russian')
morph = MorphAnalyzer()

def lemmatize(doc: pd.DataFrame):
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            tokens.append(token)

    if len(tokens) > 2:
        return tokens
    return None

In [8]:
pos_data = pos_tweets.iloc[:, 0]
neg_data = neg_tweets.iloc[:, 0]
pos_data = pos_data.apply(lemmatize)
neg_data = neg_data.apply(lemmatize)

In [9]:
pos_data = pos_data.to_frame()
neg_data = neg_data.to_frame()

In [10]:
neg_data = neg_data.assign(cl = 0)
neg_data

Unnamed: 0,tweet,cl
0,"[коллега, сидеть, рубиться, долбать, винд, мочь]",0
1,"[говорить, обещаной, год, ждать]",0
2,"[желать, хороший, полёт, удачный, посадка, быт...",0
3,"[обновить, какой, леший, работать, простоплеер]",0
4,"[котёнок, вчера, носик, разбить, плакать, расс...",0
...,...,...
111917,"[но, каждый, хотеть, исправлять]",0
111918,"[скучать, вправлять, мозг, равно, скучать]",0
111919,"[вот, школа, говно, это, идти]",0
111920,"[тауриэль, грусть, обнять]",0


In [11]:
pos_data = pos_data.assign(cl = 1)
pos_data

Unnamed: 0,tweet,cl
0,"[да, таки, немного, похожий, но, мальчик, равно]",1
1,"[ну, идиотка, испугаться]",1
2,"[кто, угол, сидеть, погибать, голод, ещё, порц...",1
3,"[вот, значит, страшилка, но, блин, посмотреть,...",1
4,"[любить, я, знать, бля]",1
...,...,...
114905,"[спасть, родительский, дом, свой, кровать, про...",1
114906,"[эх, мы, немного, решить, сократить, путь, леж...",1
114907,"[что, происходить, я, эфир, звучать, любимый, ...",1
114908,"[любимый, подарить, ты, звезда, имя, звезда, п...",1


In [12]:
pos_data = pos_data.dropna(ignore_index=True)
pos_data

Unnamed: 0,tweet,cl
0,"[да, таки, немного, похожий, но, мальчик, равно]",1
1,"[ну, идиотка, испугаться]",1
2,"[кто, угол, сидеть, погибать, голод, ещё, порц...",1
3,"[вот, значит, страшилка, но, блин, посмотреть,...",1
4,"[любить, я, знать, бля]",1
...,...,...
109452,"[спасть, родительский, дом, свой, кровать, про...",1
109453,"[эх, мы, немного, решить, сократить, путь, леж...",1
109454,"[что, происходить, я, эфир, звучать, любимый, ...",1
109455,"[любимый, подарить, ты, звезда, имя, звезда, п...",1


In [13]:
neg_data = neg_data.dropna(ignore_index=True)
neg_data

Unnamed: 0,tweet,cl
0,"[коллега, сидеть, рубиться, долбать, винд, мочь]",0
1,"[говорить, обещаной, год, ждать]",0
2,"[желать, хороший, полёт, удачный, посадка, быт...",0
3,"[обновить, какой, леший, работать, простоплеер]",0
4,"[котёнок, вчера, носик, разбить, плакать, расс...",0
...,...,...
104718,"[но, каждый, хотеть, исправлять]",0
104719,"[скучать, вправлять, мозг, равно, скучать]",0
104720,"[вот, школа, говно, это, идти]",0
104721,"[тауриэль, грусть, обнять]",0


In [14]:
result_df = pd.concat([pos_data, neg_data])
result_df

Unnamed: 0,tweet,cl
0,"[да, таки, немного, похожий, но, мальчик, равно]",1
1,"[ну, идиотка, испугаться]",1
2,"[кто, угол, сидеть, погибать, голод, ещё, порц...",1
3,"[вот, значит, страшилка, но, блин, посмотреть,...",1
4,"[любить, я, знать, бля]",1
...,...,...
104718,"[но, каждый, хотеть, исправлять]",0
104719,"[скучать, вправлять, мозг, равно, скучать]",0
104720,"[вот, школа, говно, это, идти]",0
104721,"[тауриэль, грусть, обнять]",0


In [15]:
result_df = result_df.sample(frac=1).reset_index(drop=True)
result_df

Unnamed: 0,tweet,cl
0,"[новый, год, сугроб, салат, оливье, си, ю, ин,...",1
1,"[закрывать, быть, твой, запись, ахахи]",1
2,"[сегодня, учитель, география, сказать, гулять,...",1
3,"[вроде, ничего, такой, делать, страшно]",0
4,"[физр, делать, маша, причёска, первый, получит...",1
...,...,...
214175,"[я, секунда, прийти, смс, интересно, это, набл...",1
214176,"[большивик, каток, мочь, подруга, угомониться]",1
214177,"[я, казаться, понять, испраить, мой, заикание]",1
214178,"[весь, удачный, день, сильно, баловаться, уйти...",1


# **3. Разбиение на тренировочную и тестовую выборки.**

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
x_train, x_test, y_train, y_test = train_test_split(result_df['tweet'], result_df['cl'], random_state=42, test_size=0.2)

In [18]:
x_train = [str(element) for element in x_train]

In [19]:
x_test = [str(element) for element in x_test]

# **4. CountVectorizer**

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

**1. Униграммы**

In [21]:
vectorizer_unigram = CountVectorizer(lowercase=False)
x_train_unigram_cv = vectorizer_unigram.fit_transform(x_train)
x_test_unigram_cv = vectorizer_unigram.transform(x_test)

In [28]:
unigram_lr = LogisticRegression(random_state=42)
unigram_lr.fit(x_train_unigram_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
y_pred_unigram = unigram_lr.predict(x_test_unigram_cv)

In [30]:
print(classification_report(y_test, y_pred_unigram))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72     20704
           1       0.73      0.74      0.74     22132

    accuracy                           0.73     42836
   macro avg       0.73      0.73      0.73     42836
weighted avg       0.73      0.73      0.73     42836



**2. Биграммы**

In [31]:
vectorizer_bigram = CountVectorizer(lowercase=False, ngram_range=(2,2))
x_train_bigram_cv = vectorizer_bigram.fit_transform(x_train)
x_test_bigram_cv = vectorizer_bigram.transform(x_test)

In [32]:
bigram_lr = LogisticRegression(random_state=42)

In [33]:
bigram_lr.fit(x_train_bigram_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
y_pred_bigram = bigram_lr.predict(x_test_bigram_cv)

In [35]:
print(classification_report(y_test, y_pred_bigram))

              precision    recall  f1-score   support

           0       0.69      0.58      0.63     20704
           1       0.65      0.75      0.70     22132

    accuracy                           0.67     42836
   macro avg       0.67      0.66      0.66     42836
weighted avg       0.67      0.67      0.66     42836



**3. Триграммы**

In [36]:
vectorizer_trigram = CountVectorizer(lowercase=False, ngram_range=(3,3))
x_train_trigram_cv = vectorizer_bigram.fit_transform(x_train)
x_test_trigram_cv = vectorizer_bigram.transform(x_test)

In [37]:
trigram_lr = LogisticRegression(random_state=42)

In [38]:
trigram_lr.fit(x_train_trigram_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
y_pred_trigram = trigram_lr.predict(x_test_trigram_cv)

In [40]:
print(classification_report(y_test, y_pred_trigram))

              precision    recall  f1-score   support

           0       0.69      0.58      0.63     20704
           1       0.65      0.75      0.70     22132

    accuracy                           0.67     42836
   macro avg       0.67      0.66      0.66     42836
weighted avg       0.67      0.67      0.66     42836



# **5. TfidfVectorizer**

$$
TFIDF(t,d) = TF(t,d) * IDF(t)
$$

$$
IDF(t) = {ln(\frac{n}{DF(t)}) + 1}
$$


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer



**1.   Униграммы**



In [42]:
tfidf_vectorizer_unigram = TfidfVectorizer(lowercase=False, ngram_range=(1,1))
x_train_unigram_tfidf = tfidf_vectorizer_unigram.fit_transform(x_train)
x_test_unigram_tfidf = tfidf_vectorizer_unigram.transform(x_test)

In [43]:
tfidf_unigram_lr = LogisticRegression(random_state=42)
tfidf_unigram_lr.fit(x_train_unigram_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
y_pred_unigram_tfidf = tfidf_unigram_lr.predict(x_test_unigram_tfidf)

In [45]:
print(classification_report(y_test, y_pred_unigram_tfidf))

              precision    recall  f1-score   support

           0       0.73      0.70      0.71     20704
           1       0.73      0.76      0.74     22132

    accuracy                           0.73     42836
   macro avg       0.73      0.73      0.73     42836
weighted avg       0.73      0.73      0.73     42836



**2. Биграммы**

In [46]:
tfidf_vectorizer_bigram = TfidfVectorizer(lowercase=False, ngram_range=(2,2))
x_train_bigram_tfidf = tfidf_vectorizer_bigram.fit_transform(x_train)
x_test_bigram_tfidf = tfidf_vectorizer_bigram.transform(x_test)

In [47]:
tfidf_bigram_lr = LogisticRegression(random_state=42)
tfidf_bigram_lr.fit(x_train_bigram_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
y_pred_bigram_tfidf = tfidf_bigram_lr.predict(x_test_bigram_tfidf)

In [49]:
print(classification_report(y_test, y_pred_bigram_tfidf))

              precision    recall  f1-score   support

           0       0.69      0.57      0.62     20704
           1       0.65      0.76      0.70     22132

    accuracy                           0.67     42836
   macro avg       0.67      0.67      0.66     42836
weighted avg       0.67      0.67      0.67     42836





**3. Триграммы**

In [50]:
tfidf_vectorizer_trigram = TfidfVectorizer(lowercase=False, ngram_range=(3,3))
x_train_trigram_tfidf = tfidf_vectorizer_trigram.fit_transform(x_train)
x_test_trigram_tfidf = tfidf_vectorizer_trigram.transform(x_test)

In [51]:
tfidf_trigram_lr = LogisticRegression(random_state=42)
tfidf_trigram_lr.fit(x_train_trigram_tfidf, y_train)

In [52]:
y_pred_trigram_tfidf = tfidf_trigram_lr.predict(x_test_trigram_tfidf)

In [53]:
print(classification_report(y_test, y_pred_trigram_tfidf))

              precision    recall  f1-score   support

           0       0.80      0.21      0.33     20704
           1       0.56      0.95      0.71     22132

    accuracy                           0.59     42836
   macro avg       0.68      0.58      0.52     42836
weighted avg       0.68      0.59      0.52     42836



# **6. Результаты**

In [54]:
cv_results = dict()

cv_results['unigram'] = 0.73
cv_results['bigram'] =  0.66
cv_results['trigram'] = 0.66

In [55]:
tfidf_results = dict()

tfidf_results['unigram'] = 0.73
tfidf_results['bigram'] = 0.66
tfidf_results['trigram'] = 0.51

In [56]:
average_tfidf = sum(tfidf_results.values())/len(tfidf_results)
average_tfidf

0.6333333333333334

In [57]:
average_cv = sum(cv_results.values())/len(cv_results.values())
average_cv

0.6833333333333335