# Imports

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import random

import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score, f1_score, auc, accuracy_score

#!pip install catboost
from catboost import CatBoostClassifier


SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Functions

In [2]:
def accuracy_by_label(y_true, y_pred, full_accuracy = True):
    
    if full_accuracy:
        print(f'Full accuracy: {accuracy_score(y_true, y_pred)}')
    
    labels = np.unique(y_true)
    
    for label in labels:
        label_indices = (y_true == label).nonzero()[0]
        y_true_label = y_true[label_indices]
        y_pred_label = y_pred[label_indices]
        print(f'Label "{label}" \t| Accuracy: {accuracy_score(y_true_label, y_pred_label)}')

# Data loading

In [3]:
russian_news_sentiment_path = '../data/external/russian_news_sentiment.json'
news_with_target_path = '../data/processed/news_with_target.csv'

In [4]:
# Внешние данные
russian_news_sentiment = pd.read_json(russian_news_sentiment_path)
russian_news_sentiment['sentiment'] = russian_news_sentiment['sentiment'].map({'negative': -1, 'neutral': 0, 'positive': 1}) 
russian_news_sentiment = russian_news_sentiment[['text','sentiment']]
russian_news_sentiment.head()

Unnamed: 0,text,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,-1
1,Медики рассказали о состоянии пострадавшего му...,-1
2,"Прошел почти год, как железнодорожным оператор...",-1
3,По итогам 12 месяцев 2016 года на территории р...,-1
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,-1


In [5]:
# Размеченные данные
news_with_target = pd.read_csv(news_with_target_path)
news_with_target = news_with_target[['content','target']]
news_with_target.columns = ['text','sentiment']
news_with_target = news_with_target.dropna()
news_with_target.head()

Unnamed: 0,text,sentiment
0,​Росалкогольрегулирование (РАР) направило «Янд...,0
1,"В пресс-службе «Яндекса» заявили РБК, что дата...",0
2,"Во вторник, 23 июля, группа сенаторов во главе...",0
3,Совет директоров Yandex N.V. (головная компани...,1
4,"Аналитики компании Group-IB, специализирующейс...",-1


In [6]:
# Объединение датасетов в один
text_sentiment_df = pd.concat((russian_news_sentiment, news_with_target))
text_sentiment_df = text_sentiment_df[text_sentiment_df['sentiment'].isin([-1,1])]
text_sentiment_df.head()

Unnamed: 0,text,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,-1
1,Медики рассказали о состоянии пострадавшего му...,-1
2,"Прошел почти год, как железнодорожным оператор...",-1
3,По итогам 12 месяцев 2016 года на территории р...,-1
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,-1


# Datasets for models

In [7]:
X = text_sentiment_df['text'].to_numpy()
y = text_sentiment_df['sentiment'].to_numpy()

# Выделяем 80% на train и 20% на проверку (10% на test и 10% на validation)
X_train, X_check, y_train, y_check = train_test_split(X, y, test_size=0.2, random_state=SEED)

# 10% на test и 10% на validation (по 50% от 20% В X_check, y_check)
X_val, X_test, y_val, y_test = train_test_split(X_check, y_check, test_size=0.1, random_state=SEED)


print(f'X_train {X_train.shape} | y_train {y_train.shape}')
print(f'X_val {X_val.shape} | y_val {y_val.shape}')
print(f'X_test {X_test.shape} | y_test {y_test.shape}')

X_train (3664,) | y_train (3664,)
X_val (825,) | y_val (825,)
X_test (92,) | y_test (92,)


# Features

### Count Vectorizer

In [8]:
count_vectorizer = CountVectorizer(max_features=1000, min_df=5)

X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count = count_vectorizer.transform(X_val)
X_test_count = count_vectorizer.transform(X_test)

### TF-IDF Vectorizer

In [9]:
tf_idf_vectorizer = TfidfVectorizer(max_features=1000, min_df=5)

X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_val_tf_idf = tf_idf_vectorizer.transform(X_val)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

# Models

## Logistic Regression

### (Count Vectorizer)

In [10]:
logit_count = LogisticRegression(random_state=SEED, max_iter = 200)

logit_count_grid_searcher = GridSearchCV(estimator=logit_count, 
                                   param_grid={
                                       'C': np.linspace(1,30,20),
                                       'solver':['liblinear','lbfgs']
                                   },
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=5,
                                   verbose=1
                                  )

logit_count_grid_searcher.fit(X_train_count, y_train)
print('Accuracy: ', logit_count_grid_searcher.best_score_)
print('Best_params: ', logit_count_grid_searcher.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   37.1s finished


Accuracy:  0.841430903764006
Best_params:  {'C': 1.0, 'solver': 'liblinear'}


In [11]:
logit_count_prediction_val = logit_count_grid_searcher.predict(X_val_count)
accuracy_by_label(y_val, logit_count_prediction_val)

Full accuracy: 0.833939393939394
Label "-1" 	| Accuracy: 0.7395833333333334
Label "1" 	| Accuracy: 0.8845437616387337


### (TF-IDF Vectorizer)

In [12]:
logit_tfidf = LogisticRegression(random_state=SEED, max_iter = 100)

logit_tfidf_grid_searcher = GridSearchCV(estimator=logit_tfidf, 
                                   param_grid={
                                       'C': np.linspace(1,30,20),
                                       'solver':['liblinear','lbfgs']
                                   },
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=5,
                                   verbose=1
                                  )

logit_tfidf_grid_searcher.fit(X_train_tf_idf, y_train)
print('Accuracy: ', logit_tfidf_grid_searcher.best_score_)
print('Best_params: ', logit_tfidf_grid_searcher.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 185 out of 200 | elapsed:    6.9s remaining:    0.5s


Accuracy:  0.8662700631434557
Best_params:  {'C': 2.526315789473684, 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.2s finished


In [13]:
logit_tfidf_prediction_val = logit_tfidf_grid_searcher.predict(X_val_tf_idf)
accuracy_by_label(y_val, logit_tfidf_prediction_val)

Full accuracy: 0.8557575757575757
Label "-1" 	| Accuracy: 0.7118055555555556
Label "1" 	| Accuracy: 0.9329608938547486


## CatBoostClassifier

### (Count Vectorizer)

In [14]:
cbc_count = CatBoostClassifier(verbose=False)
cbc_count.fit(X_train_count, y_train)

<catboost.core.CatBoostClassifier at 0x13715608850>

In [15]:
cbc_count_prediction_val = cbc_count.predict(X_val_count)
accuracy_by_label(y_val, cbc_count_prediction_val)

Full accuracy: 0.8581818181818182
Label "-1" 	| Accuracy: 0.71875
Label "1" 	| Accuracy: 0.9329608938547486
