In [122]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import scipy
import plotly.express as px

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [123]:
# sentiment = pd.read_csv('/home/jupyter/mnt/s3/hack-data/hse/sentiment.csv')

In [124]:
# banki = pd.read_csv('/home/jupyter/mnt/s3/hack-data/hse/banki.csv')

In [125]:
# bank_sentiment = pd.read_csv('/home/jupyter/mnt/s3/hack-data/hse/bank-sentiment.csv')

In [126]:
train = pd.read_csv('/home/jupyter/mnt/s3/hack-data/hse/train.csv', index_col=0)

In [127]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^а-яА-Я\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text, language='russian')
    
    # Remove stop words
    stop_words = set(stopwords.words('russian'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem tokens
    stemmer = SnowballStemmer('russian')
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join tokens back into a string
    text = ' '.join(tokens)
    
    return text

In [128]:
train['clean_text'] = train['sentence'].apply(preprocess_text)

In [129]:
train.isnull().sum()

sentence          0
1category         0
2category     18362
sentiment         0
clean_text        0
dtype: int64

In [131]:
train['sentiment'].value_counts()

−    10192
+     6262
?     2907
Name: sentiment, dtype: int64

In [112]:
labels = ['1category', '2category', 'sentiment']

In [132]:
# Sentiment classification
from sklearn.model_selection import train_test_split
X = train['clean_text']
y = train['sentiment']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [133]:
X_train

690                            успешн работ благ альфабанк
12486    сотрудник хам присутств управля попустительств...
19715    зачет операционист дельн совет респект итподра...
3890     ясн дел надоел обслужива банк так некомпетентн...
78       счита эт прор лидерств финансов услуг централь...
                               ...                        
20092                             перв век систематизирова
12570    итог счита произвед обма действ сторон банк вы...
14108    такж отдельн хоч поблагодар сотрудник банк оче...
11515    хамск сказа телефон посмотр внимательн ваш док...
21339    эт окол квартир те врем год год сбербанк реш о...
Name: clean_text, Length: 14520, dtype: object

In [134]:
y_train

690      +
12486    −
19715    +
3890     −
78       +
        ..
20092    ?
12570    −
14108    +
11515    −
21339    −
Name: sentiment, Length: 14520, dtype: object

In [146]:
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [142]:
classifier = xgb.XGBClassifier(objective='multi:softmax')

In [143]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [145]:
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [157]:
roc_auc_score(y_test, classifier.predict_proba(X_test), multi_class='ovr')

0.8481686299354066

In [169]:
preds = classifier.predict(X_test)

In [160]:
y_test.head(20)

17308    −
17426    −
7006     +
18169    −
21247    −
18033    −
8757     −
4001     −
4585     −
11221    −
7491     −
8848     −
14783    +
1511     ?
15483    ?
617      +
2230     ?
17315    −
5938     +
21440    −
Name: sentiment, dtype: object