In [310]:
#!c1.32
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import scipy
import plotly.express as px

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [311]:
#!c1.32
import warnings
warnings.filterwarnings("ignore")

In [207]:
train = pd.read_csv('/home/jupyter/mnt/s3/hack-data/hse/train.csv')

In [208]:
train = train.drop([train.columns[0]], axis=1)

In [210]:
#!c1.32
# %pip install razdel
import re
import pymorphy2
import razdel
from nltk.corpus import stopwords

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove non-alphabetic characters
    text = re.sub(r'[^а-яА-Я\s]', '', text)

    # Tokenize text
    tokens = [token.text for token in razdel.tokenize(text)]

    # Remove stop words
    stop_words = set(stopwords.words('russian'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize tokens
    morph = pymorphy2.MorphAnalyzer()
    tokens = [morph.parse(token)[0].normal_form for token in tokens]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

In [211]:
#!c1.32
train['clean_text'] = train['sentence'].apply(preprocess_text)

In [212]:
#!c1.32
train.to_csv('last_text.csv')

In [250]:
train.isnull().sum()

sentence          0
1category         0
2category     18362
sentiment         0
clean_text        0
dtype: int64

In [9]:
train['sentiment'].value_counts()

−    10192
+     6262
?     2907
Name: sentiment, dtype: int64

In [261]:
train

Unnamed: 0,sentence,1category,2category,sentiment,clean_text
0,При этом всегда получал качественные услуги.,Communication,,+,получать качественный услуга
1,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−,видеть хотя поставить сервис
2,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−,любимый банк мкб обмануть
3,Отвратительное отношение к клиентам.,Communication,,−,отвратительный отношение клиент
4,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+,любой время день ночь помочь ответить решить
...,...,...,...,...,...
19356,Никогда и ни в коем случае не открывайте счет ...,Communication,,−,кой случай открывать счёт недостойный доверие ...
19357,ТИ откровенно забили на качество и развивают с...,Quality,,−,ти откровенно забить качество развивать свой м...
19358,"Я считаю, это прорыв и лидерство финансовых ус...",?,,+,считать это прорыв лидерство финансовый услуга...
19359,"Писал мужчина очень доходчиво, не финансовым я...",Communication,,+,писать мужчина очень доходчиво финансовый язык...


In [251]:
labels = ['1category', '2category', 'sentiment']

In [252]:
# Sentiment classification
from sklearn.model_selection import train_test_split
X = train['clean_text']
y = train['sentiment']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [154]:
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [142]:
classifier = xgb.XGBClassifier(objective='multi:softmax')

In [143]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [145]:
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [176]:
print('roc_auc_score base xgb ovr: ', roc_auc_score(y_test, classifier.predict_proba(X_test), multi_class='ovr'))

roc_auc_score base xgb ovr:  0.8481686299354066


In [257]:
# With embedding features
# %pip install sentence_transformers

In [270]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Define a function to extract sentence embeddings from a DataFrame
def get_sentence_embeddings(df, text_column='text'):
    # Tokenize the text column and convert to input IDs
    encoded_input = tokenizer(list(df[text_column]), padding=True, truncation=True, return_tensors="pt")

    # Get the model's outputs
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Get the embeddings for each sentence
    embeddings = mean_pooling(outputs, encoded_input['attention_mask'])

    # Create a new DataFrame with the embeddings
    embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]
    embeddings_df = pd.DataFrame(embeddings.numpy(), columns=embedding_cols)

    # Combine the original DataFrame with the new DataFrame
    df.reset_index(drop=True, inplace=True)
    embeddings_df.reset_index(drop=True, inplace=True)
    combined_df = pd.concat([df, embeddings_df], axis=1)

    return combined_df

# Example usage


In [277]:
X_best_emb = pd.DataFrame(X)

In [280]:
#!c1.32
X_best_emb = get_sentence_embeddings(X_best_emb, 'clean_text')


                                              clean_text  ...  embedding_767
0                           получать качественный услуга  ...      -0.006486
1                           видеть хотя поставить сервис  ...      -0.059996
2                              любимый банк мкб обмануть  ...      -0.043312
3                        отвратительный отношение клиент  ...      -0.007149
4           любой время день ночь помочь ответить решить  ...      -0.077008
...                                                  ...  ...            ...
19356  кой случай открывать счёт недостойный доверие ...  ...      -0.004653
19357  ти откровенно забить качество развивать свой м...  ...      -0.018135
19358  считать это прорыв лидерство финансовый услуга...  ...       0.029666
19359  писать мужчина очень доходчиво финансовый язык...  ...      -0.094332
19360  дать ситуация сильно выбить колея вместо заним...  ...      -0.005209

[19361 rows x 769 columns]


In [281]:
# #!c1.32
# X_best_emb.to_csv('best_emb.csv')

In [283]:
#!c1.32
X_best_emb = X_best_emb.drop('clean_text', axis=1)

In [157]:
# def generate_embeddings(text_series):
#     # Tokenize text
#     input_ids = tokenizer.batch_encode_plus(text_series.tolist(),
#                                              padding=True,
#                                              truncation=True,
#                                              return_tensors='pt')
#     # Generate embeddings
#     with torch.no_grad():
#         last_hidden_states = model(input_ids['input_ids'])[0]  # Last hidden state of the top layer
#         sentence_embeddings = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
#     # Return a DataFrame with the sentence embeddings
#     return pd.DataFrame(sentence_embeddings)

In [312]:
#!c1.32
X_best_emb

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11,embedding_12,embedding_13,embedding_14,embedding_15,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25,embedding_26,embedding_27,embedding_28,embedding_29,embedding_30,embedding_31,embedding_32,embedding_33,embedding_34,embedding_35,embedding_36,embedding_37,embedding_38,embedding_39,...,embedding_728,embedding_729,embedding_730,embedding_731,embedding_732,embedding_733,embedding_734,embedding_735,embedding_736,embedding_737,embedding_738,embedding_739,embedding_740,embedding_741,embedding_742,embedding_743,embedding_744,embedding_745,embedding_746,embedding_747,embedding_748,embedding_749,embedding_750,embedding_751,embedding_752,embedding_753,embedding_754,embedding_755,embedding_756,embedding_757,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
0,0.033561,0.241628,-0.020504,-0.029144,0.027451,-0.067558,0.036489,-0.035385,-0.098121,-0.001461,0.052536,-0.034371,0.015503,0.121725,-0.049692,-0.099421,0.020750,0.118351,0.088236,0.068724,0.054692,-0.070634,0.012135,-0.083231,-0.097430,-0.074589,0.038302,0.067926,0.039850,0.078138,0.176265,-0.015241,-0.051467,0.017638,0.073454,-0.018251,0.003457,0.009003,-0.260475,0.022136,...,-0.077746,0.057607,0.019336,0.067675,0.118187,0.062028,-0.029584,0.045351,-0.051511,-0.109085,-0.134220,0.067069,-0.072714,-0.069804,0.109219,0.014135,0.026050,-0.034886,0.064578,0.091832,0.066961,0.019315,-0.050085,0.035246,-0.025142,-0.015900,0.049331,-0.112987,0.040169,0.009495,0.075411,0.035300,0.092569,0.053626,0.000133,0.064224,-0.028944,-0.001428,0.048150,-0.006486
1,0.003222,0.121184,-0.021057,0.019642,0.018313,0.009631,0.081309,-0.043712,0.052399,-0.000024,0.105758,0.025378,-0.046738,0.094740,0.053608,-0.084529,-0.005280,0.084593,0.015002,0.076434,0.011415,-0.042510,0.043153,-0.032506,-0.050887,-0.039456,0.014861,0.062467,0.066549,0.023584,0.099447,0.015771,-0.020750,0.044826,0.063929,-0.008714,0.014999,0.012527,-0.186087,0.044810,...,-0.054326,-0.013633,-0.032734,0.011431,0.048294,0.073611,-0.066328,0.034614,0.031308,0.003696,-0.088454,0.080578,-0.035522,0.019205,0.030349,0.011634,-0.021328,-0.008215,0.055587,0.015706,0.025840,-0.008856,0.001684,-0.005401,-0.002800,-0.055406,0.048858,-0.081276,0.077215,-0.039150,0.163801,-0.017198,0.036937,0.058067,-0.009065,0.043891,0.008761,0.059182,-0.028284,-0.059996
2,-0.000346,0.075674,-0.020425,0.095101,0.014179,0.029522,0.029704,-0.042396,0.039335,0.111208,-0.007800,0.042671,0.053530,0.086570,0.038352,0.020343,0.026367,0.024028,0.019951,0.014605,0.030056,-0.047338,0.046745,-0.072279,-0.025455,0.013345,0.068229,0.038113,0.059616,0.064831,0.094317,-0.028070,-0.030072,0.091732,0.069149,-0.051505,-0.051563,0.019834,-0.131399,-0.037943,...,0.011830,-0.067960,-0.031858,-0.002232,0.046729,0.010518,-0.015273,0.022009,-0.008994,-0.013429,-0.017760,0.108474,-0.001345,0.026438,0.048841,0.051880,0.005091,0.034893,0.055792,-0.002181,0.047556,-0.055997,0.133109,-0.020609,0.081528,-0.006247,-0.119111,-0.045735,-0.121504,-0.031943,-0.004047,0.021128,0.033958,0.075990,0.004530,0.063491,0.029621,0.081379,0.011964,-0.043312
3,0.055516,0.165853,-0.019206,0.029021,0.030465,0.063678,0.176546,-0.065510,0.083418,-0.041515,0.028089,0.048684,-0.026218,-0.118382,-0.019871,0.078905,-0.019787,0.146729,-0.038124,0.004529,0.122164,0.023543,0.085979,-0.000988,-0.084974,-0.041781,0.146133,0.043074,0.054345,0.025379,0.183745,-0.054708,-0.025231,0.098661,0.063788,0.004096,0.069814,0.069698,-0.167185,0.067296,...,-0.120063,0.024045,-0.114192,-0.006113,0.056840,-0.012574,0.095878,-0.003479,0.181798,-0.037902,-0.051318,0.066170,-0.097861,-0.054045,0.089349,0.093794,-0.039893,0.033478,0.073508,0.041982,-0.015158,-0.021435,0.071309,0.078042,-0.082248,-0.039738,0.035065,-0.095385,-0.039955,-0.023927,0.074583,0.077772,-0.056700,-0.010030,0.039664,0.067684,0.091304,-0.010839,-0.008864,-0.007149
4,0.005066,0.098936,-0.019082,-0.048949,-0.088563,0.031567,0.067510,-0.018606,-0.032498,0.011958,0.106345,-0.011000,-0.019002,-0.109740,-0.000045,-0.035395,-0.031601,0.055502,0.072337,0.066895,-0.070712,-0.061158,0.033087,-0.024895,-0.060536,-0.075645,-0.053182,0.004787,0.102579,0.025283,0.047770,0.042572,-0.037630,0.055251,0.048532,-0.047140,0.008772,-0.010662,-0.110448,0.057402,...,-0.082282,-0.045841,0.056180,0.008914,0.064587,0.154941,-0.088713,-0.060990,0.122343,0.050800,-0.070383,0.058556,0.007708,0.074127,0.036600,0.006530,-0.057406,0.007214,0.016676,0.019938,0.051743,-0.040039,0.008813,-0.012542,-0.038191,-0.058006,-0.049079,-0.033324,0.105425,-0.055338,0.210500,-0.135738,0.032607,0.044237,-0.034184,-0.035729,0.009461,0.059493,-0.073302,-0.077008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19356,0.000859,0.190487,-0.016859,0.129469,-0.000028,-0.028348,0.103735,-0.140008,0.075496,0.057322,-0.020262,0.037291,0.047823,0.158538,0.006381,0.134618,0.025317,-0.024971,0.174324,0.022400,0.070812,-0.042047,0.059702,-0.043955,0.015048,0.002346,0.148943,0.090439,0.101152,0.027369,0.078211,-0.044023,-0.010375,0.162489,0.089912,-0.094445,0.026637,0.011328,-0.111342,0.058115,...,0.004723,0.088034,-0.130248,0.045140,0.056150,-0.021118,-0.073407,0.009010,0.099365,-0.019841,0.054088,0.172221,-0.076232,-0.043356,0.040483,0.064913,-0.008509,0.107167,0.161598,0.067158,-0.008049,-0.055603,0.222658,0.052421,-0.004490,-0.069614,-0.081410,-0.032854,-0.128941,-0.061860,-0.037068,-0.001656,0.035053,0.090665,-0.041925,0.078109,0.109257,0.014225,0.012792,-0.004653
19357,0.032032,0.061537,-0.019264,0.002868,0.004534,0.050557,0.027254,-0.102295,-0.006774,0.007101,0.049832,0.120977,-0.032588,0.223147,0.087384,0.006617,-0.017253,0.113171,-0.064779,0.040170,0.066414,-0.055323,0.102652,-0.046222,-0.042363,-0.073428,0.136881,0.061939,0.117052,-0.020631,0.084884,0.082405,-0.119001,0.108797,0.083144,-0.000054,0.050601,0.036466,-0.109827,0.077345,...,-0.114720,-0.002162,-0.061978,0.012523,-0.008894,0.135873,-0.046067,-0.027038,0.013262,0.005744,-0.016292,0.104308,-0.086204,-0.034912,-0.010030,-0.032328,-0.059210,-0.010189,0.060439,-0.042580,0.051341,-0.052456,-0.090050,0.058605,-0.007622,-0.001086,0.072451,-0.022359,-0.099756,-0.000540,0.065365,-0.026523,-0.043882,0.023413,0.036737,-0.109647,0.004175,0.070602,-0.049907,-0.018135
19358,-0.042555,-0.036387,-0.016100,-0.049482,-0.021552,-0.012359,0.011173,-0.108123,0.100553,0.072379,0.062237,-0.018137,-0.182154,0.164171,0.130403,-0.067014,0.057596,0.005598,-0.033289,0.004255,0.071176,-0.106822,0.023198,-0.017833,-0.056661,0.041402,-0.015770,0.009025,0.051816,0.013990,0.061542,-0.019067,-0.077444,0.156600,0.048715,-0.092869,-0.096613,0.017904,0.014588,-0.004086,...,0.106031,-0.031616,-0.083514,-0.022451,0.007529,0.052532,-0.353515,0.139265,-0.103176,-0.117686,0.036847,0.121815,0.013956,0.001873,0.090729,0.036160,-0.075879,0.017030,0.098183,0.014578,0.113561,-0.033536,0.025928,-0.064397,0.030518,-0.101179,0.112897,-0.008685,-0.069024,0.049386,0.071565,-0.004581,0.002399,0.065972,0.036679,-0.002718,0.056693,0.017727,0.005990,0.029666
19359,-0.046310,0.032858,-0.013776,0.002300,0.081776,0.031498,0.084283,-0.063685,0.054541,0.034563,-0.023627,0.001755,0.064758,0.198033,0.103672,-0.044455,-0.006525,0.006121,0.193940,-0.007223,0.041154,-0.015628,0.011494,-0.061298,-0.094090,-0.092930,0.074920,-0.009801,0.046079,0.140374,0.085737,0.025180,-0.012207,0.067050,0.038777,-0.055421,-0.014615,-0.015750,-0.176912,-0.116083,...,0.065639,0.063990,-0.086146,-0.019464,0.121880,-0.011878,-0.096177,0.033733,-0.025875,-0.063330,0.051564,0.079271,-0.016830,0.089070,0.132223,0.098517,-0.090881,0.034999,0.035420,-0.034058,0.096813,-0.157179,0.214725,-0.185385,0.019248,0.095712,-0.164586,0.017719,-0.113055,0.015780,0.051612,-0.016716,0.093506,0.087237,-0.039637,0.093844,0.117690,0.068180,-0.032256,-0.094332


In [160]:
# #!c1.32
# X_emb = generate_embeddings(X)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [161]:
# X_emb.to_csv('embedding_train_new_pymorphy.csv')

In [288]:
#!g1.1
X_emb_train, X_emb_test, y_emb_train, y_emb_test = train_test_split(X_best_emb, y, test_size=0.25, random_state=42)

In [303]:
#!g1.1
# Define the training and validation pools
from catboost import CatBoostClassifier, Pool


train_pool = Pool(X_emb_train, y_emb_train)
val_pool = Pool(X_emb_test, y_emb_test)

# Define the CatBoost classifier with CUDA
model = CatBoostClassifier(
    task_type='GPU',
    devices='0:1',
    loss_function='MultiClass',
    eval_metric='AUC',
    learning_rate=0.008,
    iterations=4200,
    depth=10,
    verbose=100,
    l2_leaf_reg=4
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)


In [304]:
#!g1.1
#fit the model to the data
model.fit(train_pool, eval_set=val_pool)

AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8532577	best: 0.8532577 (0)	total: 74.5ms	remaining: 5m 13s
100:	test: 0.9058735	best: 0.9059212 (98)	total: 6.62s	remaining: 4m 28s
200:	test: 0.9144675	best: 0.9144675 (200)	total: 13.1s	remaining: 4m 21s
300:	test: 0.9210121	best: 0.9210121 (300)	total: 19.6s	remaining: 4m 13s
400:	test: 0.9262850	best: 0.9262850 (400)	total: 25.9s	remaining: 4m 5s
500:	test: 0.9302868	best: 0.9302868 (500)	total: 32.2s	remaining: 3m 57s
600:	test: 0.9333210	best: 0.9333210 (600)	total: 38.5s	remaining: 3m 50s
700:	test: 0.9356649	best: 0.9356649 (700)	total: 44.7s	remaining: 3m 42s
800:	test: 0.9376701	best: 0.9376701 (800)	total: 50.7s	remaining: 3m 35s
900:	test: 0.9393077	best: 0.9393077 (900)	total: 56.6s	remaining: 3m 27s
1000:	test: 0.9405672	best: 0.9405672 (1000)	total: 1m 2s	remaining: 3m 20s
1100:	test: 0.9415744	best: 0.9415744 (1100)	total: 1m 8s	remaining: 3m 12s
1200:	test: 0.9424833	best: 0.9424833 (1200)	total: 1m 14s	remaining: 3m 5s
1300:	test: 0.9431763	best: 0.9431772

<catboost.core.CatBoostClassifier at 0x7f256694a3a0>

In [305]:
#!g1.1
print('roc_auc_score 1200 catboost embedding+new rubert ovr: ', roc_auc_score(y_emb_test, model.predict_proba(X_emb_test), multi_class='ovr'))

roc_auc_score 1200 catboost embedding+new rubert ovr:  0.9440055418417646


In [307]:
#!g1.1
model.save_model('best_model_catboost')  

In [None]:
#!g1.1


In [None]:
#!g1.1


In [None]:
#!g1.1


In [None]:
#!g1.1


In [None]:
#!g1.1


In [None]:
#!g1.1


In [35]:
#!c1.32
classifier_emb = xgb.XGBClassifier(max_depth=10, n_estimators = 800, objective='multi:softmax')

In [36]:
#!c1.32
classifier_emb.fit(X_emb_train, y_emb_train) 

XGBClassifier(max_depth=10, n_estimators=800, objective='multi:softprob')

In [37]:
#!c1.32
print('roc_auc_score base xgb embedding rubert ovr: ', roc_auc_score(y_emb_test, classifier_emb.predict_proba(X_emb_test), multi_class='ovr'))

roc_auc_score base xgb embedding rubert ovr:  0.9268632227997534


In [146]:
# #!c1.32
# model_file = "xgb_model.bin"
# classifier_emb.save_model(model_file)

In [78]:
#!c1.32
classifier_emb.predict_proba(X_emb_test)

array([[7.8812439e-04, 3.0480765e-04, 9.9890709e-01],
       [6.8348052e-04, 4.3926499e-04, 9.9887723e-01],
       [9.9733162e-01, 2.7542564e-04, 2.3929384e-03],
       ...,
       [8.1660348e-04, 4.9940941e-01, 4.9977395e-01],
       [4.2800098e-03, 9.9232894e-01, 3.3910370e-03],
       [6.2781095e-04, 3.1640078e-04, 9.9905580e-01]], dtype=float32)

In [None]:
#!c1.32


In [74]:
#!c1.32
loaded_model = xgb.Booster()
loaded_model.load_model("xgb_model.bin")

In [88]:
#!c1.32
print('roc_auc_score base xgb embedding rubert ovr: ', roc_auc_score(y_emb_test, loaded_model.predict(xgb.DMatrix(X_emb_test)), multi_class='ovr'))

roc_auc_score base xgb embedding rubert ovr:  0.9268632227997534
