In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import re
import optuna
import webbrowser

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report, recall_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import nltk
# from nltk import word_tokenize - нужно nltk.download('punkt')

from nltk import wordpunct_tokenize, wordnet
from nltk.stem import wordnet as WordNetLem
from nltk.stem import SnowballStemmer, StemmerI

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import doc2vec
from gensim.models import LdaModel, LdaMulticore
from gensim.models import LsiModel
from gensim import models

from tqdm import tqdm
from matplotlib import pyplot as plt

In [3]:
#coffee = pd.read_csv('./data/coffee.csv')
#phrases = pd.read_csv('./data/phrases.csv')
# sentences = pd.read_csv('./data/sentences.csv')

pd.read_xml("./data/SentiRuEval_rest_markup_train.xml")

Unnamed: 0,id,meta,scores,text,aspects,categories
0,3976,,,"День 8-го марта прошёл, можно и итоги подвести...",,
1,30808,,,Отмечали в этом ресторане день рождение на пер...,,
2,16137,,,Для встречи с друзьями было выбрано данное зав...,,
3,14031,,,Хочу поделиться своим впечатлением от посещени...,,
4,2495,,,Добрый день! Были вчера с друзьями в этом кафе...,,
...,...,...,...,...,...,...
196,36079,,,Отличный ресторан или скорее кафе. Очень уютна...,,
197,20086,,,Неделю назад зашли посмотреть на новое заведен...,,
198,35863,,,"Не так давно были в Жан-Жаке с подругой, вот р...",,
199,11770,,,"Была пару раз в пабе, очень понравилось. Вкусн...",,


In [122]:
df_tmp = sentences[sentences['rating'] == 5].dropna().iloc[:1000]
df_tmp

Unnamed: 0,sentence,rating,index
10,пить кофе рим париж вкусный капучий фундучный ...,5.0,1
11,десерт необычный,5.0,1
12,ребята барист больший молодец,5.0,1
13,улучшить,5.0,1
14,маловатый место посадкой придумать,5.0,1
...,...,...,...
1546,вкусно готовить,5.0,297
1547,единственный хотеться пожелать быстроить доста...,5.0,297
1548,всё отлично,5.0,297
1549,хотеть попробовать вкуснеинуть рол доступной ц...,5.0,297


In [54]:
df_tmp = sentences[sentences['rating'] == 5].dropna().iloc[:1000]
df_tmp.dropna(inplace=True) # в sentences 18 пропусков
df_tmp = df_tmp.reset_index().drop(columns='index')
df_tmp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp.dropna(inplace=True) # в sentences 18 пропусков


Unnamed: 0,level_0,sentence,rating
0,0,глубинка страна весь свой проявление,4.0
1,1,ассортимент столовския интерьер качество цена ...,4.0
2,2,хотеть трасса,4.0
3,3,поесть желудок бастовать значит риск оправдать,4.0
4,4,номер ночлег аналогичный толк,4.0
...,...,...,...
971,994,сервис вызывать вопрос,4.0
972,995,большоя заказ дарить комплимент,4.0
973,996,спасибо,4.0
974,997,забывать сварить кофе заказ,4.0


In [55]:
df_tmp['rating'].value_counts()

5.0    663
1.0    116
3.0     85
4.0     75
2.0     37
Name: rating, dtype: int64

# функции


In [56]:
RANDOM_STATE = 42

def show_topics( vectorizer_x=None, model=None, n_words=20):
    feature_names = np.array(vectorizer_x.get_feature_names_out())
    top_words = []

    for topic_weights in model.components_:
        top_keywords_locs = (-topic_weights).argsort()[:n_words]
        top_words.append(feature_names.take(top_keywords_locs))
    return top_words


def objective_MultinomialNB_classif(trial):
    alpha = trial.suggest_float('alpha', 0, 1000)
    # fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    # penalty = trial.suggest_categorical('penalty', ['l2', 'none'])

    params_tmp = {                  'alpha': alpha,
                 }
    preds_proba, preds = predict_data(MultinomialNB, params_tmp, X, y)

    return round(f1_score(y, preds, average='macro'), 6)


def objective_LogReg_classif(trial):
    C = trial.suggest_float('C', 0.001, 1000)
    # fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    # penalty = trial.suggest_categorical('penalty', ['l2', 'none'])

    params_tmp = {'random_state': 42,
                  'C': C,
                  'penalty': 'l2',
                  'fit_intercept': True,
                  'max_iter': 1000
                 }
    preds_proba, preds = predict_data(LogisticRegression, params_tmp, X, y)

    return round(f1_score(y, preds, average='macro'), 6)


def predict_data(type_model, params_tmp: dict, x_tmp: pd.DataFrame, y_tmp: pd.DataFrame, verbose: bool = False):

    # нужно преобразовать слои в параметр array
    model = type_model(**params_tmp)

    preds_proba = []
    preds = []

    sfk = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    for train_index, test_index in sfk.split(x_tmp, y_tmp):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # pt_ylo = QuantileTransformer(n_quantiles=len(X_train))
        # #pt_ylo = PowerTransformer()
        # X_train = pd.DataFrame(pt_ylo.fit_transform(X_train), columns=X_train.columns)
        # X_test = pd.DataFrame(pt_ylo.transform(X_test), columns=X_test.columns)

        model.fit(X_train, y_train)

        pred_prob = pd.DataFrame(model.predict_proba(X_test),
                                 columns=model.classes_,
                                 index=test_index)
        preds_proba.append(pred_prob)

        pred = pd.DataFrame(model.predict(X_test),
                            columns=['pred'],
                            index=test_index)
        preds.append(pred)

    preds_proba = pd.concat(preds_proba).sort_index()
    preds = pd.concat(preds).sort_index()
    if verbose:
        print_result_model(type_model.__name__, preds_proba, preds)

    return preds_proba, preds

def print_and_save_optuna_study(model, study_tmp, name_tmp, params_tmp, save_html=True):

    print(model.__name__, " -- лучший скор -- ", round(study_tmp.best_value, 5))
    print(study_tmp.best_params)

    if save_html:
        fig = optuna.visualization.plot_slice(study_tmp, params=params_tmp).to_html(f'{name_tmp}.html')
        with open(f'{name_tmp}.html', 'w', encoding='utf8') as f:
            f.write(fig)
        webbrowser.open(f'{name_tmp}.html', new=2)

# кодирование

In [57]:
vectorizer_x = CountVectorizer(analyzer='word',
                             min_df=1,
                             ngram_range=(2, 3)
                              )

res_vectorizer = vectorizer_x.fit_transform(df_tmp['sentence'])
res_vectorizer.shape, res_vectorizer.max()

((976, 7168), 2)

In [58]:
res_vectorizer.dtype = 'int8'

In [59]:
X = pd.DataFrame(res_vectorizer.toarray())
y = df_tmp['rating']

In [42]:
"""Подбор для LogReg_classif"""
study_neiro = optuna.create_study(direction="maximize")
study_neiro.optimize(objective_MultinomialNB_classif, n_trials=100, n_jobs=-1)
params = study_neiro.best_params.keys()
name = 'LogReg_classif'
print_and_save_optuna_study(MLPClassifier, study_neiro, name, params)

MLPClass, _ = predict_data(MLPClassifier, study_neiro.best_params, X, y, verbose=True)

[I 2023-12-21 15:13:20,767] A new study created in memory with name: no-name-be4ae9d2-5c87-4c8f-926c-e34e49c5178c
[I 2023-12-21 15:13:23,409] Trial 1 finished with value: 0.430952 and parameters: {'alpha': 224.54089102341246}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:23,499] Trial 2 finished with value: 0.430952 and parameters: {'alpha': 402.0809255599992}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:23,599] Trial 0 finished with value: 0.430952 and parameters: {'alpha': 590.935772278392}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:23,607] Trial 3 finished with value: 0.430952 and parameters: {'alpha': 415.4636359950037}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:25,958] Trial 4 finished with value: 0.430952 and parameters: {'alpha': 762.4433357939225}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:25,966] Trial 5 finished with value: 0.430952 and parameters: {'alpha': 939.7165240750852}. Best is trial 1 with val

[I 2023-12-21 15:13:58,696] Trial 54 finished with value: 0.430952 and parameters: {'alpha': 559.2562120777743}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:13:59,158] Trial 55 finished with value: 0.430952 and parameters: {'alpha': 639.1028770058283}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:14:00,501] Trial 56 finished with value: 0.430952 and parameters: {'alpha': 992.191127541329}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:14:00,985] Trial 57 finished with value: 0.430952 and parameters: {'alpha': 641.9049927760474}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:14:01,537] Trial 58 finished with value: 0.430952 and parameters: {'alpha': 614.4556884644596}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:14:01,858] Trial 59 finished with value: 0.430952 and parameters: {'alpha': 742.3968863520441}. Best is trial 1 with value: 0.430952.
[I 2023-12-21 15:14:03,154] Trial 60 finished with value: 0.430952 and parameters: {'alpha': 62

KeyboardInterrupt: 

In [60]:
preds_proba, preds = predict_data(MultinomialNB, , X, y)
preds.mean()

pred    5.0
dtype: float64

In [120]:
y_test.value_counts()

5.0    130
1.0     27
4.0     18
3.0     13
2.0      8
Name: rating, dtype: int64

In [103]:
from sklearn.metrics import accuracy_score

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# cross_val_score(MultinomialNB(alpha=500000), X, y)
model = MultinomialNB(**{'alpha': 0.3})
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(accuracy_score(y_test, preds))
preds



# preds_proba = []
# preds = []

# sfk = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# for train_index, test_index in sfk.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     model.fit(X_train, y_train)

#     pred_prob = pd.DataFrame(model.predict_proba(X_test),
#                              columns=model.classes_,
#                              index=test_index)
#     preds_proba.append(pred_prob)

#     pred = pd.DataFrame(model.predict(X_test),
#                         columns=['pred'],
#                         index=test_index)
#     preds.append(pred)

# preds_proba = pd.concat(preds_proba).sort_index()
# preds = pd.concat(preds).sort_index()

0.6632653061224489


array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5.])

In [63]:
preds.mean()

pred    5.0
dtype: float64

In [106]:
LDA_model = LatentDirichletAllocation(n_components=6,
                                          #learning_method='online',
                                          random_state=42,
                                          n_jobs=-1)
LDA_model.fit(res_vectorizer)



pd.DataFrame(show_topics(vectorizer_x, LDA_model, 10))

LatentDirichletAllocation(n_components=6, n_jobs=-1, random_state=42)