In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from nltk import FreqDist

from tqdm.auto import tqdm
import re
import contractions
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

from bs4 import MarkupResemblesLocatorWarning
import warnings

## Подготовка датасета

### Чтение датасета

In [None]:
# Read the questions table (decoded as ISO-8859-1) and preview the first rows.
questions = pd.read_csv(
    filepath_or_buffer="dataset/Questions.csv",
    encoding="ISO-8859-1",
)
questions.head()

In [None]:
# Read the tags table (decoded as ISO-8859-1) and preview the first rows.
tags = pd.read_csv(
    filepath_or_buffer="dataset/Tags.csv",
    encoding="ISO-8859-1",
)
tags.head()

In [None]:
# Print a structural summary of the questions frame.
questions.info()

In [None]:
# Print a structural summary of the tags frame.
tags.info()

### Группировка и объединение тегов

In [None]:
# Cast every tag to str, then collapse the one-row-per-tag table into a single
# space-separated tag string per question Id.
tags["Tag"] = tags["Tag"].astype(str)
grouped_tags = (
    tags
    .groupby("Id")["Tag"]
    .apply(lambda tag_group: " ".join(tag_group))
)
grouped_tags.head(10)

In [None]:
# Shape of the per-question tag Series (one entry per distinct Id).
grouped_tags.shape

In [None]:
# Turn the Id-indexed Series into a two-column frame with columns Id and Tags.
df_grouped_tags = grouped_tags.rename('Tags').reset_index()
df_grouped_tags.columns = ['Id', 'Tags']
df_grouped_tags.head()

### Удаление ненужных колонок

In [None]:
# Discard the columns that are not used further on in this notebook.
unused_columns = ['OwnerUserId', 'CreationDate', 'ClosedDate']
questions = questions.drop(columns=unused_columns)
questions.head()

### Объединение вопросов и тегов

In [None]:
# Attach the grouped tag string to every question by joining on Id
# (default join type, so questions without tags are dropped).
data = pd.merge(questions, df_grouped_tags, on='Id')
data.head(10)

### Анализ и работа с показателем рейтинга (Score)

#### Метрики

In [None]:
# Summary statistics for the question Score column.
scores = data["Score"]

print(f"Minimum Score: {scores.min()}")
print(f"Maximum Score: {scores.max()}")

print(f"Total count {scores.count()}")
# The label is generated from the threshold that is actually applied, so the
# two can no longer drift apart (previously the "> 10" and "> 20" lines were
# printed with each other's counts).
for threshold in (0, 5, 10, 20):
    print(f"Count (Score > {threshold}) {(scores > threshold).sum()}")

print(f"Describe\n{scores.describe()}")

#### График распределения рейтинга

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# Panel 1 shows every Score value, panel 2 only the scores below 5.
low_scores = data.loc[data['Score'] < 5, "Score"]
histogram_panels = (
    (ax1, data["Score"], 'green', 'Распределение всего рейтинга (Score)'),
    (ax2, low_scores, 'red', 'Распределение рейтинга (Score < 5)'),
)

for axis, values, colour, title in histogram_panels:
    axis.hist(values, bins=100, color=colour, edgecolor='black')
    axis.set_title(title)
    axis.set_xlabel('Score')
    axis.set_ylabel('Количество вопросов')
    # Log scale on y keeps the sparsely populated bins visible.
    axis.set_yscale('log')
    axis.grid(axis='x', linestyle='--', alpha=0.7)

# Let matplotlib pick the spacing between the two panels.
plt.tight_layout()
plt.show()

#### Удаление записей с низким показателем рейтинга, удаление ненужных колонок

In [None]:
# Keep only questions with Score > 5, then discard the Id and Score columns.
# The drop is done out-of-place: the filtered frame is a slice of the original,
# and mutating it with inplace=True triggers pandas' SettingWithCopyWarning.
data = data[data['Score'] > 5].drop(columns=['Id', 'Score'])

print(data.shape)
data.info()

In [None]:
# Preview the first ten rows that survived the score filter.
data.head(10)

### Подготовка тегов 

#### Преобразование строки тегов в список тегов

In [None]:
# Split each whitespace-separated tag string into a list of tags.
data['Tags'] = data['Tags'].map(str.split)
data.head(10)

#### Удаление редких тегов

In [None]:
# One row per (question, tag) pair.
flat_series = data['Tags'].explode()

# Occurrences per tag, number of tag occurrences, number of distinct tags.
tag_counts = flat_series.value_counts()
total_tags = flat_series.count()
unique_tags_count = flat_series.nunique()

print(f"Total tags: {total_tags}")
print(f"Unique tags: {unique_tags_count}")
print(tag_counts.head(10))

In [None]:
# Number of most frequent tags that are kept as prediction targets.
COMMON_TAGS_COUNT = 100

In [None]:
# Frequency distribution over all tag occurrences; keep only the names of the
# COMMON_TAGS_COUNT most frequent tags.
keywords = FreqDist(flat_series)
tags_features = [
    tag for tag, _occurrences in keywords.most_common(COMMON_TAGS_COUNT)
]

In [None]:
fig, ax = plt.subplots(figsize=(25, 6))

# Bar chart of the most frequent tags, one bar per tag.
labels, frequencies = zip(*keywords.most_common(COMMON_TAGS_COUNT))
bar_positions = range(len(labels))
ax.bar(bar_positions, frequencies)
ax.set_xticks(bar_positions)
ax.set_xticklabels(labels, rotation=90)

# Denser tick marks on the y axis plus horizontal guide lines.
ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
ax.yaxis.set_minor_locator(ticker.AutoMinorLocator(5))
ax.grid(axis='y', linestyle='--', alpha=0.7)

ax.set_title('Топ-100 самых частых тегов')
ax.set_xlabel('Теги')
ax.set_ylabel('Частота')
plt.show()

In [None]:
tags_features_set = set(tags_features)

# Keep the rows that carry at least one of the frequent tags ...
mask = data["Tags"].map(
    lambda row_tags: not tags_features_set.isdisjoint(row_tags)
)
filtered_data = data[mask].copy()

# ... and strip every other tag from their tag lists.
filtered_data["Tags"] = filtered_data["Tags"].map(
    lambda row_tags: [tag for tag in row_tags if tag in tags_features_set]
)

data = filtered_data[filtered_data["Tags"].map(len) > 0]

data

### Подготовка заголовка и описания

#### Изначальный вид описания


In [None]:
# Show the first 200 characters of the first five raw bodies.
for idx, text in data["Body"].head(5).items():
    preview = text[:200] + "..."
    print(f"=== Запись {idx} ===")
    print(preview)
    print("\n")

#### Инициализация зависимостей и настройка NLTK

In [None]:
tqdm.pandas()

# Resources downloaded here: punkt (tokenizer), stopwords, wordnet (lemmatizer
# data) and punkt_tab (tokenizer tables needed by some NLTK versions).
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Extra search location for NLTK data.
# NOTE(review): this is a user-specific absolute path — confirm it is still wanted.
nltk.data.path.append("/home/ql/nltk_data")

# Verify that the tokenizer models can be located.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("""
            FAILED (btw)
          """)

#### Конвейер обработки

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punct = set('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')

# Registered once here instead of on every preprocess_text() call: the function
# runs once per row, and the filter only needs to be installed a single time.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Compiled once; both patterns are applied to every row.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text: str) -> str:
    """Clean one raw Title/Body value for TF-IDF vectorisation.

    Steps, in order:
    1. Strip HTML markup.
    2. Expand contractions and lowercase.
    3. Replace everything except ``a-z`` and whitespace with spaces, then
       collapse whitespace runs.
    4. Tokenize, drop stop words, lemmatize the remaining tokens as verbs.

    Args:
        text: Raw text. Anything that is not a non-blank ``str`` (``None``,
            ``NaN``, numbers, containers) is treated as missing.

    Returns:
        The processed tokens joined by single spaces, or ``''`` for missing
        or blank input.
    """
    # The type check comes first so that array-like values never reach a
    # truth test on pd.isna(), which is ambiguous for non-scalars.
    if not isinstance(text, str) or not text.strip():
        return ''

    # HTML -> plain text
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Expand contractions, normalise case, keep letters and whitespace only.
    text = contractions.fix(text).lower()
    text = _NON_LETTER_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    tokens = word_tokenize(text)
    # Stop words are filtered on the raw token, before lemmatization. The
    # punctuation test is kept as a safety net; after the regex above the
    # tokens should contain letters only.
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v')
        for token in tokens
        if token not in stop_words and token not in punct
    )

#### Обработка

In [None]:
# `data` was produced by boolean indexing in the tag-filtering cell, so
# assigning columns to it directly raises pandas' SettingWithCopyWarning.
# Work on an explicit copy instead.
data = data.copy()

# Run the text pipeline over Title and Body, with a progress bar per column.
for column in ('Title', 'Body'):
    tqdm.pandas(desc=f"Processing {column}")
    data[column] = data[column].progress_apply(preprocess_text)

# Drop rows with a missing Title, Body or Tags value, then print the per-column
# null counts as a sanity check.
data = data.dropna(subset=['Title', 'Body', 'Tags'], how='any')
print(data[data["Body"].isnull()].count())
print(data[data["Title"].isnull()].count())
print(data[data["Tags"].isnull()].count())


# Persist the result; the training section reads this file back.
data.to_csv('processed_dataset/processed_questions.csv', index=False)

data.info()

#### Итог

In [None]:
# Show the first 200 characters of the first five processed bodies.
for idx, text in data["Body"].head(5).items():
    preview = text[:200] + "..."
    print(f"=== Запись {idx} ===")
    print(preview)
    print("\n")

In [None]:
# Display the processed Title column.
# NOTE(review): Title holds plain strings here, so explode() presumably leaves
# it unchanged — confirm whether a plain column display was intended.
data["Title"].explode()

## Обучение модели

In [None]:
from ast import literal_eval

from tensorflow import keras
from tensorflow.keras import Model, regularizers, metrics, layers, optimizers, callbacks

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

import pickle
import pandas as pd

import tensorflow as tf
import numpy as np

### Подготовка данных

In [None]:
# Reload the processed dataset. Tags were written as the repr of a Python list,
# so each cell is parsed back into a list with literal_eval.
data = pd.read_csv(
    "processed_dataset/processed_questions.csv",
    encoding="ISO-8859-1",
    dtype={"Title": str, "Body": str},
    converters={"Tags": literal_eval},
)

In [None]:
# Text features: missing values become empty strings and every element is
# forced to str. X1 is the Body column, X2 the Title column.
X1 = data['Body'].fillna('').astype(str)
X2 = data['Title'].fillna('').astype(str)

# Targets: the tag lists.
Y = data['Tags']

In [None]:
# Encode the tag lists as a binary indicator matrix (one column per tag).
multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(Y)

# Persist the fitted binarizer. The context manager makes sure the file is
# flushed and closed; the previous bare open() left the handle dangling.
with open("vectorizers/TagsVectorizer.pickle", "wb") as binarizer_file:
    pickle.dump(multilabel_binarizer, binarizer_file)

In [None]:
# Both text columns get their own vectorizer built from the same settings:
# tokens are runs of two or more non-whitespace characters and at most 1000
# features are kept per column.
_TFIDF_SETTINGS = dict(
    analyzer='word',
    min_df=0.0,
    max_df=1.0,
    strip_accents=None,
    encoding='utf-8',
    preprocessor=None,
    token_pattern=r"(?u)\S\S+",
    max_features=1000,
)

vectorizer_X1 = TfidfVectorizer(**_TFIDF_SETTINGS)
vectorizer_X2 = TfidfVectorizer(**_TFIDF_SETTINGS)

In [None]:
# Fit one vectorizer per text column and concatenate the matrices column-wise:
# Body features (X1) first, Title features (X2) second.
X1_tfidf = vectorizer_X1.fit_transform(X1)
X2_tfidf = vectorizer_X2.fit_transform(X2)
X_tfidf = hstack([X1_tfidf, X2_tfidf])

# Persist both fitted vectorizers. Context managers close the files
# deterministically; the previous bare open() calls never closed them.
with open("vectorizers/BodyVectorizer.pickle", "wb") as body_file:
    pickle.dump(vectorizer_X1, body_file)
with open("vectorizers/TitleVectorizer.pickle", "wb") as title_file:
    pickle.dump(vectorizer_X2, title_file)

In [None]:
# Fraction of the samples held out for testing.
TEST_SPLIT_FRACTION = 0.2

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_bin,
    test_size=TEST_SPLIT_FRACTION,
    random_state=0,
)

In [None]:
# Shapes in the order: train features, train targets, test features, test targets.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

### Обучение моделей

In [None]:
# Number of tag classes; used as the output width of the first model and as the
# length of its regularizer penalty map.
COMMON_TAGS_COUNT = 100
# Number of training epochs used by every model.fit call below.
EPOCHS_COUNT = 1

#### 1

In [None]:
class GroupAwareRegularizer(regularizers.Regularizer):
    """Regularizer that applies a different penalty factor to each class column.

    ``groups`` is a list of dicts such as::

        [
            {'class_indices': [0, 1, 2], 'penalty': 0.3},
            {'class_indices': [25, 37], 'penalty': 0.1},
            {'default_penalty': 0.05},   # every class not listed above
        ]

    Entries with ``class_indices`` set the penalty for those classes; an
    entry with ``default_penalty`` sets the penalty for all remaining
    classes (0.0 when no such entry is present).
    """

    def __init__(self, groups):
        self.groups = groups
        self.penalty_map = self._build_penalty_map()

    def _build_penalty_map(self):
        """Return a float32 vector of length COMMON_TAGS_COUNT with per-class penalties."""
        default = next(
            (g['default_penalty'] for g in self.groups if 'default_penalty' in g),
            0.0,
        )

        # Start from the default for every class and overwrite the listed
        # ones. The map used to start from ones and the default was
        # substituted only where the map equalled 0.0, which never happened,
        # so unlisted classes silently received a penalty of 1.0.
        penalty_map = tf.fill([COMMON_TAGS_COUNT], tf.cast(default, tf.float32))

        for group in self.groups:
            if 'class_indices' not in group:
                continue
            class_indices = group['class_indices']
            indices = tf.constant(class_indices, dtype=tf.int32)
            updates = tf.fill(
                [len(class_indices)], tf.cast(group['penalty'], tf.float32)
            )
            penalty_map = tf.tensor_scatter_nd_update(
                penalty_map,
                tf.expand_dims(indices, -1),
                updates,
            )
        return penalty_map

    def __call__(self, x):
        """Return the weighted penalty for tensor ``x``.

        ``x`` must have COMMON_TAGS_COUNT entries along its last axis. In this
        notebook the regularizer is attached as ``kernel_regularizer``, so
        ``x`` is the layer kernel rather than a batch of activations.

        The squared values are averaged over axis 0, giving one value per
        class, scaled by the class penalty and summed.
        """
        penalties = self.penalty_map * tf.reduce_mean(tf.square(x), axis=0)
        return tf.reduce_sum(penalties)

    def get_config(self):
        """Return the constructor arguments needed to recreate the regularizer."""
        return {'groups': self.groups}

def focal_loss_model():
    """Build and compile the dense classifier trained with binary focal loss.

    The network takes 2000 input features and produces COMMON_TAGS_COUNT
    sigmoid outputs.

    Returns:
        The compiled Keras model.
    """
    features = keras.Input(shape=(2000,))

    # Feature extraction.
    hidden = layers.Dense(
        1024,
        activation='swish',
        kernel_regularizer=regularizers.l1_l2(0.001, 0.01),
    )(features)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.5)(hidden)

    # Bottleneck whose un-normalised projection is added back as a shortcut.
    shortcut = layers.Dense(512, activation='swish')(hidden)
    hidden = layers.BatchNormalization()(shortcut)
    hidden = layers.Dropout(0.3)(hidden)
    hidden = layers.Add()([hidden, shortcut])

    # Decision block.
    hidden = layers.Dense(256, activation='swish')(hidden)

    # Per-class penalties on the output kernel.
    # NOTE(review): the indices address output columns, i.e. the column order
    # of y_bin — confirm that they select the tags these penalties are meant for.
    penalty_groups = [
        {'class_indices': [0, 1], 'penalty': 0.3},
        {'class_indices': [2, 3, 4, 5], 'penalty': 0.2},
        {'class_indices': [6, 7, 8, 9, 10, 11, 12, 13, 14], 'penalty': 0.1},
        {'default_penalty': 0.05},
    ]
    tag_probabilities = layers.Dense(
        COMMON_TAGS_COUNT,
        activation='sigmoid',
        kernel_regularizer=GroupAwareRegularizer(groups=penalty_groups),
    )(hidden)

    model = keras.Model(inputs=features, outputs=tag_probabilities)

    # Focal loss down-weights easy examples.
    focal = keras.losses.BinaryFocalCrossentropy(gamma=2.0, alpha=0.25)

    # Nadam with an exponentially decaying learning rate
    # (initial rate 0.001, decay steps 1000, decay rate 0.9).
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(0.001, 1000, 0.9)
    nadam = keras.optimizers.Nadam(learning_rate=lr_schedule)

    tracked_metrics = [
        metrics.PrecisionAtRecall(0.5),
        metrics.AUC(multi_label=True, name='auc'),
        metrics.RecallAtPrecision(0.7),
        metrics.Precision(name='precision'),
        metrics.Recall(name='recall'),
        metrics.F1Score(name="F1"),
    ]
    model.compile(loss=focal, optimizer=nadam, metrics=tracked_metrics)

    return model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight

model = focal_loss_model()
model.summary()

# Weight every training sample by its number of positive labels, normalised so
# that the mean weight is 1.
sample_weights = np.sum(y_train, axis=1)
sample_weights = sample_weights / np.mean(sample_weights)

history = model.fit(
    X_train,
    y_train,
    sample_weight=sample_weights,
    epochs=EPOCHS_COUNT,
    # No validation data is passed to fit(), so the default monitored quantity
    # (val_loss) is never produced and early stopping could not trigger.
    # Monitor the training loss instead.
    callbacks=[EarlyStopping(monitor='loss', patience=3)]
)

model.save("fit_history/model_1.keras")
# NOTE(review): this stores the History object itself via pickling; the metrics
# cell relies on that and reads `.history` from the loaded object.
np.save("fit_history/hist_1", history)

#### 2

In [None]:
class TokenLearner(layers.Layer):
    """Expand a flat feature vector into ``num_tokens`` softmax-weighted copies.

    Args:
        num_tokens: Number of tokens (softmax units) to produce.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets the layer be rebuilt from the config that
            ``get_config`` returns.
    """

    def __init__(self, num_tokens=16, **kwargs):
        super().__init__(**kwargs)
        self.num_tokens = num_tokens

    def build(self, input_shape):
        # Dense softmax layer producing one weight per token for every sample.
        self.tokenizer = layers.Dense(self.num_tokens, activation='softmax')

    def call(self, inputs):
        # inputs: (batch, features)
        attn_weights = self.tokenizer(inputs)  # (batch, num_tokens)
        # Outer product per sample: (batch, num_tokens, features)
        return tf.einsum('bf,bk->bkf', inputs, attn_weights)

    def get_config(self):
        """Return the base layer config extended with ``num_tokens``."""
        config = super().get_config()
        config.update({'num_tokens': self.num_tokens})
        return config

class GroupAwareRegularizer(regularizers.Regularizer):
    """Squared-weight penalty with a separate factor per group of output classes.

    ``groups`` is a list of dicts. Entries of the form
    ``{'class_indices': [...], 'penalty': p}`` penalise the kernel columns of
    the listed classes with factor ``p``. An optional
    ``{'default_penalty': p}`` entry applies ``p`` to every column not listed
    in any group; without it, unlisted columns are not penalised.
    """

    def __init__(self, groups):
        self.groups = groups

    def __call__(self, weights):
        """Return the total penalty for a kernel of shape (inputs, classes)."""
        penalty = 0.0
        covered = set()
        default_penalty = 0.0
        for group in self.groups:
            if 'class_indices' not in group:
                # Entries without class indices used to raise KeyError here.
                # They carry the optional default for unlisted classes.
                default_penalty = group.get('default_penalty', default_penalty)
                continue
            covered.update(group['class_indices'])
            class_weights = tf.gather(weights, group['class_indices'], axis=1)
            penalty += group['penalty'] * tf.reduce_sum(tf.square(class_weights))

        if default_penalty:
            remaining = [i for i in range(weights.shape[1]) if i not in covered]
            if remaining:
                rest = tf.gather(weights, remaining, axis=1)
                penalty += default_penalty * tf.reduce_sum(tf.square(rest))
        return penalty

    def get_config(self):
        """Return the constructor arguments needed to recreate the regularizer."""
        return {'groups': self.groups}


def build_enhanced_model(input_dim=2000, num_classes=100):
    """Build and compile the gated two-branch tag classifier.

    Args:
        input_dim: Width of the input feature vector.
        num_classes: Number of sigmoid outputs (one per tag).

    Returns:
        The compiled Keras model, trained with binary cross-entropy.
    """
    features = tf.keras.Input(shape=(input_dim,))

    # Token branch: TokenLearner followed by pooling over the token axis.
    # NOTE(review): the token weights come from a softmax, so averaging over
    # the token axis looks equivalent to inputs / num_tokens — confirm that
    # this branch does what was intended.
    token_summary = TokenLearner(num_tokens=64)(features)
    token_summary = layers.GlobalAveragePooling1D()(token_summary)

    # Main branch.
    main = layers.Dense(1024, activation='gelu')(features)
    main = layers.LayerNormalization()(main)

    # Gate the main branch element-wise with a sigmoid of the token branch.
    gate = layers.Dense(1024, activation='sigmoid')(token_summary)
    main = layers.Multiply()([main, gate])

    # Bottleneck: project both branches to 512 units and concatenate them.
    main_projection = layers.Dense(512, activation='gelu')(main)
    token_projection = layers.Dense(512, activation='gelu')(token_summary)
    merged = layers.Concatenate()([main_projection, token_projection])
    merged = layers.BatchNormalization()(merged)
    merged = layers.Dropout(0.4)(merged)

    # Output layer; kernel columns 0-10 are penalised in three groups.
    group_penalties = [
        {'class_indices': list(range(3)), 'penalty': 0.1},
        {'class_indices': list(range(3, 7)), 'penalty': 0.01},
        {'class_indices': list(range(7, 11)), 'penalty': 0.001},
    ]
    outputs = layers.Dense(
        num_classes,
        activation='sigmoid',
        kernel_regularizer=GroupAwareRegularizer(group_penalties),
    )(merged)

    model = Model(inputs=features, outputs=outputs)

    # Adam with cosine learning-rate decay and weight decay.
    lr_schedule = optimizers.schedules.CosineDecay(
        initial_learning_rate=1e-4,
        decay_steps=1000,
    )
    optimizer = optimizers.Adam(learning_rate=lr_schedule, weight_decay=1e-5)

    tracked_metrics = [
        metrics.PrecisionAtRecall(0.5),
        metrics.AUC(multi_label=True, name='auc'),
        metrics.RecallAtPrecision(0.7),
        metrics.Precision(name='precision'),
        metrics.Recall(name='recall'),
        metrics.F1Score(name="F1"),
    ]
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )

    return model

In [None]:
# Build, train and persist model 2 together with its training history.
model = build_enhanced_model()
model.summary()

history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS_COUNT,
)

model.save("fit_history/model_2.keras")
np.save("fit_history/hist_2", history)

#### 3

In [None]:
class TokenLearner(layers.Layer):
    """Expand a flat feature vector into ``num_tokens`` softmax-weighted copies.

    Args:
        num_tokens: Number of tokens (softmax units) to produce.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets the layer be rebuilt from the config that
            ``get_config`` returns.
    """

    def __init__(self, num_tokens=16, **kwargs):
        super().__init__(**kwargs)
        self.num_tokens = num_tokens

    def build(self, input_shape):
        # Dense softmax layer producing one weight per token for every sample.
        self.tokenizer = layers.Dense(self.num_tokens, activation='softmax')

    def call(self, inputs):
        # inputs: (batch, features)
        attn_weights = self.tokenizer(inputs)  # (batch, num_tokens)
        # Outer product per sample: (batch, num_tokens, features)
        return tf.einsum('bf,bk->bkf', inputs, attn_weights)

    def get_config(self):
        """Return the base layer config extended with ``num_tokens``."""
        config = super().get_config()
        config.update({'num_tokens': self.num_tokens})
        return config

class GroupAwareRegularizer(regularizers.Regularizer):
    """Squared-weight penalty with a separate factor per group of output classes.

    ``groups`` is a list of dicts. Entries of the form
    ``{'class_indices': [...], 'penalty': p}`` penalise the kernel columns of
    the listed classes with factor ``p``. An optional
    ``{'default_penalty': p}`` entry applies ``p`` to every column not listed
    in any group; without it, unlisted columns are not penalised.
    """

    def __init__(self, groups):
        self.groups = groups

    def __call__(self, weights):
        """Return the total penalty for a kernel of shape (inputs, classes)."""
        penalty = 0.0
        covered = set()
        default_penalty = 0.0
        for group in self.groups:
            if 'class_indices' not in group:
                # Entries without class indices used to raise KeyError here.
                # They carry the optional default for unlisted classes.
                default_penalty = group.get('default_penalty', default_penalty)
                continue
            covered.update(group['class_indices'])
            class_weights = tf.gather(weights, group['class_indices'], axis=1)
            penalty += group['penalty'] * tf.reduce_sum(tf.square(class_weights))

        if default_penalty:
            remaining = [i for i in range(weights.shape[1]) if i not in covered]
            if remaining:
                rest = tf.gather(weights, remaining, axis=1)
                penalty += default_penalty * tf.reduce_sum(tf.square(rest))
        return penalty

    def get_config(self):
        """Return the constructor arguments needed to recreate the regularizer."""
        return {'groups': self.groups}


def build_enhanced_model(input_dim=2000, num_classes=100):
    """Build and compile the gated two-branch tag classifier.

    Args:
        input_dim: Width of the input feature vector.
        num_classes: Number of sigmoid outputs (one per tag).

    Returns:
        The compiled Keras model, trained with binary cross-entropy.
    """
    features = tf.keras.Input(shape=(input_dim,))

    # Token branch: TokenLearner followed by pooling over the token axis.
    # NOTE(review): the token weights come from a softmax, so averaging over
    # the token axis looks equivalent to inputs / num_tokens — confirm that
    # this branch does what was intended.
    token_summary = TokenLearner(num_tokens=64)(features)
    token_summary = layers.GlobalAveragePooling1D()(token_summary)

    # Main branch.
    main = layers.Dense(1024, activation='gelu')(features)
    main = layers.LayerNormalization()(main)

    # Gate the main branch element-wise with a sigmoid of the token branch.
    gate = layers.Dense(1024, activation='sigmoid')(token_summary)
    main = layers.Multiply()([main, gate])

    # Bottleneck: project both branches to 512 units and concatenate them.
    main_projection = layers.Dense(512, activation='gelu')(main)
    token_projection = layers.Dense(512, activation='gelu')(token_summary)
    merged = layers.Concatenate()([main_projection, token_projection])
    merged = layers.BatchNormalization()(merged)
    merged = layers.Dropout(0.4)(merged)

    # Output layer; kernel columns 0-10 are penalised in three groups.
    group_penalties = [
        {'class_indices': list(range(3)), 'penalty': 0.1},
        {'class_indices': list(range(3, 7)), 'penalty': 0.01},
        {'class_indices': list(range(7, 11)), 'penalty': 0.001},
    ]
    outputs = layers.Dense(
        num_classes,
        activation='sigmoid',
        kernel_regularizer=GroupAwareRegularizer(group_penalties),
    )(merged)

    model = Model(inputs=features, outputs=outputs)

    # Adam with cosine learning-rate decay and weight decay.
    lr_schedule = optimizers.schedules.CosineDecay(
        initial_learning_rate=1e-4,
        decay_steps=1000,
    )
    optimizer = optimizers.Adam(learning_rate=lr_schedule, weight_decay=1e-5)

    tracked_metrics = [
        metrics.PrecisionAtRecall(0.5),
        metrics.AUC(multi_label=True, name='auc'),
        metrics.RecallAtPrecision(0.7),
        metrics.Precision(name='precision'),
        metrics.Recall(name='recall'),
        metrics.F1Score(name="F1"),
    ]
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )

    return model

In [None]:
# Build, train and persist model 3 together with its training history.
model = build_enhanced_model()
model.summary()

history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS_COUNT,
)

model.save("fit_history/model_3.keras")
np.save("fit_history/hist_3", history)

#### 4

In [None]:
def balanced_dynamic_dropout_model(input_dim=2000, num_classes=100):
    """Build and compile a residual dense classifier with a per-sample weighted BCE loss.

    Args:
        input_dim: Width of the input feature vector.
        num_classes: Number of sigmoid outputs (one per tag).

    Returns:
        The compiled Keras model.
    """
    inputs = tf.keras.Input(shape=(input_dim,))

    # 1. Input block
    x = layers.Dense(1024, activation='swish', kernel_regularizer=regularizers.l2(1e-3))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)

    # 2. Intermediate block; `residual` projects to the width used by the skip connection
    residual = layers.Dense(512, activation='swish')(x)
    x = layers.Dense(768, activation='swish')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)

    # 3. Bottleneck brought to the same width as `residual`
    x = layers.Dense(512, activation='swish')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # Skip connection (both branches are 512 wide)
    x = layers.Add()([x, residual])

    # 4. Output layer
    outputs = layers.Dense(num_classes, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    def weighted_bce(y_true, y_pred):
        """Binary cross-entropy weighted inversely to each sample's label count."""
        y_true = tf.cast(y_true, y_pred.dtype)
        labels_per_sample = tf.reduce_sum(y_true, axis=1)
        # A sample without any positive label would otherwise divide by zero
        # and turn the whole batch loss into inf/NaN.
        safe_counts = tf.maximum(labels_per_sample, 1.0)
        weights = tf.reduce_mean(labels_per_sample) / safe_counts
        bce = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        return tf.reduce_mean(bce * weights)

    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss=weighted_bce,
        metrics=[
            metrics.PrecisionAtRecall(0.5),
            metrics.AUC(multi_label=True, name='auc'),
            metrics.RecallAtPrecision(0.7),
            metrics.Precision(name='precision'),
            metrics.Recall(name='recall'),
            metrics.F1Score(name="F1"),
        ]
    )

    return model

In [None]:
# Build, train and persist model 4 together with its training history.
model = balanced_dynamic_dropout_model()
model.summary()

history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS_COUNT,
)

model.save("fit_history/model_4.keras")
np.save("fit_history/hist_4", history)

### Еще модели

In [None]:
def print_score(y_pred, clf):
    """Print weighted recall, precision, F1 and the Hamming loss for ``y_pred``.

    Predictions are compared against the global ``y_test``.

    Args:
        y_pred: Predicted label indicator matrix for the test split.
        clf: Estimator whose class name is printed as the header.
    """
    print("Clf: ", clf.__class__.__name__)
    print("Recall score: {}".format(recall_score(y_true=y_test, y_pred=y_pred, average='weighted')))
    print("Precision score: {}".format(precision_score(y_true=y_test, y_pred=y_pred, average='weighted')))
    # Ground truth has to come first: the 'weighted' average weights each label
    # by its support in y_true, so the previously swapped arguments weighted
    # the score by predicted counts instead.
    print("F1 score: {}".format(f1_score(y_true=y_test, y_pred=y_pred, average='weighted')))
    # Reported as a percentage.
    print("Hamming loss: {}".format(hamming_loss(y_true=y_test, y_pred=y_pred)*100))
    print("---")

from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings("ignore")

from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

from datetime import datetime
import pickle

# Linear baselines, each wrapped in one-vs-rest for the multi-label targets.
sgd = SGDClassifier()
lr = LogisticRegression()
mn = MultinomialNB()
svc = LinearSVC()
perceptron = Perceptron()
pac = PassiveAggressiveClassifier()

for classifier in [sgd, lr, mn, svc, perceptron, pac]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)

    # Persist the fitted model under a timestamped name so that reruns do not
    # overwrite earlier results.
    clf_name = classifier.__class__.__name__
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'fit_history/{clf_name}_{timestamp}.pkl'

    with open(filename, 'wb') as f:
        pickle.dump(clf, f)

    # Predict once (the test split used to be predicted twice per classifier)
    # and report the scores.
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

## Результаты обучения

In [None]:
# Plotting of the training curves
def plot_training_history(history):
    """Plot the recorded training metrics on a 2x2 grid.

    Args:
        history: Mapping from metric name to its per-epoch values. The keys
            looked up are ``loss``, ``auc``, ``precision_at_recall`` and
            ``recall_at_precision``.

    A metric that is missing from ``history`` is reported on stdout and its
    panel is left out. Other errors are no longer swallowed silently.
    """
    # (history key, panel title); the title doubles as the legend label.
    panels = (
        ('loss', 'Training Loss'),
        ('auc', 'AUC'),
        ('precision_at_recall', 'Precision@Recall=0.5'),
        ('recall_at_precision', 'Recall@Precision=0.7'),
    )

    plt.figure(figsize=(15, 10))

    for position, (key, title) in enumerate(panels, start=1):
        try:
            values = history[key]
        except KeyError:
            print(f"Metric '{key}' is not present in the history; skipping its panel.")
            continue

        plt.subplot(2, 2, position)
        plt.plot(values, label=title)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.legend()

    plt.tight_layout()
    plt.show()

def check_model(model):
    """Evaluate ``model`` on the global test split and print what ``evaluate`` returns."""
    evaluation = model.evaluate(X_test, y_test)
    print(evaluation)

### Метрики

In [None]:
prefix = "fit_history/"

# Reload each stored run, plot its training curves and evaluate it on the test split.
# NOTE(review): only runs 1-3 are visited although a model_4 is saved above —
# confirm that this is intentional.
for i in range(3):
    run_number = i + 1
    history_path = f"{prefix}hist_{run_number}.npy"
    model_path = f"{prefix}model_{run_number}.keras"

    history = np.load(history_path, allow_pickle=True).item()
    model = keras.models.load_model(model_path)
    plot_training_history(history.history)
    print(model_path)
    check_model(model)

### Тесты

In [None]:
import pandas as pd 
from ast import literal_eval

from nltk import FreqDist


# Reload the processed dataset, parsing the Tags column back into lists.
data = pd.read_csv(
    "processed_dataset/processed_questions.csv",
    encoding="ISO-8859-1",
    converters={"Tags": literal_eval},
)

# Frequency distribution over all tag occurrences.
flat_series = data['Tags'].explode()
keywords = FreqDist(flat_series)

In [None]:
# Print the most frequent tags with their counts, five per row.
# The ranking is computed once (it used to be recomputed for every row) and the
# number of rows follows from its length instead of a hard-coded 20.
TAGS_PER_ROW = 5
top_tags = keywords.most_common(COMMON_TAGS_COUNT)

for row_start in range(0, len(top_tags), TAGS_PER_ROW):
    for tag, count in top_tags[row_start:row_start + TAGS_PER_ROW]:
        print(f"{tag}: {count}", end="\t\t")
    print()

In [None]:
from tensorflow import keras
import joblib
from scipy.sparse import hstack


# Load the fitted Body vectorizer, Title vectorizer and tag binarizer, in that order.
vectorizer_X1, vectorizer_X2, multilabel_binarizer = map(
    joblib.load,
    (
        "vectorizers/BodyVectorizer.pickle",
        "vectorizers/TitleVectorizer.pickle",
        "vectorizers/TagsVectorizer.pickle",
    ),
)

In [None]:
# Sample tasks for manual testing. TITLE and BODY are reassigned by each pair
# below, so only the LAST pair is in effect when the prediction cell runs.

# Sample 1 (overwritten below)
TITLE = """Make some buttons on frontend"""
BODY = """    
Create a button, which will use jQuery javascript script
"""

# Sample 2 (overwritten below)
TITLE = """Fix user profile page"""
BODY = """    
Rewrite our python untyped backend view, which calculate user bonus amount, maybe fix celery, maybe it broke crontab 
"""

# Sample 3 (the one that is actually used)
TITLE = """Refactor Backend Integration for Data Export"""
BODY = """
The current CSV export in PyQt5 blocks the UI thread.  
- Move export logic to a QThread worker.  
- Replace string concatenation with pandas DataFrame for CSV generation.  
- Add error handling for invalid data (show QMessageBox on failure).  
- Allow cancellation via a "Stop Export" button.  """

# TITLE = """Optimize Slow Customer Orders Query"""
# BODY = """
# The query fetching customer orders (JOIN on `customers`, `orders`, `products`) takes 15+ seconds.  
# - Analyze the execution plan with `EXPLAIN ANALYZE`.  
# - Add missing indexes (suggest candidates: `orders.customer_id`, `products.sku`).  
# - Rewrite the query to avoid correlated subqueries.  
# - Partition the `orders` table by `order_date` (YYYY-MM).  
# - Validate speed improvement (target: <1s).  
# """

# TITLE = """Dynamically added jQuery elements not triggering click events"""
# BODY = """I'm using jQuery to add new buttons to a div with append(), but the click events don't work on the new elements. My code:
# javascript
# Copy

# $('#container').append('<button class='btn'>Click me</button>');  
# $('.btn').on('click', () => alert('Button clicked'));  

# # Events work on initial buttons but not dynamically added ones. How can I fix this?"""

# TITLE = """RecyclerView not updating after adding new items in Android"""
# BODY = """
# I have a RecyclerView adapter that updates a list of data. After calling adapter.add(newItem) and adapter.notifyDataSetChanged(), the UI doesn't refresh. 
# My code uses ListAdapter with DiffUtil. What am I missing? Is there a threading issue?
# """

# TITLE = """Fix Cross-Platform Layout Issues in Flutter App"""  
# BODY = """  
# UI renders differently on iOS/Android devices (text overflow, alignment).  
# - Replace hardcoded sizes with MediaQuery-based layout.  
# - Implement platform-aware ThemeData (Cupertino/Material).  
# - Add golden tests for critical screens.  
# - Use Flex widgets instead of Row/Column nesting.  
# - Verify font scaling (1.0-2.0) accessibility requirements.  
# """


# TITLE = """Secure API Endpoints Against SQL Injection"""  
# BODY = """  
# Raw SQL queries in ASP.NET Core 6 controllers are vulnerable to injection.  
# - Replace string concatenation with Entity Framework parameterization.  
# - Add Dapper's `DynamicParameters` for complex queries.  
# - Implement regex filter for suspicious characters in request params.  
# - Create automated tests with SQLMAP test cases.  
# - Add rate limiting (max 5 req/sec) to brute-force endpoints.  
# """

In [None]:
import matplotlib.pyplot as plt

# Probability above which a tag is reported.
PREDICTION_THRESHOLD = 0.20

# vectorizer_X1 is the Body vectorizer and vectorizer_X2 the Title vectorizer
# (see the fitting and loading cells), so BODY goes to X1 and TITLE to X2.
# The two inputs were previously swapped. Both texts also pass through
# preprocess_text, the same pipeline that produced the training data.
X1_new = vectorizer_X1.transform([preprocess_text(BODY)])
X2_new = vectorizer_X2.transform([preprocess_text(TITLE)])

# Concatenate in training order (Body, Title) and convert to a dense array.
# Expected shape is (1, 2000), assuming both vectorizers kept 1000 features.
X_input = hstack([X1_new, X2_new]).toarray()

# Load the model
model = keras.models.load_model("fit_history/model_3.keras")

probas = model.predict(X_input)
labels = multilabel_binarizer.inverse_transform(
    (probas > PREDICTION_THRESHOLD).astype(int)
)

print("Predicted tags:", labels)

# Predicted probability per tag index.
plt.plot(probas[0])
