In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from nltk import FreqDist

from tqdm.auto import tqdm
import re
import contractions
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

from bs4 import MarkupResemblesLocatorWarning
import warnings

## Подготовка датасета

### Чтение датасета

In [None]:
questions = pd.read_csv("dataset/Questions.csv", encoding="ISO-8859-1")
questions.head(5)

In [None]:
tags = pd.read_csv("dataset/Tags.csv", encoding="ISO-8859-1")
print(f"""Unique Tags count: {tags["Tag"].unique().shape[0]}""")
tags.head(5)

In [None]:
questions.info()

In [None]:
tags.info()

### Группировка и объединение тегов

In [None]:
# str.join raises on any non-string value (e.g. NaN), so cast every tag to text first.
tags["Tag"] = tags["Tag"].astype(str)

# One entry per question Id holding that question's tags joined by single spaces.
tags_by_question = tags.groupby("Id")["Tag"]
grouped_tags = tags_by_question.agg(" ".join)
print(grouped_tags.shape)
grouped_tags.head(10)

In [None]:
# Turn the Id-indexed Series into a two-column frame that can be merged on "Id".
df_grouped_tags = grouped_tags.rename("Tags").reset_index()
df_grouped_tags.columns = ["Id", "Tags"]
df_grouped_tags.head(5)

### Удаление ненужных колонок

In [None]:
questions.drop(columns=["OwnerUserId", "CreationDate", "ClosedDate", "Title"], inplace=True)
questions.head(5)

### Объединение вопросов и тегов

In [None]:
data = questions.merge(df_grouped_tags, on="Id")
data.head(10)

### Фильтрация по показателю рейтинга (Score)

#### Метрики

In [None]:
# Summary of the Score column. Every "Count (Score > N)" line is generated from
# the same threshold that is used in the comparison, so the label and the
# filter can never disagree.
score = data["Score"]
threshold_lines = "\n".join(
    f"Count (Score > {threshold}) {(score > threshold).sum()}"
    for threshold in (0, 5, 10, 20)
)

print(f"""
Minimum Score: {score.min()}
Maximum Score: {score.max()}

Total count {score.count()}
{threshold_lines}

Describe:\n{score.describe()}
""")

#### График распределения рейтинга

In [None]:
# Two stacked log-scale histograms: the full Score range on top and the
# low-score region (Score < 5) underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

histogram_specs = [
    (ax1, data["Score"], "green", "Распределение всего рейтинга (Score)"),
    (ax2, data[data["Score"] < 5]["Score"], "red", "Распределение рейтинга (Score < 5)"),
]
for axis, scores, bar_color, title in histogram_specs:
    axis.hist(scores, bins=100, color=bar_color, edgecolor="black")
    axis.set_title(title)
    axis.set_xlabel("Score")
    axis.set_ylabel("Количество вопросов")
    axis.set_yscale("log")
    axis.grid(axis="x", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

#### Удаление записей с низким показателем рейтинга и ненужных колонок

In [None]:
# Keep questions whose rating lies strictly between 5 and 2000.
score_in_range = (data["Score"] > 5) & (data["Score"] < 2000)

# drop() without inplace returns a new frame, so nothing is mutated through a
# filtered slice (which would raise SettingWithCopyWarning).
data = data[score_in_range].drop(columns=["Id", "Score"])

print(data.shape)
# info() prints its report itself and returns None, so it must not be wrapped in print().
data.info()
data.head(10)

### Подготовка заголовка и описания

#### Изначальный вид описания


In [None]:
# Show the first 200 characters of the first five bodies.
preview = data["Body"].head(5)
for idx, text in preview.items():
    print(f"=== Запись {idx} ===")
    print(f"{text[:200]}...")
    print("\n")

#### Инициализация зависимостей и настройка NLTK

In [None]:
# Register progress_apply on pandas objects.
tqdm.pandas()

# Download every NLTK resource used by the preprocessing pipeline.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Extra search location for NLTK data.
nltk.data.path.append("/home/ql/nltk_data")

# Report when the punkt tokenizer still cannot be located.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("""
            FAILED (btw)
          """)

#### Конвейер обработки

In [None]:
# Module-level resources used by preprocess_text below.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Individual ASCII punctuation characters (the hyphen '-' is not part of this set).
punct = set('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')

def preprocess_text(text: str) -> str:
    """Normalize a raw HTML question body into a space-separated token string.

    Steps:
    1. strip HTML markup;
    2. expand English contractions and lowercase the text;
    3. replace every character other than a-z and whitespace with a space;
    4. tokenize and drop stop words;
    5. lemmatize the remaining tokens as verbs.

    Args:
        text: Raw body text. Any value that is not a non-blank string
            (None, NaN, numbers, lists, ...) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or "" for empty input.
    """
    # Check the type first. pd.isna() applied to a list or array returns an
    # array whose truth value is ambiguous and raises ValueError; None and NaN
    # are not strings, so they are covered by this guard as well.
    if not isinstance(text, str) or not text.strip():
        return ""

    # BeautifulSoup warns when plain text looks like a file name or URL.
    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
    text = BeautifulSoup(text, 'html.parser').get_text()

    text = contractions.fix(text).lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Only a-z and single spaces remain at this point, so no punctuation
    # tokens can occur and a separate punctuation filter is unnecessary.
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(text)
        if token not in stop_words
    )

#### Обработка

In [None]:
# Clean every question body, showing a progress bar while doing so.
tqdm.pandas(desc="Processing Body")
data["Body"] = data["Body"].progress_apply(preprocess_text)

# Drop incomplete rows, then print per-column counts of any rows that still
# miss one of the required columns.
required_columns = ["Body", "Tags"]
data = data.dropna(subset=required_columns, how="any")
for column in required_columns:
    print(data[data[column].isnull()].count())

data.info()

In [None]:
import seaborn as sns

# Number of space-separated tokens in every cleaned body.
body_len = data["Body"].apply(lambda x: x.split(" ")).apply(len)

# Reference lines are computed from the data instead of being typed in by
# hand, so the legend stays correct whenever the upstream filtering changes.
reference_lines = [
    ("Среднее", body_len.mean(), "red", "--"),
    ("Медиана", body_len.median(), "green", "--"),
    ("Квантиль 0.25", body_len.quantile(0.25), "black", "-"),
    ("Квантиль 0.75", body_len.quantile(0.75), "black", "-"),
]

plt.figure(figsize=(12, 6))
sns.histplot(body_len, bins=50, log_scale=True)
plt.title("Распределение длины текста")
plt.xlabel("Длина текста (количество слов)")
plt.ylabel("Частота")
for label, value, color, linestyle in reference_lines:
    plt.axvline(value, color=color, linestyle=linestyle, label=f"{label}: {value:.0f}")
plt.legend()
plt.show()

print(body_len.describe())

In [None]:
# Accepted body length, in space-separated tokens (both bounds inclusive).
MIN_BODY_WORDS = 33
MAX_BODY_WORDS = 110

# Count the tokens once instead of re-splitting every body for each bound.
body_word_counts = data["Body"].apply(lambda x: len(x.split(" ")))
within_window = body_word_counts.between(MIN_BODY_WORDS, MAX_BODY_WORDS)

# .copy() yields an independent frame, so the column assignment below does
# not trigger SettingWithCopyWarning.
data = data[within_window].copy()
data["Body"] = data["Body"].fillna("")

# dropna() returns a new frame; the result has to be assigned to take effect.
data = data.dropna(subset=["Body", "Tags"], how="any")
data = data[data["Body"].apply(len) != 0]

data

#### Итог

In [None]:
# Show the first 200 characters of the first five processed bodies.
preview = data["Body"].head(5)
for idx, text in preview.items():
    print(f"=== Запись {idx} ===")
    print(f"{text[:200]}...")
    print("\n")

### Определение классов

In [None]:
# Split the whitespace-joined tag string of every question into a list of tags.
data["Tags"] = data["Tags"].map(str.split)
data.head(10)

In [None]:
# One row per (question, tag) pair.
flat_series = data["Tags"].explode()

total_tags = flat_series.count()
tag_counts = flat_series.value_counts()
unique_tags_count = flat_series.nunique()

print(
    f"Total tags: {total_tags}",
    f"Unique tags: {unique_tags_count}",
    tag_counts.head(10),
    sep="\n",
)

In [None]:
# Number of most frequent tags that are kept.
COMMON_TAGS_COUNT = 33
keywords = FreqDist(flat_series)
# most_common() yields (tag, count) pairs; only the tag names are needed here.
tags_features = [tag for tag, _count in keywords.most_common(COMMON_TAGS_COUNT)]

In [None]:
# Bar chart of the most frequent tags.
fig, ax = plt.subplots(figsize=(20, 8))

labels, frequencies = zip(*keywords.most_common(COMMON_TAGS_COUNT))
bar_positions = range(len(labels))
ax.bar(bar_positions, frequencies)
ax.set_xticks(bar_positions)
ax.set_xticklabels(labels, rotation=90)

ax.yaxis.set_major_locator(ticker.MaxNLocator(20))
ax.yaxis.set_minor_locator(ticker.AutoMinorLocator(5))
ax.grid(axis='y', linestyle='--', alpha=0.7)

# The title reports the number of bars actually drawn rather than a fixed "50".
plt.title(f'{len(labels)} самых частых меток')
plt.xlabel('Метки')
plt.ylabel('Частота')
plt.show()

In [None]:
tags_features_set = set(tags_features)


def _keep_common(question_tags):
    """Return the tags of one question that belong to the common-tag set."""
    return [tag for tag in question_tags if tag in tags_features_set]


# Questions that carry at least one of the common tags.
mask = data["Tags"].apply(
    lambda question_tags: not tags_features_set.isdisjoint(question_tags)
)
filtered_data = data[mask].copy()
filtered_data["Tags"] = filtered_data["Tags"].apply(_keep_common)

tags_per_question = filtered_data["Tags"].apply(len)
data = filtered_data[tags_per_question > 0]

body_lengths = filtered_data["Body"].apply(lambda x: x.split(" ")).apply(len)

print(f"""
    Average tags in question: {tags_per_question.mean()}
    Max tags in question: {tags_per_question.max()}

    Average body length in question: {body_lengths.mean()}
    Min body length in question: {body_lengths.min()}
    Max body length in question: {body_lengths.max()}
""")

data.to_csv("processed_dataset/processed_questions.csv", index=False)

data

In [None]:
print(keywords.most_common(COMMON_TAGS_COUNT))

## Обучение модели

In [None]:
from ast import literal_eval

from tensorflow import keras
from tensorflow.keras import Model, regularizers, metrics, layers, optimizers, callbacks

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

import pickle
import pandas as pd

import tensorflow as tf
import numpy as np

import matplotlib.pyplot as plt

# tf.config.threading.set_intra_op_parallelism_threads(4)
# tf.config.threading.set_inter_op_parallelism_threads(4)

### Подготовка данных

In [None]:
# Reload the processed questions; the Tags column is stored as the text form
# of a Python list and is parsed back with literal_eval.
data = pd.read_csv(
    "processed_dataset/processed_questions.csv",
    encoding="ISO-8859-1",
    dtype={"Body": str},
    converters={"Tags": literal_eval},
)

# Replace missing bodies with "" and keep only the rows that have some text.
data["Body"] = data["Body"].fillna("")
has_text = data["Body"].str.len().fillna(0) > 0
data = data[has_text]

In [None]:
X = data['Body']
Y = data['Tags']

In [None]:
# Upper bound on the TF-IDF vocabulary size.
MAX_FEATURES_BODY = 1280

vectorizer_X = TfidfVectorizer(
    analyzer='word',
    min_df=150,
    max_df=1.0,
    encoding='utf-8',
    ngram_range=(1, 2),
    token_pattern=r"(?u)\S\S+",
    max_features=MAX_FEATURES_BODY,
)

X_tfidf = vectorizer_X.fit_transform(X)

# The fitted vocabulary may be smaller than MAX_FEATURES_BODY, so the model
# input width is taken from the vectorizer itself.
TOTAL_FEATURES = len(vectorizer_X.get_feature_names_out())

multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(Y)

# Persist both fitted transformers: the inference section loads
# "vectorizers/BodyVectorizer.pickle" and "vectorizers/TagsVectorizer.pickle".
# The context managers make sure the files are flushed and closed.
with open("vectorizers/BodyVectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer_X, vectorizer_file)
with open("vectorizers/TagsVectorizer.pickle", "wb") as binarizer_file:
    pickle.dump(multilabel_binarizer, binarizer_file)

In [None]:
TEST_SPLIT_FRACTION = 0.2

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size=TEST_SPLIT_FRACTION, random_state=0)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

### Обучение моделей

In [None]:
def plot_training_history(history):
    """Plot training/validation curves and the precision-recall trajectory.

    Draws loss, AUC, recall and precision over the epochs on a 3x2 grid and,
    in the fifth cell, training precision against training recall.

    Args:
        history: Object whose ``history`` attribute maps a metric name to its
            per-epoch values (for example the return value of ``model.fit``).
            Metrics missing from the mapping are skipped.
    """
    plt.figure(figsize=(10, 7))

    # Named metric_pairs so that the Keras `metrics` module imported at the
    # top of this section is not shadowed.
    metric_pairs = [
        ('loss', 'val_loss'),
        ('auc', 'val_auc'),
        ('recall', 'val_recall'),
        ('precision', 'val_precision'),
    ]

    for position, (metric, val_metric) in enumerate(metric_pairs, start=1):
        if metric not in history.history:
            # Leave this grid cell empty instead of failing with KeyError.
            continue

        plt.subplot(3, 2, position)
        plt.plot(history.history[metric], label=f'Training {metric}')

        if val_metric in history.history:
            plt.plot(history.history[val_metric], label=f'Validation {metric}')

        plt.title(metric.upper())
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)

    if 'recall' in history.history and 'precision' in history.history:
        plt.subplot(3, 2, 5)
        plt.plot(history.history["recall"], history.history["precision"], label="Precision / recall")
        plt.title("Precision / recall")
        plt.xlabel('recall')
        plt.ylabel("precision")
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

def check_model(model):
    """Evaluate ``model`` on the global ``X_test``/``y_test`` split and print the result."""
    evaluation = model.evaluate(X_test, y_test)
    print(evaluation)

In [None]:
COMMON_TAGS_COUNT = 33
EPOCHS_COUNT = 2

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from tensorflow.keras.losses import CategoricalFocalCrossentropy 

def balanced_class_weights(y, recall_factor=1.5, max_weight=5.0):
    """Compute a capped weight for every label column of ``y``.

    For each column the weight is
    ``sqrt(negatives / positives) * recall_factor``, limited to ``max_weight``.

    Args:
        y: 2-D binary indicator array, one column per class.
        recall_factor: Multiplier applied to the square-root imbalance ratio.
        max_weight: Upper bound for any single weight.

    Returns:
        Dict mapping the column index to its weight.
    """
    def weight_for(column):
        positives = np.sum(column)
        negatives = len(column) - positives
        # The 1e-6 term keeps the ratio finite for a column without positives.
        uncapped = (negatives / (positives + 1e-6)) ** 0.5 * recall_factor
        return min(max_weight, uncapped)

    return {idx: weight_for(y[:, idx]) for idx in range(y.shape[1])}

class_weights_dict = balanced_class_weights(y_train, recall_factor=1.4, max_weight=8.0)

def build_model():
    """Build and compile the multi-label tag classifier.

    Architecture: ``TOTAL_FEATURES`` inputs -> Dense(768) -> a
    Dense(512) -> Dense(384) branch that is added to a parallel Dense(384)
    shortcut -> Dense(256) -> ``COMMON_TAGS_COUNT`` sigmoid outputs. The
    hidden Dense layers use the swish activation with L1/L2 kernel
    regularisation; the 768/512/384 layers of the main path are each followed
    by BatchNormalization and Dropout.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = tf.keras.Input(shape=(TOTAL_FEATURES,))

    x = layers.Dense(768, activation='swish', kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.01, l2=0.01))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)

    # Shortcut projected to 384 units so it can be added to the branch output.
    residual = layers.Dense(384, activation='swish', kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.01, l2=0.01))(x)

    x = layers.Dense(512, activation='swish', kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.001, l2=0.01))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Dense(384, activation='swish', kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.001, l2=0.01))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Add()([x, residual])
    x = layers.Dense(256, activation='swish', kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l1_l2(0.001))(x)

    # One independent sigmoid per tag: a question may carry several tags.
    outputs = layers.Dense(COMMON_TAGS_COUNT, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    optimizer = optimizers.AdamW(
        learning_rate=1e-4,
    )

    model.compile(
        optimizer=optimizer,
        # Independent sigmoid outputs need a per-label binary loss. A
        # categorical focal loss only scores the entries where y_true is 1,
        # so it never penalises false positives on multi-hot targets.
        # apply_class_balancing=True is required for alpha to be used.
        loss=tf.keras.losses.BinaryFocalCrossentropy(
            apply_class_balancing=True,
            alpha=0.65,
            gamma=1.4,
            from_logits=False,
        ),
        metrics=[
            metrics.Recall(name='recall', thresholds=0.4),
            metrics.Precision(name='precision', thresholds=0.4),
            metrics.AUC(name='auc'),
            metrics.F1Score(name='f1', threshold=0.4)
        ]
    )

    return model

# First training run: build a fresh model, fit it, then persist the model
# and its history object.
model = build_model()
model.summary()

# NOTE(review): the split passed as validation_data is the same X_test/y_test
# split that check_model() evaluates on afterwards.
fit_options = dict(
    epochs=30,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict,
    batch_size=64,
)
history = model.fit(X_train, y_train, **fit_options)

model.save("fit_history/model_11.keras")
np.save("fit_history/hist_11", history)

check_model(model)
plot_training_history(history)

In [None]:
# Second training run: continue from the saved model with its first eight
# layers frozen.
model = keras.models.load_model("fit_history/model_11.keras")

for layer in model.layers[:8]:
    layer.trainable = False

# A change to `trainable` is only guaranteed to be taken into account after
# compile() is called again, so recompile with the loss the model was saved
# with and the same optimizer/metric configuration as build_model().
model.compile(
    optimizer=optimizers.AdamW(learning_rate=1e-4),
    loss=model.loss,
    metrics=[
        metrics.Recall(name='recall', thresholds=0.4),
        metrics.Precision(name='precision', thresholds=0.4),
        metrics.AUC(name='auc'),
        metrics.F1Score(name='f1', threshold=0.4),
    ],
)

history = model.fit(
    X_train, y_train,
    epochs=30,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict,
    batch_size=64,
)

model.save("fit_history/model_11_1.keras")
np.save("fit_history/hist_11_1", history)

check_model(model)
plot_training_history(history)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def load_combined_history(hist_path1, hist_path2):
    """Load two saved training histories and join them end to end.

    Each file must be a ``.npy`` file holding a single pickled object: either
    something with a ``history`` attribute or the metric dict itself.

    Args:
        hist_path1: Path of the history of the first training run.
        hist_path2: Path of the history of the follow-up run.

    Returns:
        An object whose ``history`` attribute maps every metric present in
        both runs to a flat array: the values of run one followed by the
        values of run two.
    """
    def _metric_dict(path):
        # SECURITY: allow_pickle=True unpickles the file content, which can
        # execute arbitrary code. Only load files produced by this project.
        loaded = np.load(path, allow_pickle=True).item()
        # Accept a History-like object as well as a bare metric dict.
        return getattr(loaded, "history", loaded)

    first = _metric_dict(hist_path1)
    second = _metric_dict(hist_path2)

    combined_hist = {
        key: np.concatenate((values, second[key]), axis=None)
        for key, values in first.items()
        if key in second
    }

    class CombinedHistory:
        """Minimal container exposing the ``history`` attribute."""

        def __init__(self, history):
            self.history = history

    return CombinedHistory(combined_hist)

combined_history = load_combined_history(
    'fit_history/hist_11.npy',
    'fit_history/hist_11_1.npy'
)

plot_training_history(combined_history)

## Результаты обучения

In [None]:
import pandas as pd 
from ast import literal_eval

from nltk import FreqDist


# Reload the processed questions; the Tags column is stored as the text form
# of a Python list and is parsed back with literal_eval.
data = pd.read_csv(
    "processed_dataset/processed_questions.csv",
    encoding="ISO-8859-1",
    converters={"Tags": literal_eval},
)

# One row per (question, tag) pair, then the frequency of every tag.
flat_series = data['Tags'].explode()
keywords = FreqDist(flat_series)

In [None]:
COMMON_TAGS_COUNT = 33
# Number of "tag: count" entries printed on one line.
TAGS_PER_ROW = 5

# Rank the tags once; the number of printed rows follows from the list length
# instead of a hard-coded row count.
top_tags = keywords.most_common(COMMON_TAGS_COUNT)
for row_start in range(0, len(top_tags), TAGS_PER_ROW):
    for tag, count in top_tags[row_start:row_start + TAGS_PER_ROW]:
        print(f"{tag}: {count}", end="\t\t")
    print()

In [None]:
from tensorflow import keras
import joblib
from scipy.sparse import hstack


vectorizer_X1 = joblib.load("vectorizers/BodyVectorizer.pickle")
multilabel_binarizer = joblib.load("vectorizers/TagsVectorizer.pickle")

In [None]:
import heapq
import matplotlib.pyplot as plt

def get_top_ten(arr: list[float]):
    """Return the ten largest values of ``arr`` together with their positions.

    Args:
        arr: Sequence of comparable numbers.

    Returns:
        A list of ``[index, value]`` pairs ordered from the largest value to
        the smallest; equal values keep ascending index order. Holds fewer
        than ten pairs when ``arr`` has fewer than ten elements.
    """
    # nlargest is stable for ties and does not fail on short input.
    top = heapq.nlargest(10, enumerate(arr), key=lambda pair: pair[1])
    return [[index, value] for index, value in top]

model = None

def predict(body: str, threshold: float = 0.10):
    """Print the tags predicted for a free-text description.

    Args:
        body: Text to classify.
        threshold: Minimum predicted probability for a tag to appear in the
            first printed line.

    Prints the tags whose probability exceeds ``threshold`` and the ten tags
    with the highest probability. Returns nothing.
    """
    global model
    # NOTE(review): the training bodies were cleaned with preprocess_text()
    # before vectorization, while `body` is vectorized as-is here - confirm
    # whether the same cleaning should be applied at inference time.
    X_input = vectorizer_X1.transform([body])

    # Load the trained model on first use and cache it in the module global.
    if model is None:
        model = keras.models.load_model("fit_history/model_11_1.keras")

    probas = model.predict(X_input)

    above_threshold = (probas > threshold).astype(int)
    print("Predicted tags:", multilabel_binarizer.inverse_transform(above_threshold))

    top_indices = [index for index, _ in get_top_ten(probas[0])]
    print("Predicted tags (top 10):", multilabel_binarizer.classes_[top_indices])


In [None]:
predict("""    
Create a button, which will use jQuery javascript script
""")

In [None]:
predict("""    
Rewrite our python backend view, which calculate bonus amount
""")

In [None]:
predict("""
The query fetching customer orders (JOIN on `customers`, `orders`, `products`) takes 15+ seconds.  
- Analyze the execution plan with `EXPLAIN ANALYZE`.  
- Add missing indexes (suggest candidates: `orders.customer_id`, `products.sku`).  
- Rewrite the query to avoid correlated subqueries.  
- Partition the `orders` table by `order_date` (YYYY-MM).  
- Validate speed improvement (target: <1s).  
""")

In [None]:
predict("""
Implementing a sorting algorithm for large datasets in C++ with multithreading support
""")       

In [None]:
predict("""
Using Git to resolve merge conflicts after rebasing a feature branch
""")

In [None]:
predict("""
Parsing JSON string in Android app and displaying data in RecyclerView
""")

In [None]:
predict("""
Designing a WPF UI with dynamic data binding and custom styles in Visual Studio
""")

### Без явного упоминания меток

In [None]:
predict("""
Database connection errors when fetching records for a web page
""")

In [None]:
predict("""
Application crashes after prolonged use on mobile devices
""")

In [None]:
predict("""
Version history conflicts during branch integration
""")

### Задачи

In [None]:
predict("""
Design and implement a real-time dashboard that visualizes sensor data streams with dynamic filtering capabilities. 
Ensure the solution supports 10K+ concurrent connections, provides historical data overlays, 
and maintains sub-second latency during peak loads across both desktop and mobile browsers
""")

In [None]:
predict("""
Diagnose and resolve random null reference exceptions occurring in production when users submit complex forms.
The issue manifests only after 15+ form interactions and appears correlated with multi-step validation workflows.
Provide hotfix with regression tests
""")

In [None]:
predict("""
Eliminate OWASP Top 10 vulnerabilities across all public APIs.
Implement strict input validation, rate limiting, JWT token rotation, and automated penetration testing.
Address critical CSRF findings from recent audit
""")

In [None]:
predict("""
Create CI/CD pipeline that executes static code analysis, runs test suites across multiple runtime versions,
generates deployment artifacts, and promotes builds between environments based on git branch policies
""")

In [None]:
predict("""
Migrate our legacy financial reconciliation engine from end-of-life platforms to modern infrastructure
without disrupting daily transaction processing. Re-implement custom rounding rules and currency conversion logic with atomicity guarantees.
Include zero-downtime cutover strategy and automated consistency validation across terabyte-scale historical datasets
""")