In [412]:
import kagglehub
import transformers
import json
import pandas as pd
import arxiv
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import warnings
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [413]:
# Top-level arXiv archive prefixes that parse_terms treats as valid tags.
term_types = [
    'cs', 'econ', 'eess', 'math', 'astro-ph', 'cond-mat', 'gr-qc',
    'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'nlin',
    'nucl-ex', 'nucl-th', 'physics', 'quant-ph', 'q-bio', 'q-fin', 'stat',
]
# Per-prefix occurrence counter, incremented by parse_terms; every prefix starts at zero.
term_dict = dict.fromkeys(term_types, 0)

In [414]:
def parse_terms(input_series, known_types=None, counts=None):
    """Extract the first recognised arXiv category from each raw tag string.

    Each item is presumably the string form of a list of dicts that carry a
    ``'term'`` key (e.g. ``"[{'term': 'cs.AI', 'scheme': ..., 'label': None}]"``)
    -- confirm against the dataset. The string is stripped of dict punctuation
    and split on the literal word ``term``; the text up to the next comma is
    taken as one category.

    A category is recognised when its archive -- the part before the first
    ``.`` -- is one of ``known_types``. Matching on the archive rather than on
    an arbitrary substring keeps ``physics.optics`` from being counted as
    ``cs`` and ``math-ph`` from being counted as ``math``.

    Args:
        input_series: iterable of raw tag strings.
        known_types: accepted archive prefixes; defaults to the notebook-level
            ``term_types`` list.
        counts: dict updated in place with the number of recognised terms per
            archive; defaults to the notebook-level ``term_dict``.

    Returns:
        A list aligned with ``input_series`` holding the first recognised
        category of each item, or ``None`` when the item has none.
    """
    if known_types is None:
        known_types = term_types
    if counts is None:
        counts = term_dict

    accepted_archives = set(known_types)
    # Characters that are only dict/list punctuation in the raw string.
    punctuation = str.maketrans('', '', '{}\'":')

    result = []
    for item in input_series:
        # Missing values (None / NaN) cannot contain a category.
        if not isinstance(item, str):
            result.append(None)
            continue

        clean_string = item.translate(punctuation).replace(", ", ",")

        terms_list = []
        # Everything before the first 'term' is leading punctuation, hence [1:].
        for chunk in clean_string.split('term')[1:]:
            term = chunk.split(',')[0].strip()
            archive = term.split('.', 1)[0]
            if archive in accepted_archives:
                counts[archive] = counts.get(archive, 0) + 1
                terms_list.append(term)

        result.append(terms_list[0] if terms_list else None)
    return result

def make_text(title, summary):
    """Build one model input string per paper from paired titles and summaries.

    The two iterables are walked in lockstep (extra items in the longer one are
    ignored by ``zip``), and each pair is rendered as a ``Title:`` line followed
    by a ``Summary:`` line.
    """
    return [
        f'Title: {heading}\nSummary: {abstract}'
        for heading, abstract in zip(title, summary)
    ]

In [415]:
# Build the modelling frame in one chain: load, reduce each paper to a single
# recognised tag, drop tags that occur only once (the split below stratifies on
# `tag`), and merge title + summary into the `text` column.
# NOTE(review): rows whose tag is None presumably disappear at the groupby step
# as well, since pandas skips missing group keys by default -- confirm.
df = (
    pd.read_json('arxivData.json')
    .loc[:, ['summary', 'tag', 'title']]
    .assign(tag=lambda frame: parse_terms(frame['tag']))
    .groupby('tag')
    .filter(lambda group: len(group) > 1)
    .assign(text=lambda frame: make_text(frame['title'], frame['summary']))
    .loc[:, ['text', 'tag']]
)

In [416]:
# Show the prepared frame (columns `text` and `tag`) as the cell output.
df

Unnamed: 0,text,tag
0,Title: Dual Recurrent Attention Units for Visu...,cs.AI
1,Title: Sequential Short-Text Classification wi...,cs.CL
2,Title: Multiresolution Recurrent Neural Networ...,cs.CL
3,Title: Learning what to share between loosely ...,stat.ML
4,Title: A Deep Reinforcement Learning Chatbot\n...,cs.CL
...,...,...
40995,Title: Nearly Tight Bounds on $\ell_1$ Approxi...,cs.LG
40996,Title: Concurrent bandits and cognitive radio ...,cs.LG
40997,Title: A Comparison of Clustering and Missing ...,math.NA
40998,Title: Applying machine learning to the proble...,cs.SC


In [417]:
# Hold out 30% of the rows as the test set; `stratify` is given the tag column
# and `random_state` fixes the shuffle.
# NOTE(review): the name `train` is rebound to a function by the later
# `def train(...)` cell, so the datasets must be built from this frame before
# that cell runs.
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df['tag'])

In [418]:
# Class ids follow the order in which tags first appear in df.
# inv_label_map: id -> tag; label_map: tag -> id (its exact inverse).
inv_label_map = dict(enumerate(df['tag'].unique()))
label_map = {tag: class_id for class_id, tag in inv_label_map.items()}

In [419]:
# Pretrained encoder to fine-tune; the classification head is freshly
# initialised with one output per tag.
model_name = 'answerdotai/ModernBERT-base'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    # Store the tag names in the model config so a checkpoint written with
    # save_pretrained carries the id <-> tag mapping instead of generic
    # LABEL_<i> names.
    id2label=inv_label_map,
    label2id=label_map,
    ignore_mismatched_sizes=True
)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [420]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one paper text per item.

    Args:
        df: frame with a ``text`` column (model input) and a ``tag`` column
            (class name).
        tokenizer: callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` that returns
            a mapping with ``input_ids`` and ``attention_mask``.
        max_length: every item is padded/truncated to this many tokens.
        label_to_id: optional tag -> class-id mapping; when omitted the
            notebook-level ``label_map`` is used.

    Raises:
        ValueError: if some tag in ``df`` has no entry in the mapping.
    """

    def __init__(self, df, tokenizer, max_length=128, label_to_id=None):
        mapping = label_map if label_to_id is None else label_to_id

        encoded = df['tag'].map(mapping)
        missing = encoded.isna()
        # Fail early: an unmapped tag would otherwise become a NaN label and
        # only surface as an obscure error inside the training loop.
        if missing.any():
            unknown = sorted(set(df.loc[missing, 'tag'].astype(str)))
            raise ValueError(f"tags without a label id: {unknown}")

        self.texts = df['text'].tolist()
        self.labels = encoded.astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # Drop only the leading batch axis added by return_tensors='pt';
            # a bare squeeze() would also remove the sequence axis when
            # max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [421]:
# Single source of truth for the mini-batch size of both loaders.
batch_size = 8

train_dataset = TextDataset(train, tokenizer)
test_dataset = TextDataset(test, tokenizer)

# Only the training loader shuffles; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [422]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch times the number of epochs.
epochs = 10
total_steps = epochs * len(train_loader)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [423]:
def train(model, train_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: module whose forward call accepts the batch as keyword
            arguments and returns an object exposing ``.loss``.
        train_loader: iterable of dict batches (name -> tensor) that also
            supports ``len()``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: device every batch tensor is moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    total_loss = 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients *before* the backward pass: if an earlier run was
        # interrupted between backward() and zero_grad(), the leftovers would
        # otherwise be added to the first update of this run.
        optimizer.zero_grad()

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate(model, test_loader, device):
    """Score ``model`` on ``test_loader`` without tracking gradients.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the number of argmax predictions
        equal to ``batch['labels']`` divided by ``len(test_loader.dataset)``.
    """
    model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for raw_batch in test_loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            result = model(**on_device)

            batch_losses.append(result.loss.item())

            # Predicted class = index of the largest logit in each row.
            predicted = result.logits.argmax(dim=1)
            n_correct += predicted.eq(on_device['labels']).sum().item()

    mean_loss = sum(batch_losses) / len(test_loader)
    return mean_loss, n_correct / len(test_loader.dataset)

In [424]:
# Track the epoch with the lowest test loss. With the settings above the
# training loss keeps falling while the test loss starts rising after the
# second epoch, so the last-epoch weights are not the ones worth keeping.
# NOTE(review): selection uses the test set because no separate validation
# split exists; metrics reported for the restored weights are therefore
# optimistic.
best_test_loss = float('inf')
best_epoch = None
best_state = None

for epoch in range(1, epochs + 1):
    train_loss = train(model, train_loader, optimizer, scheduler, device)

    test_loss, test_accuracy = evaluate(model, test_loader, device)

    print(f"Epoch {epoch}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("-"*40)

    if test_loss < best_test_loss:
        best_test_loss, best_epoch = test_loss, epoch
        # Clone to CPU: state_dict() tensors share storage with the live
        # parameters and would be overwritten by the next epoch.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Put the best weights back so the model used (and saved) afterwards is the
# best epoch rather than the final one.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from epoch {best_epoch} (test loss {best_test_loss:.4f})")

Epoch 1/10
Train Loss: 1.4090
Test Loss: 1.2254
Test Accuracy: 0.6350
----------------------------------------
Epoch 2/10
Train Loss: 1.0587
Test Loss: 1.1388
Test Accuracy: 0.6535
----------------------------------------
Epoch 3/10
Train Loss: 0.8021
Test Loss: 1.1644
Test Accuracy: 0.6576
----------------------------------------
Epoch 4/10
Train Loss: 0.4809
Test Loss: 1.3751
Test Accuracy: 0.6275
----------------------------------------
Epoch 5/10
Train Loss: 0.1847
Test Loss: 1.5100
Test Accuracy: 0.6415
----------------------------------------
Epoch 6/10
Train Loss: 0.0450
Test Loss: 1.6904
Test Accuracy: 0.6373
----------------------------------------
Epoch 7/10
Train Loss: 0.0127
Test Loss: 1.8795
Test Accuracy: 0.6472
----------------------------------------
Epoch 8/10
Train Loss: 0.0050
Test Loss: 1.9972
Test Accuracy: 0.6429
----------------------------------------
Epoch 9/10
Train Loss: 0.0035
Test Loss: 2.0795
Test Accuracy: 0.6447
----------------------------------------
E

In [425]:
# Persist the class-id -> tag mapping next to the checkpoint: the Streamlit
# app looks labels up in dictionary.json by the stringified class id, and
# json.dump turns the integer keys of inv_label_map into exactly those strings.
with open('dictionary.json', 'w', encoding='utf-8') as file:
    json.dump(inv_label_map, file, ensure_ascii=False, indent=2)

# NOTE(review): the directory name says "deberta" although the model is
# ModernBERT; it is kept because the app loads the checkpoint from this path.
model.save_pretrained('10_epochs_model_deberta')
tokenizer.save_pretrained('10_epochs_model_deberta')

('10_epochs_model_deberta/tokenizer_config.json',
 '10_epochs_model_deberta/special_tokens_map.json',
 '10_epochs_model_deberta/tokenizer.json')

In [426]:
# def normal_tags(tags):
#     new_tags = []
#     for tag in tags:
#         for term_type in term_types:
#             if term_type in tag:
#                 new_tags.append(tag)
# all_data = []
# for _ in range(4000):
#     client = arxiv.Client()
#     search = arxiv.Search(query = "submittedDate:[201901010600 TO 202501010600]", max_results=100)
#     all_data = []
#     for r in client.results(search):
#         try:
#             if r:
#                 summary = r.summary
#                 tag = normal_tags(r.categories)
#                 title = r.title
#                 if title and summary and tag:
#                     data = {
#                         'summary': summary,
#                         'tag': tag,
#                         'title': title
#                     }
#                     all_data.append(data)
#         except Exception:
#             continue


# df_new = pd.DataFrame(all_data)
# df_new.to_csv('new_df.csv', index=False)

---
Код приложения
---

In [None]:
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
import torch
import json

@st.cache_resource  # caching
def load_model():
    """Load the fine-tuned sequence-classification model from its checkpoint directory."""
    checkpoint_dir = 'YSDA/2-ML/HF_tune_Transformers/10_epochs_model_deberta'
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    return classifier

@st.cache_resource  # caching
def load_tokenizer():
    """Load the tokenizer stored in the same directory as the fine-tuned model."""
    checkpoint_dir = 'YSDA/2-ML/HF_tune_Transformers/10_epochs_model_deberta'
    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    return text_tokenizer

@st.cache_resource  # caching
def load_dict():
    """Read the label dictionary from dictionary.json and return the parsed object."""
    dictionary_path = 'YSDA/2-ML/HF_tune_Transformers/dictionary.json'
    with open(dictionary_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Cached artefacts: fine-tuned classifier, its tokenizer, and the
# class-id (as string) -> tag name dictionary.
model = load_model()
tokenizer = load_tokenizer()
label_dict = load_dict()

st.title("Классификатор статей")
st.markdown("Данное веб-приложение может предсказать тематику статьи на основе ее названия и описания.\nЧтобы воспользоваться функциональностью, просто введите название и описание в поля ниже.")

query1 = st.text_input("Название статьи")
query2 = st.text_input("Описание статьи")

if query1 and query2:
    # Must reproduce the prompt used for fine-tuning ('Title: ...\nSummary: ...')
    # exactly; a different prefix or extra indentation feeds the model inputs
    # it never saw during training.
    query = f'Title: {query1}\nSummary: {query2}'

    # Same 128-token truncation as the training dataset used.
    inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=128)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Single query -> take row 0 of the (1, num_labels) probability matrix.
    probabilities = torch.softmax(logits, dim=1)[0]

    # Never ask for more entries than there are classes.
    top_n = min(5, probabilities.numel())
    top_probabilities, top_indices = torch.topk(probabilities, k=top_n)
    top_predictions = [label_dict[str(i)] for i in top_indices.tolist()]

    df = pd.DataFrame({
        'Тематика': top_predictions,
        'Вероятность': top_probabilities.tolist()
    })

    st.dataframe(df)
    on = st.toggle("Показать смешную картинку")
    if on:
        st.markdown("Для хорошего настроения проверяющего")
        st.image("YSDA/2-ML/HF_tune_Transformers/image.png")