In [18]:
import urllib.request
from lxml import html
import os
import shutil
import random
import spacy
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sentence_transformers import models, SentenceTransformer
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [None]:
def get_text_from_article(url, xpath):
    response = urllib.request.urlopen(url)
    page_content = response.read()
    response.close()
    tree = html.fromstring(page_content)
    text = tree.xpath(xpath)
    return " ".join(text).strip()

def get_wp_articles_urls(url):
    response = urllib.request.urlopen(url)
    page_content = response.read()
    response.close()
    tree = html.fromstring(page_content)
    xpath = "//div[@class='akWTRfRe a3brEpa-']//a"
    links = tree.xpath(xpath)
    counter = 0
    urls = []
    for link in links:
        if counter % 2 == 0:
            urls.append(link.get('href'))
        counter+=1
    return urls
            
def create_files(urls, xpath, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    counter = 1
    for url in urls:
        text = get_text_from_article(url, xpath)
        file_name = os.path.join(folder_name,str(counter) + '.txt')
        counter+=1
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)

def get_text_from_wp(url, xpath, folder_name):
    try:
        urls = get_wp_articles_urls(url)
        page = 2
        while True:
            new_url = url + '/' + str(page)
            page+=1
            for url1 in get_wp_articles_urls(new_url):
                urls.append(url1)
    except Exception as e:
        create_files(urls, xpath, folder_name)

def get_bi_articles_urls(url):
    response = urllib.request.urlopen(url)
    page_content = response.read()
    response.close()
    tree = html.fromstring(page_content)
    xpath = "//main//div[@class='stream-list']//a"
    links = tree.xpath(xpath)
    urls = []
    for link in links:
        urls.append(link.get("href"))
    return urls

def get_text_from_bi(url, xpath, folder_name):
    old_size = 0
    urls = get_bi_articles_urls(url)
    new_size = len(urls)
    page = 2
    while old_size != new_size:
        new_url = url + '?page=' + str(page)
        page+=1
        old_size = new_size
        for url1 in get_bi_articles_urls(new_url):
            urls.append(url1)
        new_size = len(urls)
    create_files(urls, xpath, folder_name)

xpath_wp = '//article//h1//text() | //article//li//text() | //article//p//text() | //article//h2//text()'
xpath_bi = "//p[@class='article_p']//text()"
url_przestepstwa = 'https://wiadomosci.wp.pl/tag/przest%C4%99pstwa'
url_katastrofa = 'https://wiadomosci.wp.pl/tag/katastrofa'
url_biznes = 'https://businessinsider.com.pl/biznes'
get_text_from_wp(url_przestepstwa, xpath_wp, 'przestępstwo')
get_text_from_wp(url_katastrofa, xpath_wp, 'katastrofa')
get_text_from_bi(url_biznes, xpath_bi, 'biznes')

In [None]:
nlp = spacy.load('pl_core_news_lg')


def split_text_into_sentences(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]

def create_files(category, ratio):
    train_counter = 1
    test_counter = 1
    for filename in os.listdir(category):
        with open(category + '/' + filename, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences = split_text_into_sentences(text)
            for sentence in sentences:
                if random.randint(0, 9) > ratio:
                    path = 'dataset/train/' + category + '/' + str(train_counter) + '.txt'
                    train_counter+=1
                else:
                    path = 'dataset/test/' + category + '/' + str(test_counter) + '.txt'
                    test_counter+=1
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(sentence)
                

def divide_into_test_and_trening(test_size=0.2):
    ratio = test_size * 10
    categories = ['biznes', 'katastrofa', 'przestępstwo']
    try:
        shutil.rmtree('dataset')
    except Exception as e:
        pass
    for category in categories:
        os.makedirs('dataset/test/' + category, exist_ok=True)
        os.makedirs('dataset/train/' + category, exist_ok=True)
        create_files(category, ratio)

divide_into_test_and_trening()

In [2]:

def load_data_from_folder(data_dir, label_map=None):
    texts = []
    labels = []

    if label_map is None:
        label_map = {
            d: idx
            for idx, d in enumerate(
                sorted([
                    x for x in os.listdir(data_dir)
                    if os.path.isdir(os.path.join(data_dir, x))
                ])
            )
        }

    for label, label_id in label_map.items():
        label_path = os.path.join(data_dir, label)
        if not os.path.isdir(label_path):
            continue

        for txt_file in os.listdir(label_path):
            txt_path = os.path.join(label_path, txt_file)
            if not os.path.isfile(txt_path):
                continue

            with open(txt_path, "r", encoding="utf-8") as f:
                texts.append(f.read())
                labels.append(label_id)

    return texts, labels, label_map



train_texts, train_labels, label_map = load_data_from_folder('dataset/train')
test_texts, test_labels, _ = load_data_from_folder('dataset/test')



In [35]:
device = "cuda" if torch.cuda.is_available() else "cpu"

sbert = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=device
)

with torch.no_grad():
    X_train = sbert.encode(
        train_texts,
        convert_to_tensor=True,
        batch_size=32,
        show_progress_bar=True
    )
    X_test = sbert.encode(
        test_texts,
        convert_to_tensor=True,
        batch_size=32,
        show_progress_bar=True
    )
    

Batches:   0%|          | 0/2080 [00:00<?, ?it/s]

In [37]:
class Classifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.model(x)

y_train = torch.tensor(train_labels, dtype=torch.long)
y_test  = torch.tensor(test_labels, dtype=torch.long)

X_train = X_train.clone().detach()
X_test  = X_test.clone().detach()

X_train = X_train.to(device)
X_test  = X_test.to(device)
y_train = y_train.to(device)
y_test  = y_test.to(device)

model = Classifier(
    input_dim=X_train.shape[1],
    num_classes=len(label_map)
).to(device)

#weights = np.array([0.89, 0.65, 4.0])

#weights = torch.tensor(weights, dtype=torch.float).to(device)
#criterion = nn.CrossEntropyLoss(weight=weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
    
for epoch in range(30):
    model.train()

    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")


model.eval()
with torch.no_grad():
    logits = model(X_test)
    preds = torch.argmax(logits, dim=1).cpu().numpy()

print("\nClassification report:")
print(classification_report(test_labels, preds))

print("Confusion matrix:")
print(confusion_matrix(test_labels, preds))


Epoch 1 | Loss: 1.0790
Epoch 2 | Loss: 1.0613
Epoch 3 | Loss: 1.0437
Epoch 4 | Loss: 1.0260
Epoch 5 | Loss: 1.0079
Epoch 6 | Loss: 0.9888
Epoch 7 | Loss: 0.9688
Epoch 8 | Loss: 0.9474
Epoch 9 | Loss: 0.9246
Epoch 10 | Loss: 0.9001
Epoch 11 | Loss: 0.8743
Epoch 12 | Loss: 0.8469
Epoch 13 | Loss: 0.8181
Epoch 14 | Loss: 0.7881
Epoch 15 | Loss: 0.7575
Epoch 16 | Loss: 0.7260
Epoch 17 | Loss: 0.6946
Epoch 18 | Loss: 0.6633
Epoch 19 | Loss: 0.6329
Epoch 20 | Loss: 0.6031
Epoch 21 | Loss: 0.5746
Epoch 22 | Loss: 0.5475
Epoch 23 | Loss: 0.5217
Epoch 24 | Loss: 0.4972
Epoch 25 | Loss: 0.4739
Epoch 26 | Loss: 0.4525
Epoch 27 | Loss: 0.4321
Epoch 28 | Loss: 0.4135
Epoch 29 | Loss: 0.3970
Epoch 30 | Loss: 0.3822

Classification report:
              precision    recall  f1-score   support

           0       0.89      0.81      0.85     10905
           1       0.87      0.94      0.90     17145
           2       0.00      0.00      0.00       351

    accuracy                           0.88    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
