In [None]:
from google.colab import drive
# Mount Google Drive so the files under /content/drive/ become accessible
drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

Mounted at /content/drive/


In [None]:
#importovanie potrebných knižníc
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing
import numpy as np
import json

# Settings shared by the tokenizer, the padding step and the embedding layer
max_words = 20_000     # maximum number of words kept in the vocabulary
max_length = 100       # maximum length of a text, in words
embedding_dim = 128    # dimension of the embedding layer

#funkcia na načítanie dát z JSONL súboru
def load_data_from_jsonl(jsonl_path):
    """Load texts and labels from a JSON Lines file.

    Every non-blank line is parsed as one JSON object and must provide the
    keys "text" and "label".

    Args:
        jsonl_path: Path of the .jsonl file to read.

    Returns:
        A tuple ``(texts, labels)`` of two lists, both in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" or "label" key.
    """
    texts = []
    labels = []
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale, which can garble or reject non-ASCII text.
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
            if not line.strip():
                continue
            data = json.loads(line)
            texts.append(data["text"])
            labels.append(data["label"])
    return texts, labels

# Locations of the training and test splits on the mounted Drive
train_jsonl_path = "/content/drive/MyDrive/data/train.jsonl"
test_jsonl_path = "/content/drive/MyDrive/data/test_seen.jsonl"

# Read the raw texts and labels of both splits
train_texts, train_labels = load_data_from_jsonl(train_jsonl_path)
test_texts, test_labels = load_data_from_jsonl(test_jsonl_path)

# Fit the tokenizer on the training texts only; the test split reuses that vocabulary
tokenizer = preprocessing.text.Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Turn each split into integer sequences
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, test_texts)
)

# Pad every sequence to the same length (max_length), padding at the end
x_train, x_test = (
    preprocessing.sequence.pad_sequences(split_sequences, maxlen=max_length, padding='post')
    for split_sequences in (train_sequences, test_sequences)
)

# Labels as numpy arrays
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Sanity-check output: sample counts and array shapes
print(f"Počet tréningových vzoriek: {len(x_train)}")
print(f"Počet testovacích vzoriek: {len(x_test)}")
print(f"Tvar tréningových dát: {x_train.shape}, Tvar štítkov: {y_train.shape}")
print(f"Tvar testovacích dát: {x_test.shape}, Tvar štítkov: {y_test.shape}")


Počet tréningových vzoriek: 8500
Počet testovacích vzoriek: 1000
Tvar tréningových dát: (8500, 100), Tvar štítkov: (8500,)
Tvar testovacích dát: (1000, 100), Tvar štítkov: (1000,)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np


# Model hyperparameters. The vocabulary size and the sequence length must agree
# with the preprocessing cell: the tokenizer there is built with num_words=20000
# and the inputs are padded to 100 tokens. A smaller vocabulary size here would
# let token ids fall outside the Embedding table.
max_words = 20000       # vocabulary size (Embedding input_dim), same as the tokenizer's num_words
embedding_dim = 100     # dimension of the embedding layer
max_length = 100        # input sequence length, same as the padded width of x_train / x_test


# Define the CNN-LSTM binary classifier:
#   Embedding -> Conv1D(128 filters, kernel 5) -> MaxPooling1D(2) -> LSTM(64)
#   -> Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid)
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 (it only produces a warning), so the sequence length is
# taken from the data the model is fitted on.
model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.LSTM(64),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # Single sigmoid unit: one probability per sample, paired with binary cross-entropy
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model, holding out 20% of the training data for validation
training_options = dict(validation_split=0.2, epochs=10, batch_size=16, verbose=1)
history = model.fit(x_train, y_train, **training_options)

# Evaluate on the test data: sigmoid outputs above 0.5 count as the positive class
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Compute the metrics in the order accuracy, precision, recall, F1
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n Výsledky testovania:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")


In [None]:
# Re-run the evaluation of the current `model` on the test data;
# sigmoid outputs above 0.5 count as the positive class
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Compute the metrics in the order accuracy, precision, recall, F1
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n Výsledky testovania:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step

 Výsledky testovania:
Accuracy:  0.5660
Precision: 0.6138
Recall:    0.3082
F1 Score:  0.4103


In [None]:
#zmena hyperparametrov pre dosiahnutie vyššej presnosti

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing
import numpy as np
import json
import re

# Settings shared by the tokenizer, the padding step and the embedding layer
max_words = 20_000     # maximum number of words kept in the vocabulary
max_length = 100       # maximum length of a text, in words
embedding_dim = 128    # dimension of the embedding layer

#funkcia na načítanie dát z JSONL súboru
def load_data_from_jsonl(jsonl_path):
    """Load texts and labels from a JSON Lines file.

    Every non-blank line is parsed as one JSON object and must provide the
    keys "text" and "label".

    Args:
        jsonl_path: Path of the .jsonl file to read.

    Returns:
        A tuple ``(texts, labels)`` of two lists, both in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" or "label" key.
    """
    texts = []
    labels = []
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale, which can garble or reject non-ASCII text.
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
            if not line.strip():
                continue
            data = json.loads(line)
            texts.append(data["text"])
            labels.append(data["label"])
    return texts, labels

#funkcia na normalizáciu textu, zmena na malé písmená, odstránenie interpunkcie, odstránenie nadbytočných medzier
def normalize_text(text):
    """Return a normalized copy of `text`.

    The text is lower-cased, every character that is not in a-z, á-ž, 0-9 or
    whitespace is removed, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace stripped.
    """
    lowered = text.lower()
    # Drop every character outside the allowed set (removes punctuation)
    allowed_only = re.sub(r'[^a-zá-ž0-9\s]', '', lowered)
    # Collapse whitespace runs, then trim both ends
    return re.sub(r'\s+', ' ', allowed_only).strip()

# Locations of the training and test splits on the mounted Drive
train_jsonl_path = "/content/drive/MyDrive/data/train.jsonl"
test_jsonl_path = "/content/drive/MyDrive/data/test_seen.jsonl"

# Read the raw texts and labels of both splits
train_texts, train_labels = load_data_from_jsonl(train_jsonl_path)
test_texts, test_labels = load_data_from_jsonl(test_jsonl_path)

# Normalize every text before tokenization
train_texts = list(map(normalize_text, train_texts))
test_texts = list(map(normalize_text, test_texts))

# Fit the tokenizer on the training texts only; the test split reuses that vocabulary
tokenizer = preprocessing.text.Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Turn each split into integer sequences
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, test_texts)
)

# Pad every sequence to the same length (max_length), padding at the end
x_train, x_test = (
    preprocessing.sequence.pad_sequences(split_sequences, maxlen=max_length, padding='post')
    for split_sequences in (train_sequences, test_sequences)
)

# Labels as numpy arrays
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Sanity-check output: sample counts and array shapes
print(f"Počet tréningových vzoriek: {len(x_train)}")
print(f"Počet testovacích vzoriek: {len(x_test)}")
print(f"Tvar tréningových dát: {x_train.shape}, Tvar štítkov: {y_train.shape}")
print(f"Tvar testovacích dát: {x_test.shape}, Tvar štítkov: {y_test.shape}")


Počet tréningových vzoriek: 8500
Počet testovacích vzoriek: 1000
Tvar tréningových dát: (8500, 100), Tvar štítkov: (8500,)
Tvar testovacích dát: (1000, 100), Tvar štítkov: (1000,)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the CNN-LSTM binary classifier:
#   Embedding -> Conv1D(128 filters, kernel 5) -> MaxPooling1D(2) -> LSTM(64)
#   -> Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid)
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 (it only produces a warning), so the sequence length is
# taken from the data the model is fitted on.
model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.LSTM(64),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # Single sigmoid unit: one probability per sample, paired with binary cross-entropy
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model, holding out 20% of the training data for validation
training_options = dict(validation_split=0.2, epochs=10, batch_size=16, verbose=1)
history = model.fit(x_train, y_train, **training_options)

# Evaluate on the test data: sigmoid outputs above 0.5 count as the positive class
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Compute the metrics in the order accuracy, precision, recall, F1
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n Výsledky testovania:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")




Epoch 1/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.6596 - loss: 0.6486 - val_accuracy: 0.5882 - val_loss: 0.6790
Epoch 2/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.6571 - loss: 0.6503 - val_accuracy: 0.5882 - val_loss: 0.6840
Epoch 3/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.6643 - loss: 0.6410 - val_accuracy: 0.5882 - val_loss: 0.6810
Epoch 4/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6689 - loss: 0.6385 - val_accuracy: 0.5882 - val_loss: 0.6799
Epoch 5/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.6579 - loss: 0.6450 - val_accuracy: 0.5882 - val_loss: 0.6878
Epoch 6/10
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.6491 - loss: 0.6518 - val_accuracy: 0.5882 - val_loss: 0.6901
Epoch 7/10
[1m425/425

In [None]:
# Persist the current model to Google Drive as an HDF5 (.h5) file
model_save_path = "/content/drive/MyDrive/cnn_lstm_model.h5"
model.save(model_save_path)





In [None]:
# Show the architecture summary of the current `model`
model.summary()
