In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split




In [2]:

df = pd.read_csv('srpski.csv', sep='\t')

df = df.drop(columns=['Rbr', 'SR', 'sr/sr', 'Naslov', 'Jezik'])
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].str.lower()


In [3]:

max_text_length = df['Tekst'].apply(lambda x: len(str(x).split())).max()

X_train, X_test, y_train, y_test = train_test_split(df['Tekst'], df['Autor'], test_size=0.2, random_state=42, stratify=df['Autor'])


In [4]:
early_stopping = EarlyStopping(
    patience=3,  
    monitor='accuracy',  
    restore_best_weights=True  
)


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X = pad_sequences(sequences, maxlen=max_text_length)


In [6]:

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_text_length))
model.add(LSTM(100))
model.add(Dense(len(set(df['Autor']))+1, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.fit(X, y_train, epochs=10, validation_split=0.2, callbacks=early_stopping)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x291c69c70>

In [7]:

new_sequences = tokenizer.texts_to_sequences(X_test)
new_X = pad_sequences(new_sequences, maxlen=max_text_length)

predictions_train = model.predict(X)
predictions_test = model.predict(new_X)

evaluation_train = model.evaluate(X, y_train)
evaluation_test = model.evaluate(new_X, y_test)



In [8]:
evaluation_train[1]

0.2454545497894287

In [9]:
evaluation_test[1]

0.0714285746216774