In [None]:
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the tab-separated corpus and immediately discard the columns that the
# rest of the notebook does not use.
df = pd.read_csv('srpski.csv', sep='\t').drop(
    columns=['Rbr', 'SR', 'sr/sr', 'Naslov', 'Jezik']
)

# Lower-case every object-dtype column in place; other columns are skipped.
for column in df.columns:
    if df[column].dtype != 'object':
        continue
    df[column] = df[column].str.lower()

# Bare last expression so the notebook renders the resulting frame.
df

Tokenizacija reči i rečenica


In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars

# Download the NLTK 'punkt' resource.
# NOTE(review): presumably required by the NLTK tokenizers imported above --
# confirm it is still needed, since the sentence tokenizer used in this
# notebook is constructed directly from PunktSentenceTokenizer.
nltk.download('punkt')

In [None]:
class LangVars(PunktLanguageVars):
    # Overrides the sentence-terminator set inherited from PunktLanguageVars:
    # besides '.', '!' and '?', the entries ';', ':', '...', '..' and the
    # single-character ellipsis are listed as sentence-ending.
    sent_end_chars = ('.', '!', '?', ';', ":", "...", '..','…')
# Sentence tokenizer configured with the custom terminators above.
custom_sent_tokenizer = PunktSentenceTokenizer(lang_vars=LangVars())
# Store, for every row, the result of tokenizing the 'Tekst' value into sentences.
df['Recenice'] = df['Tekst'].apply(custom_sent_tokenizer.tokenize)

In [None]:
# Token pattern, two alternatives:
#   \w+[\'\’]*\w*  -- a run of word characters, optionally followed by straight
#                     or curly apostrophes and further word characters
#   [^\w\s]        -- any single character that is neither a word character
#                     nor whitespace (emitted as its own token)
custom_tokenizer = RegexpTokenizer(r'\w+[\'\’]*\w*|[^\w\s]')
# Store, for every row, the token list produced from the 'Tekst' value.
df['Tokeni'] = df['Tekst'].apply(custom_tokenizer.tokenize)

In [None]:
# A single character that is neither a word character nor whitespace.
punctuation = r'[^\w\s]'


def remove_punctuation(tokens):
    """Return the tokens that do not start with a punctuation character.

    A token is dropped when ``punctuation`` matches at its first character;
    all other tokens are kept in their original order.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing only the tokens that were kept.
    """
    kept = []
    for token in tokens:
        # re.match anchors at the start of the token, so only the first
        # character decides whether the token is discarded.
        if re.match(punctuation, token) is None:
            kept.append(token)
    return kept

# Per row: the 'Tokeni' list with punctuation tokens removed.
df['Filtrirani tokeni'] = df['Tokeni'].apply(remove_punctuation)

In [None]:
from nltk import FreqDist

In [None]:
# Flatten the per-row filtered token lists into one corpus-wide list.
all_words = [
    token
    for row_tokens in df['Filtrirani tokeni']
    for token in row_tokens
]

# Corpus-wide frequency of every token.
fdist = FreqDist(all_words)

# A token counts as a stop word when it occurs more than 50 times, or when it
# is 1-3 characters long and occurs more than 20 times.
stopwords = [
    word
    for word, count in fdist.items()
    if count > 50 or (count > 20 and 1 <= len(word) <= 3)
]

In [None]:
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stopwords``.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Build a set once per call: membership tests against the original list
    # cost O(len(stopwords)) for every single token.
    stopword_set = set(stopwords)
    return [word for word in tokens if word not in stopword_set]

#df['Filtrirani tokeni'] = df['Filtrirani tokeni'].apply(remove_stopwords)

In [None]:
# Write the intermediate frame to disk without the index column.
# NOTE(review): 'Recenice', 'Tokeni' and 'Filtrirani tokeni' hold Python lists,
# which end up as text in the CSV -- confirm that whatever reads this file
# parses them back into lists.
df.to_csv('stilometrija_medjukorak.csv', index=False)

Podela podataka na trening i test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Inputs: the raw text plus the derived sentence/token columns.
X = df[['Tekst', 'Recenice', 'Tokeni', 'Filtrirani tokeni']]
# Target: the author label.
y = df['Autor']

# 70/30 split, stratified on the author label, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

Stilometrijske analize

In [None]:
# Re-attach the author label to each split so that features and target live
# side by side in one frame per split.
df_train = pd.concat(
    [X_train, y_train],
    axis='columns',
)
df_test = pd.concat(
    [X_test, y_test],
    axis='columns',
)


In [None]:
def average_word_length(words):
    """Return the mean length of the items in ``words``.

    Args:
        words: Sized collection of strings (the token list of one text).

    Returns:
        The summed item lengths divided by the number of items, or ``0.0``
        when ``words`` is empty.
    """
    count = len(words)
    # A text with no word tokens would otherwise raise ZeroDivisionError and
    # abort the whole DataFrame.apply call.
    if count == 0:
        return 0.0
    return sum(len(word) for word in words) / count

# Add the per-text mean word length to the training split first, then to the
# test split.
for split_frame in (df_train, df_test):
    split_frame['Duzina reci'] = split_frame['Filtrirani tokeni'].apply(average_word_length)


In [None]:
# Mean of 'Duzina reci' per author on the training split; reset_index turns
# 'Autor' back into a regular column for plotting.
df_avgWordLength = df_train.groupby('Autor')['Duzina reci'].mean().reset_index()

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Bar chart: per-author mean word length (training split).
ax = plt.gca()
ax.bar(df_avgWordLength['Autor'], df_avgWordLength['Duzina reci'])
ax.set_xlabel('Autor')
ax.set_ylabel('Dužina reči')
plt.show()

In [None]:
def average_sentence_length(sentences):
    """Return the mean number of whitespace-separated words per sentence.

    Args:
        sentences: Sized collection of sentence strings.

    Returns:
        The total word count (via ``str.split()``) divided by the number of
        sentences, or ``0.0`` when ``sentences`` is empty.
    """
    count = len(sentences)
    # A text that yields no sentences would otherwise raise ZeroDivisionError
    # and abort the whole DataFrame.apply call.
    if count == 0:
        return 0.0
    return sum(len(sentence.split()) for sentence in sentences) / count

# Add the per-text mean sentence length to the training split first, then to
# the test split.
for split_frame in (df_train, df_test):
    split_frame['Duzina recenica'] = split_frame['Recenice'].apply(average_sentence_length)


In [None]:
# Mean of 'Duzina recenica' per author on the training split; reset_index
# turns 'Autor' back into a regular column for plotting.
df_avgSentLength = df_train.groupby('Autor')['Duzina recenica'].mean().reset_index()


In [None]:
# Bar chart: per-author mean sentence length (training split).
ax = plt.gca()
ax.bar(df_avgSentLength['Autor'], df_avgSentLength['Duzina recenica'], color='cyan')
ax.set_xlabel('Autor')
ax.set_ylabel('Dužina rečenice')
plt.show()

In [None]:
def text_length(words):
    """Return the sum of the lengths of all items in ``words``.

    Args:
        words: Iterable of sized items (token strings in this notebook).

    Returns:
        The total of ``len(item)`` over every item; ``0`` for an empty input.
    """
    return sum(map(len, words))
# Add the summed token length of every text to the training split first, then
# to the test split. This uses 'Tokeni', i.e. punctuation tokens are included.
for split_frame in (df_train, df_test):
    split_frame['Duzina teksta'] = split_frame['Tokeni'].apply(text_length)

In [None]:
# Mean of 'Duzina teksta' per author on the training split; reset_index turns
# 'Autor' back into a regular column for plotting.
df_avgTextLength = df_train.groupby('Autor')['Duzina teksta'].mean().reset_index()


In [None]:
# Bar chart: per-author mean text length (training split).
ax = plt.gca()
ax.bar(df_avgTextLength['Autor'], df_avgTextLength['Duzina teksta'], color='orange')
ax.set_xlabel('Autor')
ax.set_ylabel('Dužina teksta')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score

# The three stylometric columns used as model inputs.
rf_feature_columns = ['Duzina reci', 'Duzina recenica', 'Duzina teksta']

# random_state fixes the forest's internal randomness so that re-running the
# notebook reproduces the same scores; the train/test split is already seeded
# with the same value.
model = RandomForestClassifier(
    max_depth=5,
    min_samples_split=15,
    n_estimators=300,
    random_state=42,
)

model.fit(df_train[rf_feature_columns], df_train['Autor'])

# Predictions on the training split (used to gauge over-fitting).
y_train_pred_rf = model.predict(df_train[rf_feature_columns])

# Predictions on the held-out test split.
y_test_pred_rf = model.predict(df_test[rf_feature_columns])

# Accuracy, weighted F1 and weighted recall on the training split.
print('Random Forest - Trening skup:\n')
print(f'Tačnost: {accuracy_score(df_train["Autor"], y_train_pred_rf)}')
print(f'F1 ocena: {f1_score(df_train["Autor"], y_train_pred_rf, average="weighted")}')
print(f'Odziv: {recall_score(df_train["Autor"], y_train_pred_rf, average="weighted")}\n')

# The same three metrics on the test split.
print('Random Forest - Test skup:\n')
print(f'Tačnost: {accuracy_score(df_test["Autor"], y_test_pred_rf)}')
print(f'F1 ocena: {f1_score(df_test["Autor"], y_test_pred_rf, average="weighted")}')
print(f'Odziv: {recall_score(df_test["Autor"], y_test_pred_rf, average="weighted")}\n')


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Feature matrices for both splits, selected once and reused below.
nb_feature_columns = ['Duzina reci', 'Duzina recenica', 'Duzina teksta']
nb_train_features = df_train[nb_feature_columns]
nb_test_features = df_test[nb_feature_columns]

# NOTE(review): MultinomialNB is documented for count-like features, whereas
# these inputs are averages and summed lengths -- confirm the model choice.
classifierMB = MultinomialNB()

# Fit the model on the training split.
classifierMB.fit(nb_train_features, df_train['Autor'])

# Predictions on the training split.
y_train_pred_MB = classifierMB.predict(nb_train_features)

# Predictions on the test split.
y_test_pred_MB = classifierMB.predict(nb_test_features)

# Evaluation metrics for Multinomial Naive Bayes: training split.
print('Multinomial Bajes - Trening skup:\n')
print(f'Tačnost: {accuracy_score(df_train["Autor"], y_train_pred_MB)}')
print(f'F1 ocena: {f1_score(df_train["Autor"], y_train_pred_MB, average="weighted")}')
print(f'Odziv: {recall_score(df_train["Autor"], y_train_pred_MB, average="weighted")}\n')

# Evaluation metrics for Multinomial Naive Bayes: test split.
print('Multinomial Bajes - Test skup:\n')
print(f'Tačnost: {accuracy_score(df_test["Autor"], y_test_pred_MB)}')
print(f'F1 ocena: {f1_score(df_test["Autor"], y_test_pred_MB, average="weighted")}')
print(f'Odziv: {recall_score(df_test["Autor"], y_test_pred_MB, average="weighted")}\n')