In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('srpski.csv', sep='\t')
df = df.drop(columns=['Rbr', 'SR', 'sr/sr'])

In [3]:
from nltk import FreqDist

In [4]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].str.lower()

In [5]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jelenalazovic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
custom_tokenizer = RegexpTokenizer(r'\w+[\'\’]\w+|\w+')
df['Tokeni'] = df['Tekst'].apply(custom_tokenizer.tokenize)


In [7]:
all_words = [word for words in df['Tokeni'] for word in words]
fdist = FreqDist(all_words)

In [8]:
most_common_words = fdist.most_common(70)
stopwords = [word for word,count in most_common_words]
punctuation = ['.', ',', '``', "'", "''",'...','—',"-",';', ':' ]

In [9]:
def remove_stopwords_and_punctuation(tokens): 
    filtered_tokens = [word for word in tokens if word not in punctuation]
    return filtered_tokens

In [10]:
df['Tokeni'] = df['Tokeni'].apply(remove_stopwords_and_punctuation)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
author_texts = df['Tokeni'].apply(' '.join)
vectorizer = TfidfVectorizer()
tf_matrix = vectorizer.fit_transform(author_texts)
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tf_df

Unnamed: 0,1938,39,afrodita,age,agnus,ah,aj,ajd,ajde,ajdemo,...,žudite,žudnih,žudno,žulji,žut,žute,žuti,žutih,žutilo,žućkastih
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(tf_matrix, df['Autor'], test_size=0.3, random_state=42, stratify=df['Autor'])

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


In [15]:
classifierMB = MultinomialNB()
#df_random = pd.DataFrame(X_train.toarray(), columns= vectorizer.get_feature_names_out())
classifierMB.fit(X_train,y_train)
y_pred = classifierMB.predict(X_test)
y_pred_train = classifierMB.predict(X_train)
report = accuracy_score(y_test, y_pred)
report1 = accuracy_score(y_train, y_pred_train)
report


0.11904761904761904

In [16]:
model = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,min_samples_split=2, n_estimators=300)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.11904761904761904

In [17]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(max_depth=5,min_samples_split=15, n_estimators=300)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.16666666666666666

In [18]:
# Create an SVM model
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', C=1.0)  # You can adjust the kernel and other hyperparameters

# Train the model
svm_model.fit(X_train, y_train)


In [19]:
# Make predictions on the test set
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.19047619047619047