In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('srpski.csv', sep='\t')
df = df.drop(columns=['Rbr', 'SR', 'sr/sr'])

In [3]:
from nltk import FreqDist

In [4]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].str.lower()

In [5]:
author_counts = df['Autor'].value_counts()
single_instance_authors = author_counts[author_counts == 1].index

unique_authors_rows = df[df['Autor'].isin(single_instance_authors)].copy()
df = pd.concat([df, unique_authors_rows], ignore_index=True)

In [6]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jelenalazovic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
custom_tokenizer = RegexpTokenizer(r'\w+[\'\’]\w+|\w+')
df['Tokeni'] = df['Tekst'].apply(custom_tokenizer.tokenize)


In [8]:
all_words = [word for words in df['Tokeni'] for word in words]
fdist = FreqDist(all_words)

In [9]:
most_common_words = fdist.most_common(70)
stopwords = [word for word,count in most_common_words]
punctuation = ['.', ',', '``', "'", "''",'...','—',"-",';', ':' ]

In [10]:
def remove_stopwords_and_punctuation(tokens): 
    filtered_tokens = [word for word in tokens if word not in stopwords and word not in punctuation]
    return filtered_tokens

In [11]:
df['Tokeni'] = df['Tokeni'].apply(remove_stopwords_and_punctuation)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
author_texts = df['Tokeni'].apply(' '.join)
vectorizer = TfidfVectorizer()
tf_matrix = vectorizer.fit_transform(author_texts)
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tf_df

Unnamed: 0,1938,39,age,agnus,ah,aj,ajd,ajde,ajdemo,ajduk,...,žubori,žude,žudi,žudno,žulji,žute,žuti,žutih,žutilo,žućkastih
0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
71,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
72,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027837,0.0
73,0.0,0.0,0.0,0.0,0.1218,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(tf_matrix, df['Autor'], test_size=0.3, random_state=42, stratify=df['Autor'])

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


In [16]:
classifierMB = MultinomialNB()
#df_random = pd.DataFrame(X_train.toarray(), columns= vectorizer.get_feature_names_out())
classifierMB.fit(X_train,y_train)
y_pred = classifierMB.predict(X_test)
y_pred_train = classifierMB.predict(X_train)
report = accuracy_score(y_test, y_pred)
report1 = accuracy_score(y_train, y_pred_train)
report


0.30434782608695654

In [17]:
model = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,min_samples_split=2, n_estimators=300)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.5652173913043478

In [48]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(max_depth=5,min_samples_split=15, n_estimators=300)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.4782608695652174