In [195]:
import pandas as pd

In [196]:
# Read the tab-separated corpus and discard the columns that are dropped right after loading.
df = pd.read_csv('srpski.csv', sep='\t').drop(columns=['Rbr', 'SR', 'sr/sr'])
df

Unnamed: 0,Jezik,Autor,Naslov,Tekst
0,sr,1,Padajte braćo,"Padajte, braćo, plin'te u krvi! Ostav'te sela ..."
1,sr,1,Gospođici L,"Mudraci su prinosili dara: Smirnu, zlato, caru..."
2,sr,1,Gospođici u spomen,Mnogi me je dosad zapitkiv’o: sa čega sam srca...
3,sr,1,Ja sam stena,"Ja sam stena, o koju se zloba mori, svetska ču..."
4,sr,1,Kroz ponoć nemu,Kroz ponoć nemu i gusto granje vidi se zvezda ...
...,...,...,...,...
66,sr,13,Pesma o keruši,"Jutros u košari, gde sja, šuška niz rogoza žuć..."
67,sr,13,Pismo majci,"Jesi l živa, staričice moja? Sin tvoj živi i p..."
68,sr,14,Barbara,"Sećaš li se Barbara, padala je kiša neprestana..."
69,sr,14,Poljubi me,To je bilo u jednom kvartu Grada Svetlosti gde...


In [197]:
from nltk import FreqDist

In [198]:
# Lower-case every column whose dtype is 'object'; all other columns are left untouched.
for column, dtype in df.dtypes.items():
    if dtype != 'object':
        continue
    df[column] = df[column].str.lower()

In [199]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
# Fetch the NLTK 'punkt' package; the log printed below shows it is skipped when already up to date.
# NOTE(review): presumably needed for word_tokenize imported above; the tokenisation in the next
# cell uses RegexpTokenizer instead — confirm whether this download is still required.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jelenalazovic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [200]:
# A token is either two word runs joined by a straight or typographic apostrophe,
# or a plain run of word characters; everything else is skipped by the pattern.
custom_tokenizer = RegexpTokenizer(pattern=r'\w+[\'\’]\w+|\w+')
df['Tokeni'] = df['Tekst'].map(custom_tokenizer.tokenize)

In [201]:
# Flatten the per-row token lists into one list and count token frequencies over all rows of df.
all_words = []
for row_tokens in df['Tokeni']:
    all_words.extend(row_tokens)
fdist = FreqDist(all_words)

In [202]:
# The 50 most frequent tokens counted in fdist are used as the stopword list.
most_common_words = fdist.most_common(50)
stopwords = [pair[0] for pair in most_common_words]

# Punctuation marks filtered out together with the stopwords.
# NOTE(review): the regex used by custom_tokenizer only emits tokens made of word characters
# (optionally joined by an apostrophe), so these entries may never match a token — confirm.
punctuation = [
    '.', ',', '``', "'", "''", '...', '—', "-", ';', ':',
]

In [203]:
def remove_stopwords_and_punctuation(tokens, stop_list=None, punct_list=None):
    """Return the tokens that are neither stopwords nor punctuation marks.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single text, in reading order.
    stop_list : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` is used.
    punct_list : iterable of str, optional
        Punctuation marks to drop. When omitted, the notebook-level
        ``punctuation`` is used.

    Returns
    -------
    list of str
        The kept tokens, in their original order.
    """
    # Fall back to the notebook globals at call time, so a one-argument call
    # (as used with Series.apply) behaves exactly as before.
    if stop_list is None:
        stop_list = stopwords
    if punct_list is None:
        punct_list = punctuation
    # One set lookup per token instead of scanning two lists for every token.
    excluded = set(stop_list) | set(punct_list)
    return [word for word in tokens if word not in excluded]

In [204]:
# Replace each row's token list with its filtered version (stopwords and punctuation removed).
df['Tokeni'] = df['Tokeni'].map(remove_stopwords_and_punctuation)

In [205]:
from sklearn.model_selection import train_test_split

In [206]:
# Features are the token lists, the target is the author id; both are kept as one-column frames.
X, y = df[['Tokeni']], df[['Autor']]

In [207]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified by author, and the per-author output further down
# lists no author 9 — confirm whether that author exists or ended up entirely in the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [208]:
# Put the training labels and the training token lists side by side (label column first).
train_parts = [y_train, X_train]
df_train = pd.concat(train_parts, axis='columns')

In [209]:
# Turn each training row's token list back into a space-separated string ...
df_train['Tekst'] = df_train['Tokeni'].map(' '.join)
# ... then merge all strings of one author into a single space-separated document.
author_texts = df_train.groupby('Autor')['Tekst'].agg(' '.join)
author_texts


Autor
1     ovaj kamen zemlje srbije preteć suncu dere kro...
2     znamo sudbu čeka no strah nam neće zalediti gr...
3     pevam danju pevam noću pevam sele god hoću hoć...
4     šta dao mom mestu mrze dive šta dao veliku ges...
5     tijo noći sunce spava glavom bisera grana gran...
6     volim jednim žarom neveselim sumnjom tugu lepo...
7     razbolela ljubav tvome draganu razbolela časko...
8     silni oklopnici mane straha hladni vaš oklop p...
10    mostaru voleo neku svetlanu jedne jeseni jao b...
11    poklanjala krala nevjerna bila suviše nisi dal...
12    mladi mesec pojavio iznad vlažnih rosa poljana...
13    doviđenja dragi doviđenja prijatelju jednom be...
14    bilo jednom kvartu grada svetlosti gde nikada ...
15    pođo snužden kraj dola pak stado malo kod kola...
Name: Tekst, dtype: object

In [210]:
from sklearn.feature_extraction.text import CountVectorizer

In [211]:
# Build the per-author term-frequency table.
# The documents in author_texts are already tokenised and joined with single spaces, so tokens are
# taken as whitespace-delimited runs. CountVectorizer's default token_pattern would re-tokenise the
# text, splitting the apostrophe forms that custom_tokenizer keeps together and dropping
# one-character tokens.
vectorizer = CountVectorizer(token_pattern=r'\S+')
tf_matrix = vectorizer.fit_transform(author_texts)
vocabulary = vectorizer.get_feature_names_out()
# Rows follow the order of author_texts; the frame keeps a default 0..n-1 index, not the author ids.
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vocabulary)
tf_df

Unnamed: 0,age,ah,ajd,ajde,ajdemo,ajduk,ako,ala,ali,amo,...,žrtve,žuboreći,žubori,žudi,žudno,žulji,žute,žutih,žutilo,žućkastih
0,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,3,2,1,3,0,6,11,19,...,0,0,2,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,2,0,...,0,1,1,1,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,3,0,1,0,...,0,0,0,0,0,0,0,0,0,0
