In [1]:
import pandas as pd
import string

In [2]:
df = pd.read_csv('srpski.csv', sep='\t')
df = df.drop(columns=['Rbr', 'SR', 'sr/sr', 'Naslov', 'Jezik'])

In [3]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].str.lower()

In [4]:
import classla



In [5]:
classla.download('sr')

ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /clarinsi/classla-resources/main/resources_1.0.1.json.zip (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16c4eab20>: Failed to resolve 'raw.githubusercontent.com' ([Errno 8] nodename nor servname provided, or not known)"))

In [6]:
nlp = classla.Pipeline("sr",  processors='tokenize,lemma,pos')

2023-12-26 16:07:44 INFO: Loading these models for language: sr (Serbian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2023-12-26 16:07:44 INFO: Use device: cpu
2023-12-26 16:07:44 INFO: Loading: tokenize
2023-12-26 16:07:44 INFO: Loading: pos
  from .autonotebook import tqdm as notebook_tqdm
2023-12-26 16:07:46 INFO: Loading: lemma
2023-12-26 16:07:50 INFO: Done loading processors!


In [7]:
def process_text(text):
    # Process the text using Classla pipeline
    doc = nlp(text)
    
    # Extract information (e.g., lemmas) from the processed document
    lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
    
    return lemmas

# Apply the process_text function to a new column in the DataFrame
df['Procesiran tekst'] = df['Tekst'].apply(process_text)


In [8]:
from nltk import FreqDist

In [9]:
def remove_punctuation(words):
    punctuation = string.punctuation
    punctuation_additional = ['``', "'", "''",'...','—',"-",';', ':',"“", "„", ".."]
    words_without_punct = [word for word in words if word not in punctuation and word not in punctuation_additional] 
    return words_without_punct

# Apply the remove_punctuation function to the processed_text column
df['Procesiran tekst'] = df['Procesiran tekst'].apply(remove_punctuation)

In [10]:
all_words = [word for words in df['Procesiran tekst'] for word in words]
fdist = FreqDist(all_words)
def remove_freq_words(words):
    fw_words = fdist.most_common(100)
    freq_words = [word for word,count in fw_words if count > 100 or len(word) in (1,2)]
    words_without_punct = [word for word in words if word not in freq_words] 
    return words_without_punct

# Apply the remove_punctuation function to the processed_text column
df['Procesiran tekst'] = df['Procesiran tekst'].apply(remove_freq_words)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
author_texts = df['Procesiran tekst'].apply(' '.join)
vectorizer = TfidfVectorizer(max_features=3000)
tf_matrix = vectorizer.fit_transform(author_texts)
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tf_df

Unnamed: 0,1938,ah,aj,ajd,ajduk,ako,ala,aleksa,ali,am,...,žrtva,žubori,žuboriti,žudan,žudeti,žudila,žulja,žut,žuta,žuti
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.085636,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.058950,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(tf_matrix, df['Autor'], test_size=0.2, random_state=42, stratify=df['Autor'])

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

In [15]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier()

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(gb_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}


In [23]:
model = GradientBoostingClassifier(learning_rate=0.1,max_depth=3,min_samples_split=5, min_samples_leaf=4, n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.25

In [16]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(max_depth=5,min_samples_split=15, n_estimators=300)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
report = accuracy_score(y_test, y_pred)
report

0.21428571428571427

In [17]:
classifierMB = MultinomialNB()
classifierMB.fit(X_train,y_train)
y_pred = classifierMB.predict(X_test)
y_pred_train = classifierMB.predict(X_train)
report = accuracy_score(y_test, y_pred)
report1 = accuracy_score(y_train, y_pred_train)
report


0.25

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



In [19]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'sigmoid']
}

# Initialize SVC
svc = SVC()

# Perform grid search with cross-validation
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train,y_train)

# Get the best hyperparameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 10, 'kernel': 'sigmoid'}


In [20]:
svm_model = SVC(kernel='sigmoid', C=10, coef0=1.0, gamma='scale')  # You can adjust the kernel and other hyperparameters

# Train the model
svm_model.fit(X_train, y_train)

In [21]:
# Make predictions on the test set
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5

In [22]:
param_grid = {
    'C': [0.1, 1, 10,20],
    'kernel': ['sigmoid'],
    'gamma': ['scale', 'auto'],
    'coef0': [0.0, 1.0],
}

# Initialize SVC
svc = SVC()

# Perform grid search with cross-validation
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train,y_train)

# Get the best hyperparameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 10, 'coef0': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
