In [1]:
import os
import joblib
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from utils.evaluation import evaluate_model
from utils.preprocess import preprocess_text_bow

In [2]:
# Load the shared constants from the utils.constants module
from utils.constants import (
    DATA_FOLDER_INPUT,
    LANGUAGE_MODEL_BOW,
    DATA_FOLDER_NAME_POSITIVE,
    DATA_FOLDER_NAME_NEGATIVE,
)

***TF-IDF is not preferred here, so as not to reduce the context; the texts are also small enough for plain counts.***

In [3]:
# Run the bag-of-words preprocessing helper once per class folder.
lemmas_positive = preprocess_text_bow(
    DATA_FOLDER_INPUT,
    DATA_FOLDER_NAME_POSITIVE,
    LANGUAGE_MODEL_BOW,
)
lemmas_negative = preprocess_text_bow(
    DATA_FOLDER_INPUT,
    DATA_FOLDER_NAME_NEGATIVE,
    LANGUAGE_MODEL_BOW,
)

# One constant label per preprocessed document:
# 1 for the positive folder, 0 for the negative folder.
labels_positive = len(lemmas_positive) * [1]
labels_negative = len(lemmas_negative) * [0]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity check: report how many documents each class contributed.
print(
    f"Number of positive samples: {len(lemmas_positive)}",
    f"Number of negative samples: {len(lemmas_negative)}",
    sep="\n",
)

Number of positive samples: 1000
Number of negative samples: 1000


In [5]:
def _build_estimator(model_type):
    """Return an unfitted classifier for ``model_type`` or raise ``ValueError``."""
    if model_type == 'svm':
        return SVC(random_state=42)
    if model_type == 'naive_bayes':
        return MultinomialNB()
    if model_type == 'knn':
        return KNeighborsClassifier(n_neighbors=3)
    raise ValueError(f"Unsupported model type: {model_type}")


def train(lemmas, labels, model_type='svm', output_dir='models'):
    """Train a bigram bag-of-words classifier and persist it with its vectorizer.

    Each document is joined into a whitespace-separated string, the corpus is
    split 80/20 (stratified on ``labels``, ``random_state=42``), a
    ``CountVectorizer`` restricted to word bigrams is fitted on the training
    split only, and the requested estimator is fitted on the resulting counts.
    The fitted model and vectorizer are written with ``joblib.dump`` to
    ``<output_dir>/<model_type>_model.pkl`` and
    ``<output_dir>/<model_type>_vectorizer.pkl``.

    Args:
        lemmas: Iterable of documents, each an iterable of string tokens.
        labels: Class labels aligned one-to-one with ``lemmas``.
        model_type: ``'svm'``, ``'naive_bayes'`` or ``'knn'``.
        output_dir: Directory that receives the pickled artifacts. It is
            created when missing. Defaults to ``'models'``.

    Returns:
        Tuple ``(model, vectorizer, X_train_vect, X_test_vect, y_train, y_test)``
        where the two ``X_*_vect`` values are the vectorized splits.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Fail fast: reject an unknown model type before splitting/vectorizing.
    model = _build_estimator(model_type)

    corpus = [' '.join(doc) for doc in lemmas]

    X_train, X_test, y_train, y_test = train_test_split(
        corpus, labels, test_size=0.2, stratify=labels, random_state=42
    )

    # The vocabulary is learned from the training split only, so no test-set
    # bigrams leak into the feature space.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)

    model.fit(X_train_vect, y_train)

    # joblib.dump does not create parent directories, so make sure the target
    # directory exists (an empty output_dir means the current directory).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, f'{model_type}_model.pkl')
    vectorizer_path = os.path.join(output_dir, f'{model_type}_vectorizer.pkl')

    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    print("Model training complete!")
    print(f"Model saved to {model_path}")
    print(f"Vectorizer saved to {vectorizer_path}")

    return model, vectorizer, X_train_vect, X_test_vect, y_train, y_test

In [6]:
# Merge both classes; lemmas and labels are concatenated in the same order
# (positive first, then negative) so they stay aligned.
all_lemmas = lemmas_positive + lemmas_negative
all_labels = labels_positive + labels_negative

# Train with the default model type; X_train / X_test are the vectorized
# splits returned by train().
(
    model,
    vectorizer,
    X_train,
    X_test,
    y_train,
    y_test,
) = train(lemmas=all_lemmas, labels=all_labels)

Model training complete!
Model saved to models/svm_model.pkl
Vectorizer saved to models/svm_vectorizer.pkl


In [7]:
# Report the size of each vectorized split and of the matching label lists.
print(
    f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features",
    f"Test data: {X_test.shape[0]} samples, {X_test.shape[1]} features",
    f"Training labels: {len(y_train)} labels",
    f"Test labels: {len(y_test)} labels",
    sep="\n",
)

Training data: 1600 samples, 12106 features
Test data: 400 samples, 12106 features
Training labels: 1600 labels
Test labels: 400 labels


In [8]:
# Evaluate the fitted model with the project's evaluation helper.
# NOTE(review): the recorded output of this cell shows ~0.52 accuracy and a
# confusion matrix with almost every count in the first column, i.e. close to
# chance on a balanced set -- worth investigating before relying on this model.
cross_val_results, confusion_matrix_result = evaluate_model(
    model,
    X_train,
    y_train,
    X_test,
    y_test,
)

Cross-validation results:
Accuracy: 0.520625
Precision: 0.5248427270909932
Recall: 0.520625
F1 Score: 0.5037881859423999
Confusion Matrix:
[[199   1]
 [196   4]]
