In [4]:
import os
import joblib
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from utils.evaluation import evaluate_model
from utils.preprocess import preprocess_text_embeddings

In [None]:
# Get the constants via config.py
from utils.constants import (
    DATA_FOLDER_INPUT,
    LANGUAGE_MODEL_EMBEDDINGS,
    DATA_FOLDER_NAME_POSITIVE,
    DATA_FOLDER_NAME_NEGATIVE,
)

In [None]:
embeddings_positive = preprocess_text_embeddings(DATA_FOLDER_INPUT, DATA_FOLDER_NAME_POSITIVE, LANGUAGE_MODEL_EMBEDDINGS)
embeddings_negative = preprocess_text_embeddings(DATA_FOLDER_INPUT, DATA_FOLDER_NAME_NEGATIVE, LANGUAGE_MODEL_EMBEDDINGS)

X_embeddings = embeddings_positive + embeddings_negative
y_embeddings = [1] * len(embeddings_positive) + [0] * len(embeddings_negative)

(0,)
(0,)
(0,)


*Embeddings are empty for some reason???? Although, I fill empty texts with 0s*

In [None]:
print(f"Number of positive samples: {len(embeddings_positive)}")
print(f"Number of negative samples: {len(embeddings_negative)}")

Number of positive samples: 1000
Number of negative samples: 1000
Number of neutral samples: 1000


In [None]:
def train(embeddings, labels, model_type='svm', kernel='linear'):
  model_path = os.path.join('models', f'{model_type}_model_embeddings({kernel}).pkl')
  
  X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, stratify=labels, random_state=42)

  if model_type == 'svm':
      model = SVC(kernel=kernel, random_state=42)
  elif model_type == 'naive_bayes':
      model = MultinomialNB()
  elif model_type == 'knn':
      model = KNeighborsClassifier(n_neighbors=3)
  else:
      raise ValueError(f"Unsupported model type: {model_type}")

  model.fit(X_train, y_train)

  joblib.dump(model, model_path)

  print("Model training complete!")
  print(f"Model saved to {model_path}")


  return model, X_train, X_test, y_train, y_test

In [None]:
X_embeddings = embeddings_positive + embeddings_negative
y_embeddings = [1] * len(embeddings_positive) + [0] * len(embeddings_negative)

model, X_train, X_test, y_train, y_test = train(X_embeddings, y_embeddings, model_type='svm', kernel='linear')

ValueError: Found array with 0 feature(s) (shape=(2400, 0)) while a minimum of 1 is required by SVC.

In [None]:
print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Test data: {X_test.shape[0]} samples, {X_test.shape[1]} features")
print(f"Training labels: {len(y_train)} labels")
print(f"Test labels: {len(y_test)} labels")

In [None]:
cross_val_results, confusion_matrix_result = evaluate_model(model, X_train, y_train, X_test, y_test)