In [1]:
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 

from preprocess import preprocess_text_bow

***TF-IDF is not used, so as not to reduce the context, and because the texts are short.***

In [2]:
# Top-level data directories. Only DATA_FOLDER_RAW is passed to
# preprocess_text_bow in the cells below.
# NOTE(review): DATA_FOLDER_PROCESSED is not referenced in the visible cells —
# confirm it is still needed.
DATA_FOLDER_RAW = "raw"
DATA_FOLDER_PROCESSED = "processed"

# Per-class folder names, passed as the second argument to preprocess_text_bow.
DATA_FOLDER_NAME_POSITIVE = "GOOD"
DATA_FOLDER_NAME_NEGATIVE = "BAD"
DATA_FOLDER_NAME_NEUTRAL = "NEUTRAL"

In [3]:
# Preprocess each class folder with lemmatization enabled, in the order
# positive -> negative -> neutral.
# NOTE(review): presumably each result holds one token sequence per document
# (train() joins every entry with spaces) — confirm against preprocess_text_bow.
lemmas_positive, lemmas_negative, lemmas_neutral = (
    preprocess_text_bow(DATA_FOLDER_RAW, class_folder, lemmatize=True)
    for class_folder in (
        DATA_FOLDER_NAME_POSITIVE,
        DATA_FOLDER_NAME_NEGATIVE,
        DATA_FOLDER_NAME_NEUTRAL,
    )
)

# Integer class ids, one per entry: 1 = positive, 0 = negative, 2 = neutral.
labels_positive = [1] * len(lemmas_positive)
labels_negative = [0] * len(lemmas_negative)
labels_neutral = [2] * len(lemmas_neutral)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity check: report how many entries were loaded for each class.
print(f"Number of positive samples: {len(lemmas_positive)}")
print(f"Number of negative samples: {len(lemmas_negative)}")
print(f"Number of neutral samples: {len(lemmas_neutral)}")

Number of positive samples: 1000
Number of negative samples: 1000
Number of neutral samples: 1000


In [5]:
def train(lemmas, labels, model_type='svm', kernel='linear'):
    """Train a bag-of-bigrams text classifier and persist it under ``models/``.

    Each document in ``lemmas`` is joined into a space-separated string, the
    corpus is split 80/20 (stratified on ``labels``, ``random_state=42``), a
    ``CountVectorizer`` restricted to word bigrams is fitted on the training
    split only, and the selected classifier is fitted on the resulting counts.
    The fitted model and vectorizer are then written with ``joblib``.

    Args:
        lemmas: Iterable of documents, each an iterable of string tokens.
        labels: Class labels aligned one-to-one with ``lemmas``.
        model_type: ``'svm'`` (``SVC``), ``'naive_bayes'`` (``MultinomialNB``)
            or ``'knn'`` (``KNeighborsClassifier`` with 3 neighbours).
        kernel: Kernel passed to ``SVC``. The other classifiers ignore it, but
            it is still embedded in both output file names.

    Returns:
        Tuple ``(model, vectorizer, X_train_vect, X_test_vect, y_train, y_test)``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            Raised up front, before any data is split or vectorized.
    """
    # Lazy factories: only the requested estimator is ever constructed.
    model_factories = {
        'svm': lambda: SVC(kernel=kernel, random_state=42),
        'naive_bayes': lambda: MultinomialNB(),
        'knn': lambda: KNeighborsClassifier(n_neighbors=3),
    }

    # Fail fast so a typo in model_type does not cost a full split + fit.
    if model_type not in model_factories:
        raise ValueError(f"Unsupported model type: {model_type}")

    # File names keep the original "<type>_model(<kernel>).pkl" scheme so that
    # artifacts saved by earlier runs are still found by existing loaders.
    model_path = os.path.join('models', f'{model_type}_model({kernel}).pkl')
    vectorizer_path = os.path.join('models', f'{model_type}_vectorizer({kernel}).pkl')

    corpus = [' '.join(doc) for doc in lemmas]

    X_train, X_test, y_train, y_test = train_test_split(
        corpus, labels, test_size=0.2, stratify=labels, random_state=42
    )

    # The vocabulary is learned from the training split only, so the test
    # split does not leak into the feature space.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)

    model = model_factories[model_type]()
    model.fit(X_train_vect, y_train)

    # Save the model and vectorizer; create the output directory on first use
    # because joblib.dump does not create missing parent directories.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    return model, vectorizer, X_train_vect, X_test_vect, y_train, y_test

In [6]:
# Combine the three classes into one dataset. Lemmas and labels are
# concatenated in the same class order so that they stay aligned.
all_lemmas = lemmas_positive + lemmas_negative + lemmas_neutral
all_labels = labels_positive + labels_negative + labels_neutral

# Train with the defaults (model_type='svm', kernel='linear'). train() also
# writes the fitted model and vectorizer under models/.
model, vectorizer, X_train_vect, X_test_vect, y_train, y_test = train(all_lemmas, all_labels)

In [7]:
# Report the shape of each vectorized split (rows = samples, columns =
# vectorizer features) and the number of labels in each split.
print(f"Training data: {X_train_vect.shape[0]} samples, {X_train_vect.shape[1]} features")
print(f"Test data: {X_test_vect.shape[0]} samples, {X_test_vect.shape[1]} features")
print(f"Training labels: {len(y_train)} labels")
print(f"Test labels: {len(y_test)} labels")

Training data: 2400 samples, 18156 features
Test data: 600 samples, 18156 features
Training labels: 2400 labels
Test labels: 600 labels
