# A. Préparation de données

In [1]:
import pandas as pd
# Load the cleaned dataset (spooky_cleaned.csv) from the working directory.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; consider index_col=0 if it is unwanted.
df = pd.read_csv("spooky_cleaned.csv")
# Preview the first rows.
df.head()

Unnamed: 0,id,text,author,space_tokenized_text,rule_tokenized_text,wordpiece_tokenized_text
0,id26305,proces however afforded means ascertaining dim...,EAP,"['proces', 'however', 'afforded', 'means', 'as...","['proces', 'however', 'afforded', 'means', 'as...","['[CLS]', 'pro', '##ces', 'however', 'afforded..."
1,id17569,never occurred fumbling might mere mistake,HPL,"['never', 'occurred', 'fumbling', 'might', 'me...","['never', 'occurred', 'fumbling', 'might', 'me...","['[CLS]', 'never', 'occurred', 'fu', '##mbling..."
2,id11008,left hand gold snuff box capered hil cutting m...,EAP,"['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['[CLS]', 'left', 'hand', 'gold', 's', '##nu',..."
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS,"['lovely', 'spring', 'looked', 'windsor', 'ter...","['lovely', 'spring', 'looked', 'windsor', 'ter...","['[CLS]', 'lovely', 'spring', 'looked', 'winds..."
4,id12958,finding nothing else even gold superintendent ...,HPL,"['finding', 'nothing', 'else', 'even', 'gold',...","['finding', 'nothing', 'else', 'even', 'gold',...","['[CLS]', 'finding', 'nothing', 'else', 'even'..."


# B. Encodage de la variable à prédire

In [2]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the 'author' label column.
encoder = OneHotEncoder(sparse_output=False)
author_encoded = encoder.fit_transform(df[['author']])  # fit_transform expects 2-D input

# Wrap the encoded array in a DataFrame aligned on df's index,
# with one indicator column per author category.
encoded_df = pd.DataFrame(
    author_encoded,
    columns=encoder.categories_[0],
    index=df.index
)

# Attach the indicator columns to df. Indicator columns left over from a previous
# execution of this cell are dropped first, so re-running the cell does not append
# duplicate author columns.
df = pd.concat(
    [df.drop(columns=encoded_df.columns, errors="ignore"), encoded_df],
    axis=1
)

df.head()

Unnamed: 0,id,text,author,space_tokenized_text,rule_tokenized_text,wordpiece_tokenized_text,EAP,HPL,MWS
0,id26305,proces however afforded means ascertaining dim...,EAP,"['proces', 'however', 'afforded', 'means', 'as...","['proces', 'however', 'afforded', 'means', 'as...","['[CLS]', 'pro', '##ces', 'however', 'afforded...",1.0,0.0,0.0
1,id17569,never occurred fumbling might mere mistake,HPL,"['never', 'occurred', 'fumbling', 'might', 'me...","['never', 'occurred', 'fumbling', 'might', 'me...","['[CLS]', 'never', 'occurred', 'fu', '##mbling...",0.0,1.0,0.0
2,id11008,left hand gold snuff box capered hil cutting m...,EAP,"['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['[CLS]', 'left', 'hand', 'gold', 's', '##nu',...",1.0,0.0,0.0
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS,"['lovely', 'spring', 'looked', 'windsor', 'ter...","['lovely', 'spring', 'looked', 'windsor', 'ter...","['[CLS]', 'lovely', 'spring', 'looked', 'winds...",0.0,0.0,1.0
4,id12958,finding nothing else even gold superintendent ...,HPL,"['finding', 'nothing', 'else', 'even', 'gold',...","['finding', 'nothing', 'else', 'even', 'gold',...","['[CLS]', 'finding', 'nothing', 'else', 'even'...",0.0,1.0,0.0


# C. Construction des bases d’entraînement et de test

In [3]:
from sklearn.model_selection import train_test_split
# Features are the cleaned text; the target is the author label.
X = df['text']
y = df['author']
# Stratified variant of the split, currently disabled:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
# 70/30 split with a fixed seed (not stratified by author).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# TODO: class-balancing options to consider: smote, nn, smote nn, undersampling, oversampling

In [4]:
# Report the per-author counts for the full data and for each side of the split.
for heading, labels in (
    ("Class distribution in the original dataset:", y),
    ("\nClass distribution in the training dataset:", y_train),
    ("\nClass distribution in the test dataset:", y_test),
):
    print(heading)
    print(labels.value_counts())

Class distribution in the original dataset:
author
EAP    7899
MWS    6044
HPL    5634
Name: count, dtype: int64

Class distribution in the training dataset:
author
EAP    5479
MWS    4297
HPL    3927
Name: count, dtype: int64

Class distribution in the test dataset:
author
EAP    2420
MWS    1747
HPL    1707
Name: count, dtype: int64


# D. Méthodes de vectorisation

## 1. Fréquence lexicale

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words counts: learn the vocabulary on the training texts only,
# then reuse that same vocabulary to encode the test texts.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [6]:
# Print the shapes of the train and test count matrices.
# NOTE(review): the labels mention X_train_counts / X_test_counts, but the variables
# are named X_train_cv / X_test_cv.
print("\nVectorisation par fréquence lexicale (CountVectorizer) :")
print("Forme de X_train_counts :", X_train_cv.shape)
print("Forme de X_test_counts  :", X_test_cv.shape)


Vectorisation par fréquence lexicale (CountVectorizer) :
Forme de X_train_counts : (13703, 22919)
Forme de X_test_counts  : (5874, 22919)


X_train_cv est une matrice de fréquences de mots : chaque ligne représente un texte du jeu de données et chaque colonne correspond à un mot du vocabulaire extrait par CountVectorizer. Chaque valeur indique combien de fois ce mot apparaît dans le document correspondant.

In [7]:
# Tabular preview of the training count matrix, one column per vocabulary term.
# The matrix is kept sparse: X_train_cv.toarray() would allocate a dense
# n_documents x n_terms array (roughly 3.1e8 cells for the shape reported in this
# notebook) only to display five rows.
df_cv = pd.DataFrame.sparse.from_spmatrix(X_train_cv, columns=cv.get_feature_names_out())
df_cv.head()

Unnamed: 0,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abaout,abased,abasement,...,zobna,zobnarian,zodiac,zodiacal,zoilus,zokar,zone,zopyrus,zory,zubmizion
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. TF-IDF

In [8]:
# TF-IDF weighting: fit on the training texts only, then apply the fitted
# vectorizer to the test texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)  # fit and transform X_train in a single pass
X_test_tfidf = tfidf.transform(X_test) 

In [9]:
# Tabular preview of the training TF-IDF matrix, one column per vocabulary term.
# The matrix is kept sparse: X_train_tfidf.toarray() would allocate a dense
# n_documents x n_terms float array only to display five rows.
df_tfidf = pd.DataFrame.sparse.from_spmatrix(X_train_tfidf, columns=tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,ab,aback,abaft,abandon,abandoned,abandoning,abandonment,abaout,abased,abasement,...,zobna,zobnarian,zodiac,zodiacal,zoilus,zokar,zone,zopyrus,zory,zubmizion
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SMOTE pour équilibrer le jeu de données

In [10]:
from imblearn.over_sampling import SMOTE
# Resample the training set with SMOTE (seeded), once per representation.
# Only the training matrices are resampled; the test matrices are not touched here.
smote = SMOTE(random_state=0)
X_train_cv_balanced, y_train_cv_balanced = smote.fit_resample(X_train_cv, y_train)
X_train_tfidf_balanced, y_train_tfidf_balanced = smote.fit_resample(X_train_tfidf, y_train)

# E. Entraînement

### 1. Créer trois modèles :

In [11]:
from sklearn.neural_network import MLPClassifier

def _make_mlp(activation, solver):
    """Return an unfitted MLP: one hidden layer of 100 units, max_iter=5, random_state=0."""
    return MLPClassifier(hidden_layer_sizes=(100,), max_iter=5, activation=activation, solver=solver, random_state=0)


# Same three configurations for each representation:
# 1 = relu/adam, 2 = logistic/sgd, 3 = tanh/adam.
mlp1_cv = _make_mlp('relu', 'adam')
mlp2_cv = _make_mlp('logistic', 'sgd')
mlp3_cv = _make_mlp('tanh', 'adam')

mlp1_tfidf = _make_mlp('relu', 'adam')
mlp2_tfidf = _make_mlp('logistic', 'sgd')
mlp3_tfidf = _make_mlp('tanh', 'adam')

### 2. Entraînement sur les représentations vectorielles :

In [12]:
# Fit the three count-based models on the SMOTE-balanced count matrix.
mlp1_cv.fit(X_train_cv_balanced, y_train_cv_balanced)
mlp2_cv.fit(X_train_cv_balanced, y_train_cv_balanced)
mlp3_cv.fit(X_train_cv_balanced, y_train_cv_balanced)

# Fit the three TF-IDF-based models on the SMOTE-balanced TF-IDF matrix.
mlp1_tfidf.fit(X_train_tfidf_balanced, y_train_tfidf_balanced)
mlp2_tfidf.fit(X_train_tfidf_balanced, y_train_tfidf_balanced)
mlp3_tfidf.fit(X_train_tfidf_balanced, y_train_tfidf_balanced)



### 3. Prédiction et Évaluation :

In [13]:
from sklearn.metrics import classification_report

# Training-set reports: predictions are made on the original (non-resampled)
# training matrices and compared with y_train.
y_pred1, y_pred2, y_pred3 = [clf.predict(X_train_cv) for clf in (mlp1_cv, mlp2_cv, mlp3_cv)]
for title, predicted in (
    ("MLP1 + CountVectorizer:\n", y_pred1),
    ("MLP2 + CountVectorizer:\n", y_pred2),
    ("MLP3 + CountVectorizer:\n", y_pred3),
):
    print(title, classification_report(y_train, predicted))

y2_pred1, y2_pred2, y2_pred3 = [clf.predict(X_train_tfidf) for clf in (mlp1_tfidf, mlp2_tfidf, mlp3_tfidf)]
for title, predicted in (
    ("MLP1 + TF-IDF:\n", y2_pred1),
    ("MLP2 + TF-IDF:\n", y2_pred2),
    ("MLP3 + TF-IDF:\n", y2_pred3),
):
    print(title, classification_report(y_train, predicted))

MLP1 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.99      0.97      0.98      5479
         HPL       0.98      0.99      0.98      3927
         MWS       0.98      0.99      0.99      4297

    accuracy                           0.99     13703
   macro avg       0.98      0.99      0.99     13703
weighted avg       0.99      0.99      0.99     13703

MLP2 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.40      1.00      0.57      5479
         HPL       0.00      0.00      0.00      3927
         MWS       0.55      0.01      0.02      4297

    accuracy                           0.40     13703
   macro avg       0.32      0.34      0.20     13703
weighted avg       0.33      0.40      0.24     13703



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MLP3 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.99      0.97      0.98      5479
         HPL       0.98      0.99      0.99      3927
         MWS       0.98      0.99      0.99      4297

    accuracy                           0.99     13703
   macro avg       0.98      0.99      0.99     13703
weighted avg       0.99      0.99      0.99     13703

MLP1 + TF-IDF:
               precision    recall  f1-score   support

         EAP       0.99      0.99      0.99      5479
         HPL       0.99      0.99      0.99      3927
         MWS       0.99      0.99      0.99      4297

    accuracy                           0.99     13703
   macro avg       0.99      0.99      0.99     13703
weighted avg       0.99      0.99      0.99     13703

MLP2 + TF-IDF:
               precision    recall  f1-score   support

         EAP       0.40      1.00      0.57      5479
         HPL       0.00      0.00      0.00      3927
         MWS      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# F. Test

### 1. Predict on Test Data

In [14]:
from sklearn.metrics import classification_report

# Held-out reports: every fitted model predicts on the test matrix of its own
# representation and is compared with y_test.
y_pred1, y_pred2, y_pred3 = [clf.predict(X_test_cv) for clf in (mlp1_cv, mlp2_cv, mlp3_cv)]
cv_titles = ("MLP1 + CountVectorizer:\n", "MLP2 + CountVectorizer:\n", "MLP3 + CountVectorizer:\n")
for title, predicted in zip(cv_titles, (y_pred1, y_pred2, y_pred3)):
    print(title, classification_report(y_test, predicted))

y2_pred1, y2_pred2, y2_pred3 = [clf.predict(X_test_tfidf) for clf in (mlp1_tfidf, mlp2_tfidf, mlp3_tfidf)]
tfidf_titles = ("MLP1 + TF-IDF:\n", "MLP2 + TF-IDF:\n", "MLP3 + TF-IDF:\n")
for title, predicted in zip(tfidf_titles, (y2_pred1, y2_pred2, y2_pred3)):
    print(title, classification_report(y_test, predicted))

MLP1 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.81      0.76      0.79      2420
         HPL       0.76      0.79      0.78      1707
         MWS       0.76      0.80      0.78      1747

    accuracy                           0.78      5874
   macro avg       0.78      0.78      0.78      5874
weighted avg       0.78      0.78      0.78      5874

MLP2 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.41      1.00      0.58      2420
         HPL       1.00      0.00      0.00      1707
         MWS       0.47      0.01      0.02      1747

    accuracy                           0.41      5874
   macro avg       0.63      0.34      0.20      5874
weighted avg       0.60      0.41      0.25      5874

MLP3 + CountVectorizer:
               precision    recall  f1-score   support

         EAP       0.81      0.76      0.78      2420
         HPL       0.76      0.79      0.77      1707


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2. Compute Prediction Time

In [15]:
import time

def time_prediction(model, X):
    """Return the elapsed time, in seconds, of a single ``model.predict(X)`` call.

    Parameters
    ----------
    model : object exposing a ``predict`` method.
    X : the input passed unchanged to ``model.predict``.

    Returns
    -------
    float
        Elapsed seconds. The predictions themselves are discarded.
    """
    # perf_counter is monotonic and high-resolution, so the measurement cannot be
    # distorted (or turn negative) if the system clock is adjusted mid-call.
    start = time.perf_counter()
    model.predict(X)
    return time.perf_counter() - start

# Time one predict() call per count-based model on the test matrix.
time_mlp1_cv = time_prediction(mlp1_cv, X_test_cv)
time_mlp2_cv = time_prediction(mlp2_cv, X_test_cv)
time_mlp3_cv = time_prediction(mlp3_cv, X_test_cv)

print(f"Prediction Time (MLP1 + CountVectorizer): {time_mlp1_cv:.4f} seconds")
print(f"Prediction Time (MLP2 + CountVectorizer): {time_mlp2_cv:.4f} seconds")
print(f"Prediction Time (MLP3 + CountVectorizer): {time_mlp3_cv:.4f} seconds")


# Same measurement for the TF-IDF-based models.
time_mlp1_tfidf = time_prediction(mlp1_tfidf, X_test_tfidf)
time_mlp2_tfidf = time_prediction(mlp2_tfidf, X_test_tfidf)
time_mlp3_tfidf = time_prediction(mlp3_tfidf, X_test_tfidf)

# The MLP1 label previously carried a stray colon ("TF-IDF:"); it now matches its siblings.
print(f"Prediction Time (MLP1 + TF-IDF): {time_mlp1_tfidf:.4f} seconds")
print(f"Prediction Time (MLP2 + TF-IDF): {time_mlp2_tfidf:.4f} seconds")
print(f"Prediction Time (MLP3 + TF-IDF): {time_mlp3_tfidf:.4f} seconds")

Prediction Time (MLP1 + CountVectorizer): 0.0358 seconds
Prediction Time (MLP2 + CountVectorizer): 0.0595 seconds
Prediction Time (MLP3 + CountVectorizer): 0.0249 seconds
Prediction Time (MLP1 + TF-IDF:): 0.0119 seconds
Prediction Time (MLP2 + TF-IDF): 0.0412 seconds
Prediction Time (MLP3 + TF-IDF): 0.0250 seconds


In [21]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Collect weighted test-set metrics for the logistic and tanh models on both
# representations (the relu models, mlp1_*, are not part of this table).
evaluation_results = []

# (fitted model, activation label, key of the matching test matrix)
models = [
    (mlp2_cv, "logistic", "CountVectorizer"),
    (mlp3_cv, "tanh", "CountVectorizer"),
    (mlp2_tfidf, "logistic", "TF-IDF"),
    (mlp3_tfidf, "tanh", "TF-IDF")
]

test_sets = {
    "CountVectorizer": X_test_cv,
    "TF-IDF": X_test_tfidf
}

for model, activation, vectorizer in models:
    y_pred = model.predict(test_sets[vectorizer])
    
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same value as the default ("warn" also scores 0)
    # for a class that is never predicted, but without the UndefinedMetricWarning;
    # it also matches how precision is computed in the embedding experiments below.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    evaluation_results.append({
        'Model': activation,
        'Vectorizer': vectorizer,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })

# One row per (activation, vectorizer) combination.
evaluation = pd.DataFrame(evaluation_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# Display the metrics table built in the previous cell.
evaluation

Unnamed: 0,Model,Vectorizer,Accuracy,Precision,Recall,F1-score
0,logistic,CountVectorizer,0.413687,0.602092,0.413687,0.247276
1,tanh,CountVectorizer,0.779707,0.780978,0.779707,0.779796
2,logistic,TF-IDF,0.411985,0.169732,0.411985,0.240416
3,tanh,TF-IDF,0.828056,0.828061,0.828056,0.828023


# G. Vectorisations basées sur les embeddings de mots

In [24]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import time
import nltk
# Fetch NLTK's 'punkt' tokenizer data (reported as already up-to-date when present).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hafsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
import numpy as np
import pandas as pd
from collections import Counter
import string


# All documents (train and test alike) as a plain Python list of strings.
texts = df['text'].tolist()

def preprocess(text):
    """Lowercase ``text``, remove ASCII punctuation and split the rest on whitespace."""
    kept_chars = [ch for ch in text.lower() if ch not in string.punctuation]
    return "".join(kept_chars).split()


# Tokenize every document and count word occurrences over the whole corpus.
tokenized_texts = list(map(preprocess, texts))
word_counts = Counter()
for tokens in tokenized_texts:
    word_counts.update(tokens)

# Vocabulary = the 10,000 most frequent words, with lookups in both directions.
vocab = [word for word, _ in word_counts.most_common(10000)]
word_to_idx = {word: position for position, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)

# Build one (context, target) sample per in-vocabulary token: the context is made of
# up to window_size neighbours on each side, left neighbours first.
window_size = 3
training_data = []
for text in tokenized_texts:
    indices = [word_to_idx[word] for word in text if word in word_to_idx]
    for i, target in enumerate(indices):
        left = indices[max(0, i - window_size):i]
        right = indices[i + 1:i + window_size + 1]
        context = left + right
        # A token with no neighbour at all (one-word document) yields no sample.
        if context:
            training_data.append((context, target))



# CBOW hyperparameters.
embedding_dim = 10
learning_rate = 0.01
epochs = 3

# Small random initial weights, drawn from a seeded generator so that the learned
# embeddings (and the similarity results derived from them) are reproducible
# from one run of the notebook to the next.
rng = np.random.default_rng(0)
W1 = rng.standard_normal((vocab_size, embedding_dim)) * 0.01  # one embedding row per vocabulary word
W2 = rng.standard_normal((embedding_dim, vocab_size)) * 0.01  # maps the hidden vector to one score per word

def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large scores do not overflow.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0)

# CBOW training: the input is the context words and the output to predict is the
# target word. Weights are updated after every single sample.
# NOTE(review): the recorded output shows a mean loss of about 9.21 for all three
# epochs, which is roughly ln(10000), i.e. what a uniform prediction over a
# 10,000-word vocabulary would give. The embeddings therefore appear to have barely
# moved from their initial values; confirm whether more epochs or a larger learning
# rate were intended.
for epoch in range(epochs):
    total_loss = 0
    # Forward pass
    for context, target in training_data:
        context_vectors = W1[context, :] # W1 holds one embedding row per word; pick the context rows
        hidden = np.mean(context_vectors, axis=0)  # average of the context embeddings
        output = np.dot(hidden, W2)  # one score per vocabulary word
        probs = softmax(output)


        target_onehot = np.zeros(vocab_size)
        target_onehot[target] = 1
        loss = -np.log(probs[target])  # cross-entropy of the true target word
        total_loss += loss
        # Backward pass
        grad_output = probs - target_onehot
        # grad_hidden and grad_W2 are both computed before W2 is modified below.
        grad_hidden = np.dot(W2, grad_output)
        grad_W2 = np.outer(hidden, grad_output)
        # hidden is a mean over the context rows, so each row receives 1/len(context) of the gradient.
        grad_context = grad_hidden / len(context)


        # A word occurring several times in the context is updated once per occurrence.
        for word_idx in context:
            W1[word_idx] -= learning_rate * grad_context  
        W2 -= learning_rate * grad_W2

    # Mean loss per training sample for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(training_data):.4f}")

Epoch 1/3, Loss: 9.2103
Epoch 2/3, Loss: 9.2103
Epoch 3/3, Loss: 9.2100


In [27]:
def get_embedding(word):
    """Return the row of W1 associated with ``word``, or None if it is out of vocabulary."""
    if word not in word_to_idx:
        return None
    return W1[word_to_idx[word]]


def find_similar_words(query_word, top_n=5):
    """Rank the vocabulary by cosine similarity to ``query_word``.

    Returns a list of at most ``top_n`` ``(word, similarity)`` pairs, most similar
    first, or an empty list when ``query_word`` is not in the vocabulary. The query
    word itself is part of the ranking.
    """
    query_idx = word_to_idx.get(query_word)
    if query_idx is None:
        return []

    query_vec = W1[query_idx]

    def cosine(vec):
        # Cosine similarity between the query embedding and ``vec``.
        return np.dot(query_vec, vec)/(np.linalg.norm(query_vec)*np.linalg.norm(vec))

    scored = [(word, cosine(W1[idx])) for word, idx in word_to_idx.items()]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


# Query the learned embeddings for one example word.
# top_n=6 because the ranking includes the query word itself (similarity 1.0),
# which leaves five actual neighbours.
word = "spring" 
similar_words = find_similar_words(word, top_n=6)

print(f"Mots similaires à '{word}':")
for w, score in similar_words:
    print(f"{w} (similarité: {score:.4f})")

Mots similaires à 'spring':
spring (similarité: 1.0000)
dormant (similarité: 0.8800)
welfare (similarité: 0.8550)
worship (similarité: 0.8543)
gibbous (similarité: 0.8487)
emitting (similarité: 0.8448)


### Glove and FastText

In [31]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import FastText, KeyedVectors
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import dump
# Fetch NLTK's 'punkt' tokenizer data again (reported as already up-to-date when present).
nltk.download('punkt')
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hafsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
def vectorize(vectors, words):
    """Average the embeddings of the known tokens of one document.

    Parameters
    ----------
    vectors : keyed-vector object supporting ``word in vectors`` and
        ``vectors[list_of_words]``.
    words : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in ``vectors``. When no token is
        known, a zero vector whose length is ``vectors.vector_size`` (50 if the
        object has no such attribute, as before).
    """
    known = [word for word in words if word in vectors]
    if not known:
        # Size the fallback from the embedding itself so it stays consistent with
        # the averaged vectors when the dimensionality is not 50.
        return np.zeros(getattr(vectors, "vector_size", 50))
    return np.mean(vectors[known], axis=0)

In [33]:
# FastText: tokenize both splits, train the embeddings on the training tokens only,
# then represent every document by the mean of its token vectors.
X_train_ft = [word_tokenize(text) for text in X_train]
X_test_ft = [word_tokenize(text) for text in X_test]

# 50-dimensional vectors; min_count=1 keeps every training token.
model = FastText(X_train_ft, vector_size=50, window=5, min_count=1, workers=4)

# From here on X_train_ft / X_test_ft hold document vectors instead of token lists.
X_train_ft = [vectorize(model.wv, sentence) for sentence in X_train_ft]
X_test_ft = [vectorize(model.wv, sentence) for sentence in X_test_ft]

In [34]:
# GloVe: tokenize both splits, then average pretrained vectors per document.
X_train_gl = [word_tokenize(text) for text in X_train]
X_test_gl = [word_tokenize(text) for text in X_test]

# Load the vectors from glove.6B.50d.txt in the working directory; the file is read
# as text without a word2vec header line (no_header=True).
glove = KeyedVectors.load_word2vec_format('glove.6B.50d.txt', binary=False, no_header=True)

# Document vectors go to separate names; X_train_gl / X_test_gl keep the token lists.
X_train_glove = [vectorize(glove, sentence) for sentence in X_train_gl]
X_test_glove = [vectorize(glove, sentence) for sentence in X_test_gl]

### Word2Vec and Evaluation

In [35]:
from gensim.models import Word2Vec
import numpy as np
from nltk.tokenize import word_tokenize

# Word2Vec: tokenize both splits and train 50-dimensional embeddings on the
# training tokens only (min_count=1 keeps every training token).
X_train_w2v = [word_tokenize(text) for text in X_train]
X_test_w2v = [word_tokenize(text) for text in X_test]

w2v_model = Word2Vec(sentences=X_train_w2v, vector_size=50, window=5, min_count=1, workers=4)

def vectorize_w2v(model, words):
    """Average the ``model.wv`` embeddings of the known tokens of one document.

    Parameters
    ----------
    model : object whose ``wv`` attribute supports ``word in model.wv`` and
        ``model.wv[list_of_words]``.
    words : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in ``model.wv``. When no token is
        known, a zero vector whose length is ``model.wv.vector_size`` (50 if the
        attribute is missing, as before).
    """
    keyed_vectors = model.wv
    known = [word for word in words if word in keyed_vectors]
    if not known:
        # Size the fallback from the embedding itself rather than hard-coding 50.
        return np.zeros(getattr(keyed_vectors, "vector_size", 50))
    return np.mean(keyed_vectors[known], axis=0)

# Replace the token lists with one averaged Word2Vec vector per document.
X_train_w2v = [vectorize_w2v(w2v_model, sentence) for sentence in X_train_w2v]
X_test_w2v = [vectorize_w2v(w2v_model, sentence) for sentence in X_test_w2v]


In [36]:
def train(vectorizer, model, epochs=10):
    """Incrementally train ``model`` on one embedding representation and save it.

    Each epoch runs one ``partial_fit`` pass over the training vectors, then prints
    accuracy and weighted precision/recall/F1 measured on those same training vectors.
    After the last epoch the model is written to
    ``models/<activation>_<vectorizer>.joblib``.

    Parameters
    ----------
    vectorizer : str
        'ft' (FastText), 'gl' (GloVe) or 'w2v' (Word2Vec); selects the training vectors.
    model : estimator exposing ``partial_fit``, ``predict``, ``loss_`` and ``activation``.
    epochs : int, default 10
        Number of ``partial_fit`` passes.

    Returns
    -------
    list of float
        The training loss (``model.loss_``) after each epoch, one entry per epoch.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not one of the supported keys.
    """
    y = y_train

    if vectorizer == 'ft':
        X = X_train_ft
    elif vectorizer == 'gl':
        X = X_train_glove
    elif vectorizer == 'w2v':
        X = X_train_w2v
    else:
        raise ValueError("Invalid vectorizer")

    # The label set does not change between epochs, so compute it once.
    classes = np.unique(y)
    losses = []

    for epoch in range(epochs):
        model.partial_fit(X, y, classes=classes)

        y_train_pred = model.predict(X)

        accuracy = accuracy_score(y, y_train_pred)
        precision = precision_score(y, y_train_pred, average='weighted', zero_division=0)
        recall = recall_score(y, y_train_pred, average='weighted')
        f1 = f1_score(y, y_train_pred, average='weighted')

        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

        # Record the loss of this pass. The previous version stored the change in
        # accuracy between consecutive epochs under the name "loss", which is not a
        # loss and was identically 0 whenever the accuracy stayed flat.
        losses.append(model.loss_)

    os.makedirs('models', exist_ok=True)
    dump(model, f'models/{model.activation}_{vectorizer}.joblib')

    return losses

In [37]:
from sklearn.base import clone

# Base configurations; max_iter=1 because train() drives the epochs through partial_fit.
models = [
    MLPClassifier(hidden_layer_sizes=(32, 64), max_iter=1, activation='logistic'),
    MLPClassifier(hidden_layer_sizes=(32, 64), max_iter=1, activation='tanh')
]

vectorizers = ['ft', 'gl', 'w2v']
losses = {}

for base_model in models:
    for vectorizer in vectorizers:
        # partial_fit continues from whatever weights the estimator already holds.
        # Reusing one estimator for all three vectorizers would therefore start the
        # GloVe and Word2Vec runs from weights learned on another embedding space
        # (and save those mixed models), so each combination gets a fresh, unfitted
        # copy with the same hyperparameters.
        model = clone(base_model)
        print(f"Training model with {model.activation} activation and {vectorizer} vectorizer")
        loss = train(vectorizer, model, 100)
        losses[f"{model.activation}_{vectorizer}"] = loss

Training model with logistic activation and ft vectorizer
Epoch 1/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 2/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 3/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 4/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 5/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 6/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 7/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 8/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 9/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 10/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 11/100 - Accuracy: 0.3998, Precision: 0.1599, Recall: 0.3998, F1-score: 0.2284
Epoch 12/100 - A

In [40]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier

def train_and_evaluate(vectorizer, model):
    """Fit ``model`` on one embedding representation and score it on the test split.

    ``vectorizer`` is 'w2v', 'ft' or 'gl'; any other value raises ValueError.
    Returns ``(accuracy, precision, recall, f1)``, the last three weighted by class support.
    """
    if vectorizer == 'w2v':
        features = (X_train_w2v, X_test_w2v)
    elif vectorizer == 'ft':
        features = (X_train_ft, X_test_ft)
    elif vectorizer == 'gl':
        features = (X_train_glove, X_test_glove)
    else:
        raise ValueError("Invalid vectorizer choice!")
    X_train_vec, X_test_vec = features

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    weighted = {'average': 'weighted'}
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, zero_division=0, **weighted),
        recall_score(y_test, y_pred, **weighted),
        f1_score(y_test, y_pred, **weighted),
    )

# One estimator per activation; each one is fitted in turn on every representation.
models = [
    MLPClassifier(hidden_layer_sizes=(32, 64), max_iter=100, activation=activation)
    for activation in ('logistic', 'tanh')
]

vectorizers = ['w2v', 'ft', 'gl'] 
evaluation_results = []
score_columns = ('Accuracy', 'Precision', 'Recall', 'F1-score')

for model in models:
    for vectorizer in vectorizers:
        print(f"Training model with {model.activation} activation and {vectorizer} vectorizer")

        accuracy, precision, recall, f1 = train_and_evaluate(vectorizer, model)

        # One row per (activation, vectorizer) pair, scores in score_columns order.
        row = {'Model': model.activation, 'Vectorizer': vectorizer}
        row.update(zip(score_columns, (accuracy, precision, recall, f1)))
        evaluation_results.append(row)

evaluation = pd.DataFrame(evaluation_results)

Training model with logistic activation and w2v vectorizer
Training model with logistic activation and ft vectorizer
Training model with logistic activation and gl vectorizer




Training model with tanh activation and w2v vectorizer
Training model with tanh activation and ft vectorizer
Training model with tanh activation and gl vectorizer




In [41]:
# Display the test-set metrics of the embedding-based models.
evaluation

Unnamed: 0,Model,Vectorizer,Accuracy,Precision,Recall,F1-score
0,logistic,w2v,0.411985,0.169732,0.411985,0.240416
1,logistic,ft,0.394961,0.257566,0.394961,0.280625
2,logistic,gl,0.595506,0.595056,0.595506,0.595247
3,tanh,w2v,0.3524,0.270071,0.3524,0.291409
4,tanh,ft,0.411645,0.275096,0.411645,0.247362
5,tanh,gl,0.614573,0.614209,0.614573,0.614334
