In [39]:
import re

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors #  implements word vectors

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy

from tqdm.auto import tqdm
tqdm.pandas()

## Data preparation

In [2]:
df = pd.read_csv('/kaggle/input/email-spam-detection-dataset-classification/spam.csv', encoding='latin-1')

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


(5572, 5)

In [4]:
# Remove unnecessary columns (Unnamed 2, 3, 4)
df = df[['v1', 'v2']]
df.shape

(5572, 2)

In [5]:
# Rename columns
df.columns = ['label', 'message']

In [6]:
# Create a new column: 0 for 'spam' and 1 for 'ham'
df['score'] = df['label'].map({'spam': 0, 'ham': 1})

df.head()

Unnamed: 0,label,message,score
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [7]:
# Evaluate the distribution of the target variable
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# Droping duplicated records
df = df.drop_duplicates().reset_index(drop=True)

df.shape

(5169, 3)

In [9]:
# Contractions. Source http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [10]:
# Stop-words
stop_words = set(stopwords.words('english')).union({'also', 'would', 'much', 'many'})

negations = {
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    'mightn',
    "mightn't",
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'no',
    'nor',
    'not',
    'shan',
    "shan't",
    'shouldn',
    "shouldn't",
    'wasn',
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"
}

stop_words = stop_words.difference(negations)

In [11]:
nlp = spacy.load("en_core_web_sm", disable = ['parser','ner'])

# function to clean text
def normalize_text(raw_review):
    
    # Remove html tags
    text = re.sub("<[^>]*>", " ", raw_review) # match <> and everything in between. [^>] - match everything except >
    
    # Remove emails
    text = re.sub("\S*@\S*[\s]+", " ", text) # match non-whitespace characters, @ and a whitespaces in the end
    
    # remove links
    text = re.sub("https?:\/\/.*?[\s]+", " ", text) # match http, s - zero or once, //, 
                                                    # any char 0-unlimited, whitespaces in the end
        
     # Convert to lower case, split into individual words
    text = text.lower().split()
    
    # Replace contractions with their full versions
    text = [contractions.get(word) if word in contractions else word 
            for word in text]
   
    # Re-splitting for the correct stop-words extraction
    text = " ".join(text).split()    
    
    # Remove stop words
    text = [word for word in text if not word in stop_words]

    text = " ".join(text)
    
    # Remove non-letters        
    text = re.sub("[^a-zA-Z' ]", "", text) # match everything except letters and '

    # Lemmatize words. Need to define lemmatizer above
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1 ])
    
    # Remove excesive whitespaces
    text = re.sub("[\s]+", " ", text)    
    
    # Join the words back into one string separated by space, and return the result.
    return text

In [12]:
df['text_normalized'] = df['message'].progress_apply(normalize_text)
df.head()

  0%|          | 0/5169 [00:00<?, ?it/s]

Unnamed: 0,label,message,score,text_normalized
0,ham,"Go until jurong point, crazy.. Available only ...",1,go jurong point crazy available bugis great wo...
1,ham,Ok lar... Joking wif u oni...,1,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,1,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",1,nah not think go usf live around though


## LogisticRegression model with BoW and TF-IDF

In [13]:
def get_preds(text_column, algorithm, ngrams=(1,1), max_features=5000):
    # Train-test split using the column for text data
    X_train, X_test, y_train, y_test = train_test_split(df[text_column], df['score'], test_size=0.2, random_state=42)
    
    if algorithm == 'cv':
        vect = CountVectorizer(max_features=max_features, ngram_range=ngrams).fit(X_train)
    elif algorithm == 'tfidf':
        vect = TfidfVectorizer(max_features=max_features, ngram_range=ngrams).fit(X_train)
    else:
        raise ValueError('Select correct algorithm: `cv` or `tfidf`')
    
    # transform the documents in the training data to a document-term matrix

    X_train_vectorized = vect.transform(X_train)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_train_vectorized, y_train)
    
    predictions = model.predict(vect.transform(X_test))

    print('Algorithm: ', algorithm)
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))
    print('F1-score: ', f1_score(y_test, predictions))

#### CountVectorizer

In [14]:
get_preds('message', 'cv')

Algorithm:  cv
AUC:  0.9391295915596758
Accuracy:  0.9796905222437138
F1-score:  0.9882747068676717


In [15]:
get_preds('message', 'cv', (1,2))

Algorithm:  cv
AUC:  0.9494744191458827
Accuracy:  0.9825918762088974
F1-score:  0.9899328859060402


In [16]:
get_preds('message', 'cv', (2,2))

Algorithm:  cv
AUC:  0.8442651565106086
Accuracy:  0.9555125725338491
F1-score:  0.9747530186608123


In [17]:
get_preds('text_normalized', 'cv')

Algorithm:  cv
AUC:  0.9224506419456188
Accuracy:  0.9758220502901354
F1-score:  0.9860879243183084


In [18]:
get_preds('text_normalized', 'cv', (1,2))

Algorithm:  cv
AUC:  0.9201272254761258
Accuracy:  0.97678916827853
F1-score:  0.9866666666666667


In [19]:
get_preds('text_normalized', 'cv', (2,2))

Algorithm:  cv
AUC:  0.7586206896551724
Accuracy:  0.9323017408123792
F1-score:  0.9621212121212122


#### TfidfVectorizer

In [20]:
get_preds('message', 'tfidf')

Algorithm:  tfidf
AUC:  0.901760986773205
Accuracy:  0.9700193423597679
F1-score:  0.9828064337215752


In [21]:
get_preds('message', 'tfidf', (1,2))

Algorithm:  tfidf
AUC:  0.903448275862069
Accuracy:  0.9729206963249516
F1-score:  0.9844961240310077


In [22]:
get_preds('message', 'tfidf', (2,2))

Algorithm:  tfidf
AUC:  0.7172413793103448
Accuracy:  0.9206963249516441
F1-score:  0.9559139784946237


In [23]:
get_preds('text_normalized', 'tfidf')

Algorithm:  tfidf
AUC:  0.8690392149257204
Accuracy:  0.9584139264990329
F1-score:  0.9762299613045882


In [24]:
get_preds('text_normalized', 'tfidf', (1,2))

Algorithm:  tfidf
AUC:  0.8644660796710755
Accuracy:  0.9555125725338491
F1-score:  0.9745575221238939


In [25]:
get_preds('text_normalized', 'tfidf', (2,2))

Algorithm:  tfidf
AUC:  0.6344827586206896
Accuracy:  0.8974854932301741
F1-score:  0.9437367303609342


## LogisticRegression model with Word Embeddings

#### Word2Vec from scratch

In [26]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for sentence in data:
        word_list = sentence.split(" ")
        corpus.append(word_list)    
           
    return corpus

corpus = build_corpus(df['text_normalized'])
corpus[0]

# vector_size - Dimensionality of the word vectors
# window - Maximum distance between the current and predicted word within a sentence
# min_count - Ignores all words with total frequency lower than this

model_from_scratch = word2vec.Word2Vec(corpus, vector_size=300, window=10, min_count=20, workers=4)

#### Pretrained Word2Vec

In [27]:
pretrained_model_path = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

# Завантаження моделі Word2Vec із файлу
pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

#### Model training

In [28]:
# Function to convert a text into an embedding vector (average of word vectors)
def text_to_embedding(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [29]:
def train_and_evaluate_model(df, model):
    
    # Convert dataset into embedding vectors
    X = np.array([text_to_embedding(text, model) for text in df['text_normalized']])
    y = df['score']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Train a Logistic Regression model
    model = LogisticRegression(random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))
    print('F1-score: ', f1_score(y_test, predictions))

In [30]:
train_and_evaluate_model(df, model_from_scratch.wv)

AUC:  0.9029401570676202
Accuracy:  0.9216634429400387
F1-score:  0.9538986909504839


In [31]:
train_and_evaluate_model(df, pretrained_model)

AUC:  0.9225778363893046
Accuracy:  0.9274661508704062
F1-score:  0.9572162007986309


## LogisticRegression results

In [38]:
data = {
    'Model': [
        'BoW (original data)', 
        'BoW (normalized data)', 
        'TF-IDF (original data)', 
        'TF-IDF (normalized data)', 
        'Word2Vec (from scratch)', 
        'Pretrained Word2Vec'
    ],
    'AUC': [
        0.949, 0.922, 0.903, 0.869, 0.903, 0.923
    ],
    'Accuracy': [
        0.9826, 0.9758, 0.9729, 0.9584, 0.9217, 0.9275
    ],
    'F1-score': [
        0.9899, 0.9861, 0.9845, 0.9762, 0.9539, 0.9572
    ]
}

df_results = pd.DataFrame(data).sort_values(by='AUC', ascending=False)

print(df_results)

                      Model    AUC  Accuracy  F1-score
0       BoW (original data)  0.949    0.9826    0.9899
5       Pretrained Word2Vec  0.923    0.9275    0.9572
1     BoW (normalized data)  0.922    0.9758    0.9861
2    TF-IDF (original data)  0.903    0.9729    0.9845
4   Word2Vec (from scratch)  0.903    0.9217    0.9539
3  TF-IDF (normalized data)  0.869    0.9584    0.9762


## Висновки

Найкраща модель - **BoW** з n-грамами (1,2) на оригінальних текстах показала найвищі метрики:

**AUC: 0.949<br>
Accuracy: 0.9826<br>
F1-score: 0.9899**

Це означає, що модель, яка використовує BoW з великими n-грамами, краще розуміє контекст і особливості тексту, що робить її найбільш точною для нашої задачі класифікації спаму.

**Pretrained Word2Vec** показав **AUC 0.923** і **Accuracy 0.9275**, що є досить хорошими результатами. **Word2Vec**, **тренований з нуля**, має нижчі результати з **AUC 0.903** та **Accuracy 0.9217**. Це означає, що попередньо навчена модель може краще захоплювати семантичні зв'язки, ніж модель, тренована лише на цьому невеликому наборі даних.

**Переваги та недоліки підходів:**

**BoW**: простий і ефективний для задач класифікації, але може не враховувати семантичні зв'язки між словами.

**TF-IDF**: підсилює важливість унікальних слів і зменшує вплив часто вживаних, але менш ефективний у розумінні контексту, ніж Word2Vec.

**Word2Vec**: розуміє семантичні зв'язки між словами, краще підходить для складних текстів і довгих контекстів, але потрібні великі дані для ефективного тренування моделі з нуля. Попередньо навчена модель може краще працювати та класифікувати дані.

**Вдосконалення моделі:**

- можна спробувати змінити налаштування гіперпараметрів для BoW або TF-IDF
- для Word2Vec можна тренувати модель на більшому корпусі текстів або спробувати інші моделі ембедингів
- використання більш складних моделей класифікації.