In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')

import gensim
from gensim.models import word2vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import spacy

from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
filepath = '/content/drive/My Drive/dataset/spam.csv'
data = pd.read_csv(filepath, encoding = "ISO-8859-1")
data.shape

(5572, 5)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [None]:
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
data = data[['v1', 'v2']]

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5169 entries, 0 to 5168
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5169 non-null   object
 1   v2      5169 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB


In [None]:
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
stop_words = set(stopwords.words('english')).union({'also', 'would', 'much', 'many'})

negations = {
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    'mightn',
    "mightn't",
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'no',
    'nor',
    'not',
    'shan',
    "shan't",
    'shouldn',
    "shouldn't",
    'wasn',
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"
}

stop_words = stop_words.difference(negations)

In [None]:
nlp = spacy.load("en_core_web_sm", disable = ['parser','ner'])

def normalize_text(raw_review):

    # Remove html tags
    text = re.sub("<[^>]*>", " ", raw_review)

    # Remove emails
    text = re.sub("\S*@\S*[\s]+", " ", text)

    # remove links
    text = re.sub("https?:\/\/.*?[\s]+", " ", text)

     # Convert to lower case, split into individual words
    text = text.lower().split()

    # Replace contractions with their full versions
    text = [contractions.get(word) if word in contractions else word
            for word in text]

    # Re-splitting for the correct stop-words extraction
    text = " ".join(text).split()

    # Remove stop words
    text = [word for word in text if not word in stop_words]

    text = " ".join(text)

    # Remove non-letters
    text = re.sub("[^a-zA-Z' ]", "", text)

    # Lemmatize words. Need to define lemmatizer above
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1 ])

    # Remove excesive whitespaces
    text = re.sub("[\s]+", " ", text)

    return(text)

In [None]:
data['email_normalized'] = data['v2'].progress_apply(normalize_text)

  0%|          | 0/5169 [00:00<?, ?it/s]

In [None]:
data['spam'] = [1 if val == 'spam' else 0 for val in data['v1']]

In [None]:
train_idxs = data.sample(frac=0.8, random_state=42).index
test_idxs = [idx for idx in data.index if idx not in train_idxs]

In [None]:
def get_preds(text_column, model_name, algorithm, ngrams=(1,1)):

    X_train = data.loc[train_idxs, text_column]
    X_test = data.loc[test_idxs, text_column]

    y_train = data.loc[train_idxs, 'spam']
    y_test = data.loc[test_idxs, 'spam']

    if algorithm == 'cv':
        vect = CountVectorizer(ngram_range=ngrams).fit(X_train)
    elif algorithm == 'tfidf':
        vect = TfidfVectorizer(ngram_range=ngrams).fit(X_train)
    else:
        raise ValueError('Select correct algorithm: `cv` or `tfidf`')

    print('Vocabulary length: ', len(vect.vocabulary_))

    # transform the documents in the training data to a document-term matrix

    X_train_vectorized = vect.transform(X_train)
    print('Document-term matrix shape:', X_train_vectorized.shape)

    if model_name == 'logistic_regression':
      model = LogisticRegression(random_state=42)
    elif model_name =='random_forest':
      model = RandomForestClassifier(n_jobs=1, random_state=42)

    model.fit(X_train_vectorized, y_train)

    predictions = model.predict(vect.transform(X_test))

    print('Algorithm: ', algorithm.upper())
    print('Model: ', model_name)
    print('Text: ', text_column)
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))

In [None]:
get_preds('v2', 'logistic_regression', 'cv')

Vocabulary length:  7617
Document-term matrix shape: (4135, 7617)
Algorithm:  CV
Model:  logistic_regression
Text:  v2
AUC:  0.9342616670860946
Accuracy:  0.9825918762088974


In [None]:
get_preds('v2', 'random_forest', 'cv')

Vocabulary length:  7617
Document-term matrix shape: (4135, 7617)
Algorithm:  CV
Model:  random_forest
Text:  v2
AUC:  0.8931623931623931
Accuracy:  0.9758220502901354


In [None]:
get_preds('email_normalized', 'logistic_regression', 'cv')

Vocabulary length:  6403
Document-term matrix shape: (4135, 6403)
Algorithm:  CV
Model:  logistic_regression
Text:  email_normalized
AUC:  0.9219864105360289
Accuracy:  0.9806576402321083


In [None]:
get_preds('email_normalized', 'random_forest', 'cv')

Vocabulary length:  6403
Document-term matrix shape: (4135, 6403)
Algorithm:  CV
Model:  random_forest
Text:  email_normalized
AUC:  0.8717948717948718
Accuracy:  0.9709864603481625


In [None]:
get_preds('v2', 'logistic_regression', 'tfidf')

Vocabulary length:  7617
Document-term matrix shape: (4135, 7617)
Algorithm:  TFIDF
Model:  logistic_regression
Text:  v2
AUC:  0.8450633336129519
Accuracy:  0.9632495164410058


In [None]:
get_preds('v2', 'random_forest', 'tfidf')

Vocabulary length:  7617
Document-term matrix shape: (4135, 7617)
Algorithm:  TFIDF
Model:  random_forest
Text:  v2
AUC:  0.8846153846153846
Accuracy:  0.9738878143133463


In [None]:
get_preds('email_normalized', 'logistic_regression', 'tfidf')

Vocabulary length:  6403
Document-term matrix shape: (4135, 6403)
Algorithm:  TFIDF
Model:  logistic_regression
Text:  email_normalized
AUC:  0.8066017951514135
Accuracy:  0.9545454545454546


In [None]:
get_preds('email_normalized', 'random_forest', 'tfidf')

Vocabulary length:  6403
Document-term matrix shape: (4135, 6403)
Algorithm:  TFIDF
Model:  random_forest
Text:  email_normalized
AUC:  0.8888888888888888
Accuracy:  0.9748549323017408


In [None]:
get_preds('email_normalized', 'logistic_regression', 'tfidf', (1,2))

Vocabulary length:  30968
Document-term matrix shape: (4135, 30968)
Algorithm:  TFIDF
Model:  logistic_regression
Text:  email_normalized
AUC:  0.7889625217869494
Accuracy:  0.9497098646034816


In [None]:
get_preds('email_normalized', 'random_forest', 'tfidf', (1,2))


Vocabulary length:  30968
Document-term matrix shape: (4135, 30968)
Algorithm:  TFIDF
Model:  random_forest
Text:  email_normalized
AUC:  0.8333333333333333
Accuracy:  0.9622823984526112


In [None]:
get_preds('email_normalized', 'logistic_regression', 'cv', (2,2))

Vocabulary length:  24565
Document-term matrix shape: (4135, 24565)
Algorithm:  CV
Model:  logistic_regression
Text:  email_normalized
AUC:  0.7735042735042735
Accuracy:  0.9487427466150871


In [None]:
get_preds('email_normalized', 'random_forest','cv', (2,2))


Vocabulary length:  24565
Document-term matrix shape: (4135, 24565)
Algorithm:  CV
Model:  random_forest
Text:  email_normalized
AUC:  0.7948717948717949
Accuracy:  0.9535783365570599


In [None]:
get_preds('v2', 'logistic_regression', 'cv', (2,2))

Vocabulary length:  34655
Document-term matrix shape: (4135, 34655)
Algorithm:  CV
Model:  logistic_regression
Text:  v2
AUC:  0.8034188034188035
Accuracy:  0.9555125725338491


In [None]:
get_preds('v2', 'random_forest', 'cv', (2,2))

Vocabulary length:  34655
Document-term matrix shape: (4135, 34655)
Algorithm:  CV
Model:  random_forest
Text:  v2
AUC:  0.8205128205128205
Accuracy:  0.9593810444874274


In [None]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for sentence in data:
        word_list = sentence.split(" ")
        corpus.append(word_list)

    return corpus

In [None]:
corpus = build_corpus(data['email_normalized'])

In [None]:
class WordEmbedding:

    def __init__(self):
        self.model = {}

    def convert(self, source, ipnut_file_path, output_file_path):

        # Converts word embeddings from GloVe format to Word2Vec format
        if source == 'glove':
            glove2word2vec(ipnut_file_path, output_file_path)
        elif source in ['word2vec', 'fasttext']:
            pass
        else:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

    def load(self, source, file_path):

        # Loads a specified word embedding model from a file
        if source in ['glove', 'fasttext']:
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        elif source == 'word2vec':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)
        else:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        return self

    def get_model(self, source):

        # Retrieves the loaded word embedding model
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        return self.model[source]

    def get_words(self, source, size=None):

        # Retrieves a list of words from the model
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        if size is None:
            return [w for w in self.get_model(source=source).key_to_index]
        else:
            results = []
            for i, word in enumerate(self.get_model(source=source).key_to_index):
                if i >= size:
                    break
                results.append(word)
            return results

        return Exception('Unexpected flow')

    def get_dimension(self, source):

        # Retrieves the dimension of word vectors in the model
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        return self.get_model(source=source).vectors[0].shape[0]

    def get_vectors(self, source, words=None):

        # Retrieves vectors for specified words or for all words in the model
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        if words is None:
            words = self.get_words(source=source)

        embedding = np.empty((len(words), self.get_dimension(source=source)), dtype=np.float32)
        for i, word in enumerate(words):
            embedding[i] = self.get_vector(source=source, word=word)

        return embedding

    def get_vector(self, source, word):

        # Retrieves the vector representation of a single word
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)

        try:
            return self.model[source][word]
        except KeyError as e:
            dims = self.model[source][0].shape
            vect = np.empty(dims)
            vect[:] = np.nan
            return vect

    def get_synonym(self, source, word, topn=5):

        # Retrieves synonyms for a given word
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)

        try:
            return self.model[source].most_similar(positive=word, topn=topn)
        except KeyError as e:
            raise

    def get_distance_between_two_words(self, source, word1, word2):

        # Calculates cosine similarity between two words in the model
        if source not in ['glove', 'word2vec', 'fasttext']:
            raise ValueError('Possible value of source are glove, word2vec or fasttext')

        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)

        try:
            return self.model[source].similarity(word1, word2)
        except KeyError as e:
            raise

In [None]:
BASE_URL = '/content/drive/My Drive/dataset/pretrained_models'

word2vec_file_path = f'{BASE_URL}/GoogleNews-vectors-negative300.bin'
fasttext_file_path = f'{BASE_URL}/wiki-news-300d-1M.vec'

# adding absolute path for correct gensim work
downloaded_glove_file_path = f'{BASE_URL}/glove.6B.50d.txt'
glove_file_path = f'{BASE_URL}/glove.6B.50d.vec'

In [None]:
word_embedding = WordEmbedding()

In [None]:
word_embedding.convert(source='glove', ipnut_file_path=downloaded_glove_file_path, output_file_path=glove_file_path)

  glove2word2vec(ipnut_file_path, output_file_path)


In [None]:
def tok2vec(tokens, source:str, avg:str):
    """
    Given a list of tokens, return their vector representation.
    Args:
        tokens: List(str) tokenized input
        source: embedding algorithm to use with the WordEmbedding object
        avg: vectors averaging method - `sum` or `mean` of all vectors
    """
    vects = word_embedding.get_vectors(source=source, words=tokens)

    if avg == 'mean':
        return np.nanmean(vects, axis=0)
    elif avg == 'sum':
        return np.nansum(vects, axis=0)
    else:
        raise ValueError('Select correct averaging method: sum or mean')

In [None]:
def get_preds(text_column, model_name, source, file_path ):

    word_embedding.load(source=source, file_path=file_path)

    X_train = data.loc[train_idxs, text_column].apply(
        word_tokenize).apply(lambda x: tok2vec(x, source, 'mean')).to_numpy()

    X_test = data.loc[test_idxs, text_column].apply(
        word_tokenize).apply(lambda x: tok2vec(x, source, 'mean')).to_numpy()

    X_train = np.stack(X_train, axis=0)
    X_test = np.stack(X_test, axis=0)

    y_train = data.loc[train_idxs, 'spam']
    y_test = data.loc[test_idxs, 'spam']

    imputer= SimpleImputer(strategy='most_frequent').set_output(transform='pandas')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    smote = SMOTE()
    X_train, y_train = smote.fit_resample(X_train, y_train)

    if model_name == 'logistic_regression':
      model = LogisticRegression(random_state=42)
    elif model_name =='random_forest':
      model = RandomForestClassifier(n_jobs=1, random_state=42)

    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    print('\nAlgorithm: ', source.upper())
    print('Model: ', model_name)
    print('Text: ', text_column)
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))

In [None]:
get_preds('email_normalized', 'logistic_regression', 'word2vec', word2vec_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  WORD2VEC
Model:  logistic_regression
Text:  email_normalized
AUC:  0.9101818452963492
Accuracy:  0.9332688588007737


In [None]:
get_preds('email_normalized', 'random_forest', 'word2vec', word2vec_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  WORD2VEC
Model:  random_forest
Text:  email_normalized
AUC:  0.8930738472723204
Accuracy:  0.9690522243713733


In [None]:
get_preds('v2', 'logistic_regression', 'word2vec', word2vec_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  WORD2VEC
Model:  logistic_regression
Text:  v2
AUC:  0.9483637651576583
Accuracy:  0.9613152804642167


In [None]:
get_preds('v2', 'random_forest',  'word2vec', word2vec_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  WORD2VEC
Model:  random_forest
Text:  v2
AUC:  0.9374446588187046
Accuracy:  0.9816247582205029


In [None]:
get_preds('email_normalized', 'logistic_regression', 'glove', glove_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  GLOVE
Model:  logistic_regression
Text:  email_normalized
AUC:  0.863643989598188
Accuracy:  0.8771760154738878


In [None]:
get_preds('email_normalized', 'random_forest', 'glove', glove_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  GLOVE
Model:  random_forest
Text:  email_normalized
AUC:  0.8775270530995721
Accuracy:  0.9613152804642167


In [None]:
get_preds('v2', 'logistic_regression', 'glove', glove_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  GLOVE
Model:  logistic_regression
Text:  v2
AUC:  0.8760077920383263
Accuracy:  0.8858800773694391


In [None]:
get_preds('v2', 'random_forest', 'glove', glove_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  GLOVE
Model:  random_forest
Text:  v2
AUC:  0.8956230368444109
Accuracy:  0.960348162475822


In [None]:
get_preds('email_normalized', 'logistic_regression', 'fasttext', fasttext_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  FASTTEXT
Model:  logistic_regression
Text:  email_normalized
AUC:  0.8940897948531537
Accuracy:  0.9245647969052224


In [None]:
get_preds('email_normalized', 'random_forest', 'fasttext', fasttext_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  FASTTEXT
Model:  random_forest
Text:  email_normalized
AUC:  0.9006188891685075
Accuracy:  0.9758220502901354


In [None]:
get_preds('v2', 'logistic_regression', 'fasttext', fasttext_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  FASTTEXT
Model:  logistic_regression
Text:  v2
AUC:  0.95254872354109
Accuracy:  0.9555125725338491


In [None]:
get_preds('v2', 'random_forest', 'fasttext', fasttext_file_path)

  return np.nanmean(vects, axis=0)
  return np.nanmean(vects, axis=0)



Algorithm:  FASTTEXT
Model:  random_forest
Text:  v2
AUC:  0.9288976502716961
Accuracy:  0.9796905222437138


## Висновки

На основі результатів, отриманих під час проведення аналізу, можна зробити кілька важливих висновків:

1. **Ефективність моделей**:
   - Модель логістичної регресії, навчена на **ненормалізованих даних** з використанням методу CountVectorizer (CV) і модель LogisticRegression, показала найвищий AUC (0.934) і Accuracy (98.3%). Це вказує на високу здатність моделі правильно класифікувати дані.
   - Другим за ефективністю варіантом є модель LogisticRegression, що використовує нормалізовані електронні листи (email_normalized) з методом CountVectorizer (CV), яка також продемонструвала вражаючий AUC (0.922) і Accuracy (98.1%).

2. **Порівняння з Random Forest**:
   - Хоча моделі Random Forest показують хороші результати, жодна з них не перевершує результати LogisticRegression за показниками AUC і Accuracy. Наприклад, модель Random Forest на основі **ненормалізованих даних** має AUC 0.893 та Accuracy 97.6%, що є нижчими значеннями в порівнянні з логістичною регресією.
   
3. **Вплив вектораізації**:
   - Використання різних методів вектораізації (CountVectorizer, TFIDF, Word2Vec, GloVe, FastText) показало різні результати. LogisticRegression на основі Word2Vec для **ненормалізованих даних** показала AUC 0.948, що свідчить про ефективність цього методу векторизації.
   - Хоча TFIDF також надавало хороші результати, Word2Vec та FastText виявилися більш ефективними.

4. **Загальні результати**:
   - Загалом, результати вказують на високу якість класифікації для всіх моделей, проте LogisticRegression виявилася найефективнішою у більшості випадків. Зокрема, важливо відзначити, що моделі на основі **ненормалізованих даних** показують кращі результати в порівнянні з тими, що використовують **email_normalized**.
   - Використання n-грам у методах CountVectorizer та TFIDF призводить до погіршення результатів порівняно з використанням лише уніграм. Це можна побачити за зниженням метрик.
   - Високі значення метрики Accuracy можуть свідчити про перенавчання моделі (overfitting). Однак, метрика AUC (Area Under the Curve) в даному випадку показує приблизно хороші результати, що вказує на те, що модель все ще здатна розрізняти класи та має прийнятну дискримінативну здатність.
   Отже, хоча висока Accuracy може вводити в оману, AUC допомагає краще оцінити реальну ефективність моделі.

