## importing required libraries

In [104]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import chardet
import random
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score

## Getting a text dataset
the dataset we are going to be using is kaggle spam dataset

In [40]:
with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())

df = pd.read_csv('spam.csv', encoding=result['encoding'])

In [41]:
df.head ()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing our dataset

In [42]:
# renaming dataset columns
df.rename(columns={'v2': 'text', "v1": 'target'}, inplace=True)

In [44]:
# deleting unwanted columns
del df["Unnamed: 2"]
del df["Unnamed: 3"]
del df["Unnamed: 4"]

In [45]:
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [46]:
# converting target column to one hot encoded
df = pd.get_dummies (df, columns = ["target"])

In [48]:
df.head ()

Unnamed: 0,text,target_ham,target_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


#### spam = 0 and Ham = 1

In [49]:
del df["target_spam"]
df.rename(columns={"target_ham": "target"}, inplace=True)
df.head ()

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",1
1,Ok lar... Joking wif u oni...,1
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,1
4,"Nah I don't think he goes to usf, he lives aro...",1


In [59]:
# Let's visualise some random training examples
random_index = random.randint (0, len (df) - 5)
for row in df[["text", "target"]][random_index: random_index + 5].itertuples ():
    _, text, target = row
    print (f"Target: {target}", "(Ham)" if target > 0 else "(Spam)")
    print (f"Text:\n{text}\n")
    print ("----\n")

Target: 1 (Ham)
Text:
Dunno leh cant remember mayb lor. So wat time r we meeting tmr?

----

Target: 1 (Ham)
Text:
Best msg: It's hard to be with a person, when u know that one more step foward will make u fall in love.. &amp; One step back can ruin ur friendship.. good night:-) ...

----

Target: 0 (Spam)
Text:
URGENT! Your Mobile number has been awarded with a å£2000 prize GUARANTEED. Call 09061790126 from land line. Claim 3030. Valid 12hrs only 150ppm

----

Target: 1 (Ham)
Text:
Helloooo... Wake up..! \Sweet\" \"morning\" \"welcomes\" \"You\" \"Enjoy\" \"This Day\" \"with full of joy\".. \"GUD MRNG\"."

----

Target: 1 (Ham)
Text:
Vikky, come around  &lt;TIME&gt; ..

----



In [67]:
# Checking the num of classes in our target variable
df.target.value_counts ()

1    4825
0     747
Name: target, dtype: int64

## Data balancing
it's quiet obvious that our dataset category is imbalanced so let's balance it

#### Resampling Techniques:

- **Over-sampling**: Increase the number of instances in the minority class by randomly duplicating them or generating synthetic examples. Popular techniques include SMOTE (Synthetic Minority Over-sampling Technique).
- **Under-sampling**: Reduce the number of instances in the majority class by randomly removing examples. Be cautious with under-sampling as it may lead to loss of valuable information.

In [69]:
# defining our depedent and independent variables
x = df["text"]
y = df["target"]

# splitting dataset into train and test
x_train, x_test, y_train, y_test = train_test_split (x, y, train_size = 0.8, random_state = 42)

# Display the class distribution before applying SMOTE
print("Class distribution before SMOTE:")
print(y_train.value_counts())

# Apply SMOTE to the training set with a specific sampling strategy
smote = SMOTE(sampling_strategy=0.7, random_state=42)
x_train_sampled, y_train_sampled = smote.fit_resample(x_train, y_train)

# Display the class distribution after applying SMOTE
print("\nClass distribution after SMOTE:")
print(y_train.value_counts())

Class distribution before SMOTE:
1    3860
0     597
Name: target, dtype: int64


ValueError: could not convert string to float: "No I'm in the same boat. Still here at my moms. Check me out on yo. I'm half naked."

## Splitting our dataset

In [74]:
# defining our depedent and independent variables
x = df["text"]
y = df["target"]

# splitting dataset into train and test
x_train, x_test, y_train, y_test = train_test_split (x.to_numpy (), y.to_numpy (), train_size = 0.8, random_state = 42)


## text Tokenization

In [75]:
# Find the average number tokens in the training tweets
round (sum ([len (i.split ()) for i in x_train]) / len (x_train))

15

In [76]:
# tokenizing our text dataset
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## setup text vectorization variables
max_vocab_length = 1000
max_length = 15  # how long our sequence will be

text_vectorizer = TextVectorization (max_tokens = max_vocab_length,
                                     output_mode = "int",
                                     output_sequence_length = max_length)

In [77]:
# Fit the text vectorizer to the traininig text
text_vectorizer.adapt (x_train)

In [78]:
# Create a sample sentence and tokenize it
sample_sentence = "osas ceo of coding pivots?"
text_vectorizer ([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 1,  1, 15,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int64)>

In [83]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice (x_train)
print ("Original text:\n", random_sentence, "\n\nTokenize version:\n",  text_vectorizer ([random_sentence]))

Original text:
 Haha... Really oh no... How? Then will they deduct your lesson tmr? 

Tokenize version:
 tf.Tensor([[239 162 129  40  54  64  35 110   1  14 581 417   0   0   0]], shape=(1, 15), dtype=int64)


In [84]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary ()  # get all the unique words in our vocabulary
top_5_words = words_in_vocab[:5]  # get the most common words
bottom_5_words = words_in_vocab[-5:]  # get the least common words
print (f"Number of words in vocab: {len (words_in_vocab)}")
print (f"5 most common words: {top_5_words}")
print (f"5 least common words: {bottom_5_words}")

Number of words in vocab: 1000
5 most common words: ['', '[UNK]', 'to', 'i', 'you']
5 least common words: ['paying', 'nyt', 'noon', 'none', 'nigeria']


## Create an embedding using Tensorflow embedding layer
Embeddings transform positive integers into compact vectors of a consistent size. The crucial parameters for our embedding layer are as follows:
- `input_dim`: the size of our vocabulary
- `output_dim`: the dimensionality of the resulting embedding vector; for instance, a setting of 200 implies that each token is encoded as a vector with 200 dimensions
- `input_length`: the length of the sequences supplied to the embedding layer

In [85]:
from tensorflow.keras import layers

embedding = layers.Embedding (input_dim = max_vocab_length, 
                              output_dim = 128,
                             input_length = max_length)

embedding

<tensorflow.python.keras.layers.embeddings.Embedding at 0x18f30009e50>

In [87]:
## So let's test our embedding tf layer with a random sample
random_sentence = random.choice (x_train)
print (f"Original text: \n {random_sentence} \n\nTokenized version: \n {text_vectorizer ([random_sentence])}\n\nEmbedded Version: ")
sample_embed = embedding (text_vectorizer ([random_sentence]))
sample_embed

Original text: 
 URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU 

Tokenized version: 
 [[190 464 784  13   1 883  95  10  14 579  69  22 175  55 169]]

Embedded Version: 


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00430927,  0.03193492, -0.03968967, ..., -0.03643664,
         -0.01620886,  0.01653708],
        [ 0.0017296 , -0.01883905,  0.0421573 , ...,  0.0250286 ,
          0.04810629,  0.0300197 ],
        [-0.02490152, -0.032251  ,  0.02536373, ...,  0.01567742,
          0.03691034,  0.00890841],
        ...,
        [ 0.0042261 ,  0.04831162, -0.00775467, ..., -0.02069246,
          0.0471291 ,  0.01411555],
        [-0.01191616,  0.04681437, -0.03649992, ..., -0.01919986,
          0.00486423,  0.04056931],
        [ 0.00250328, -0.01777841, -0.01844982, ...,  0.01759103,
          0.03286393, -0.01302002]]], dtype=float32)>

In [88]:
# Check out a single token's embeddings
print (f"single token embed: \n{sample_embed[0][0]} \n\nembed shape: {sample_embed[0][0].shape} \nthe single token: {random_sentence.split ()[0]} \nThe full sentence: {random_sentence}")

single token embed: 
[-4.30927426e-03  3.19349207e-02 -3.96896712e-02  4.20137309e-02
  2.76331417e-02  3.23105194e-02 -6.42098114e-03 -4.14739363e-02
  1.85583904e-03 -4.02107127e-02  7.52482563e-03  2.65669934e-02
 -2.35676765e-04 -3.03605329e-02  3.92581150e-03  4.59532477e-02
 -6.86192513e-03 -1.89391524e-03  8.42101872e-05 -4.60703969e-02
  4.88206185e-02 -9.17595625e-03 -2.59319898e-02  4.26017381e-02
 -2.72915605e-02 -3.43948007e-02  2.13908032e-03 -2.48587616e-02
 -3.87796760e-02 -2.57640015e-02  3.11478637e-02 -1.90796740e-02
  8.94080475e-03 -4.83648665e-02  4.47187312e-02 -9.71783325e-03
 -1.48541816e-02 -2.52982378e-02 -2.42007133e-02 -1.93093307e-02
  2.27768458e-02  4.90573905e-02  8.89512151e-03 -4.28089760e-02
  4.35368754e-02  4.15120758e-02  4.07089032e-02  1.98472850e-02
  2.96027921e-02 -1.10615976e-02 -4.53276411e-02 -3.60886827e-02
 -1.27412789e-02  1.67534389e-02  1.80754177e-02  4.80070971e-02
  4.53468412e-03 -3.00048236e-02 -2.29531880e-02 -3.74512300e-02
  1.

### Model 0: Getting a baseline
as with all machine learning modelling experiments, it's important to create a baseline model so you have got a benchmark for future experiments to build upon

To create our baseline model, we'll use sklearn Multinomial naive bayes using the TF-IDF formula to convert our words to numbers

**NB:** ***it's common practice to use Non DL algorithm as a baseline because of their speed and then later use Dl to see if you can improve upon them***

In [96]:
model_0 = Pipeline ([("tfidf", TfidfVectorizer ()),
                      ("clf", MultinomialNB ())])

model_0.fit (x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [99]:
# evaluating our model performance
model_0.score (x_test, y_test)

0.9623318385650225

In [106]:
# Getting the confusion_matrix and accuracy_score
print (f"Accuracy_score:  {accuracy_score (y_test, y_pred)} \n\n Confusion_matrix: \n{confusion_matrix(y_test, y_pred)}")

Accuracy_score:  0.9623318385650225 

 Confusion_matrix: 
[[108  42]
 [  0 965]]


In [108]:
# A function that evaluate our entire model results
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results (y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of binary classification model
    """
    
    model_accuracy = accuracy_score (y_true, y_pred)
    precision, recall, f1_score, _ = precision_recall_fscore_support (y_true, y_pred, average = "weighted")

    model_result = {"accuracy": model_accuracy,
                    "recall": recall,
                    "Precision": precision,
                    "f1_score": f1_score}
    return model_result

calculate_results (y_test, y_pred)

{'accuracy': 0.9623318385650225,
 'recall': 0.9623318385650225,
 'Precision': 0.9639029038880305,
 'f1_score': 0.9596669569615455}

In [100]:
# making predictions
y_pred = model_0.predict (x_test)
pd.DataFrame ({"text": x_test[:15], "Prediction": y_pred[:15], "Actual": y_test[:15]})

Unnamed: 0,text,Prediction,Actual
0,"Funny fact Nobody teaches volcanoes 2 erupt, t...",1,1
1,I sent my scores to sophas and i had to do sec...,1,1
2,We know someone who you know that fancies you....,1,0
3,Only if you promise your getting out as SOON a...,1,1
4,Congratulations ur awarded either å£500 of CD ...,0,0
5,"I'll text carlos and let you know, hang on",1,1
6,K.i did't see you.:)k:)where are you now?,1,1
7,No message..no responce..what happend?,1,1
8,Get down in gandhipuram and walk to cross cut ...,1,1
9,You flippin your shit yet?,1,1
