In [1]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt 

import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import  TensorDataset, DataLoader
import torchtext

import nltk 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

import string
from tqdm import tqdm
tqdm.pandas()
from collections import Counter
import re
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("spam.csv", encoding = 'latin-1')
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
data = data[['v1', 'v2']]
data = data.rename({'v1': 'label', 'v2': 'message'}, axis = 1)
data.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
def label_to_int(label):
    return 1 if label == 'ham' else 0

data['message_label'] = data['label'].apply(label_to_int)
data.head(5)

Unnamed: 0,label,message,message_label
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('English'))


def remove_hyperlinks(text):
    return re.sub(r'http\S+', ' ', text)

def remove_punctuation(text):
    return re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\,\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', '', text)

def rm_html_tags(text):
    text = re.sub(r'<.*?>', '', text)
    return re.sub(r'<br />', '', text)

def space_bt_punct(text):
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)  # add whitespaces between punctuation
    s = re.sub(r'\s{2,}', ' ', s)  # remove double whitespaces
    return s

def remove_integers(text):
    return re.sub(r'\d+', '', text)

def rm_whitespaces(text):
    return re.sub('\s+', ' ', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)


def clean_pipeline(text):
    text = text.lower()
    no_link = remove_hyperlinks(text)
    no_html = rm_html_tags(no_link)
    space_punct = space_bt_punct(no_html)
    no_punct = remove_punctuation(space_punct)
    no_number = remove_integers(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    #spell_corrected = spell_correction(no_emoji)
    return no_nonasci


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
data['message'] = data['message'].progress_apply(clean_pipeline)


100%|██████████| 5572/5572 [00:00<00:00, 43868.55it/s]


In [8]:
data.head(5)

Unnamed: 0,label,message,message_label
0,ham,go until jurong point crazy . . available only...,1
1,ham,ok lar . . . joking wif u oni . . .,1
2,spam,free entry in a wkly comp to win fa cup final ...,0
3,ham,u dun say so early hor . . . u c already then ...,1
4,ham,nah i dont think he goes to usf he lives aroun...,1


In [9]:
def tokenize_words(text):
    # Tokenizes the text into list of tokens(words)
    # "go until jurong point crazy" --> "go", "until","jurong", "point", "crazy",
    return word_tokenize(text)

def remove_stopwords(text):
    ## Remove stopwords such as is am it there
    return [word for word in text if word not in stop_words]

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in text:
        pos_tag = nltk.pos_tag([word])[0][1][0].upper()
        wordnet_pos = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}.get(pos_tag, wordnet.NOUN)
        # Lemmatize the word with the specified POS tag
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        lemmas.append(lemma)
    ## Lemmatize words like running --> run
    # make sure lemmas does not contains stopwords
    return remove_stopwords(lemmas)


def preprocess_pipeline(text):
    tokens = tokenize_words(text)
    no_stopwords = remove_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [10]:
data['message'] = data['message'].progress_apply(preprocess_pipeline)

100%|██████████| 5572/5572 [00:08<00:00, 680.44it/s]


In [11]:
data.head(5)

Unnamed: 0,label,message,message_label
0,ham,go jurong point crazy . . available bugis n gr...,1
1,ham,ok lar . . . joking wif u oni . . .,1
2,spam,free entry wkly comp win fa cup final tkts st ...,0
3,ham,u dun say early hor . . . u c already say . . .,1
4,ham,nah dont think go usf life around though,1


In [12]:
reviews = data.message.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()
# build vocabulary
counter = Counter(words)
# only keep top 2000 words
vocab = sorted(counter, key=counter.get, reverse=True)[:2000]
int2word = dict(enumerate(vocab, 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}

In [13]:
reviews_enc = [[word2int[word] if word in word2int else word2int['<UNK>'] for word in review.split()] for review in tqdm(reviews, desc='encoding')]


encoding: 100%|██████████| 5572/5572 [00:00<00:00, 394208.68it/s]


In [14]:
def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        start_index = max(0, seq_length - len(row))
        # if seq_length < len(row) then review will be trimmed
        features[i, start_index:] = np.array(row)[:min(seq_length, len(row))]

    return features


seq_length = 128
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)


In [15]:
type(features)

numpy.ndarray

In [34]:
features.shape, labels.shape

((5572, 128), torch.Size([64]))

In [16]:
labels = data.message_label.to_numpy()

# train test split
train_size = .75  # we will use 75% of whole data as train set
val_size = .5  # and we will use 50% of test set as validation set

# stratify will make sure that train and test set have same distribution of labels
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=1 - train_size, stratify=labels)

# split test set into validation and test set
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, test_size=val_size, stratify=test_y)

In [35]:
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((4179, 128), (697, 128), (4179,), (697,))

In [18]:
# define batch size
batch_size = 64

# create tensor datasets
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# create dataloaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [31]:
len(train_loader)

66

In [36]:
class Rnn_textclassification(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Rnn_textclassification, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding_layer = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding_layer(x)
        output, hidden = self.rnn(embedded)
        # Take the last output of the RNN
        last_output = output[:, -1, :]
        final_out = torch.sigmoid(self.fc(last_output))
        return final_out


In [38]:
vocab_size = len(vocab) + 2
output_size = 1
embedding_dim = 100
hidden_dim = 256
num_layers = 2

model = Rnn_textclassification(vocab_size, embedding_dim, hidden_dim, output_size)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [40]:
lr = 0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [41]:
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    return epoch_loss


In [42]:
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_model(model, criterion, optimizer, train_loader, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')

Epoch [1/10], Loss: 0.4396
Epoch [2/10], Loss: 0.2397
Epoch [3/10], Loss: 0.1711
Epoch [4/10], Loss: 0.1475
Epoch [5/10], Loss: 0.1267
Epoch [6/10], Loss: 0.1165
Epoch [7/10], Loss: 0.1058
Epoch [8/10], Loss: 0.1031
Epoch [9/10], Loss: 0.0918
Epoch [10/10], Loss: 0.0810


In [43]:
def calculate_accuracy(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions
            correct += (predicted == labels.view_as(predicted)).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy

# Calculate accuracy
accuracy = calculate_accuracy(model, test_loader, device)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.96
