### This notebook aims to create a spam message generation model based on several datasets containing spam messages.
#### Potentially, it could be useful for further spam/non-spam classification, either to overcome class imbalance or to extract features of spam messages.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv
/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv


In [None]:
import warnings

# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation and numerical warnings from libraries.
warnings.filterwarnings("ignore")

from tqdm import tqdm

# Load the data

In [None]:

# The SMS Spam Collection dataset is currently left out of the corpus:
# train_data1 = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='ISO-8859-1')
# ref_data1 = train_data1[train_data1['v1'] != 'spam']['v2']
# train_data1 = train_data1[train_data1['v1'] == 'spam']['v2']

# Spam Mails dataset: `label` holds strings, the message body is in `text`.
mails_df = pd.read_csv('../input/spam-mails-dataset/spam_ham_dataset.csv')
ref_data2 = mails_df.loc[mails_df['label'] != 'spam', 'text']
train_data2 = mails_df.loc[mails_df['label'] == 'spam', 'text']

# Spam-or-not-spam dataset: `label` is compared against 1, the body is in `email`.
emails_df = pd.read_csv('../input/spam-or-not-spam-dataset/spam_or_not_spam.csv')
ref_data3 = emails_df.loc[emails_df['label'] != 1, 'email']
train_data3 = emails_df.loc[emails_df['label'] == 1, 'email']

# train_data: spam messages used for modelling; ref_data: non-spam reference corpus.
train_data = pd.concat([train_data2, train_data3])
ref_data = pd.concat([ref_data2, ref_data3])

# EDA

In [None]:
import re 
import nltk
from spacy.lang.en import English
from nltk.corpus import stopwords

nlp = English()

from functools import reduce
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re 
from wordcloud import WordCloud

%matplotlib inline

### Check for NaNs

In [None]:
# Number of missing messages in the spam corpus and in the reference corpus.
train_data.isna().sum(), ref_data.isna().sum()

In [None]:
# Drop missing messages from BOTH corpora: the preprocessing further down
# lower-cases every message with str.lower, which raises on NaN values.
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run.
train_data = train_data.dropna()
ref_data = ref_data.dropna()

### Statistics

In [None]:
# Plain whitespace tokenisation is enough for rough corpus statistics.
token_data = [t.lower().split() for t in train_data.values]

# Collect distinct tokens with a set comprehension. Concatenating lists with
# reduce re-copies the growing list for every message (quadratic time).
unique_tokens = {token for tokens in token_data for token in tokens}

print(f'Total number of messages: {len(train_data)}')
print(f'Total number of tokens: {sum(len(t) for t in token_data)}')
print(f'Total number of UNIQUE tokens: {len(unique_tokens)}')

As a rough estimate (punctuation and other irrelevant characters are still included), we have more than 500k tokens and more than 50k unique tokens.

In [None]:
# investigating text lengths (whitespace-separated tokens per message)
text_lens = [len(t) for t in token_data]

fig, axes = plt.subplots(2, 1, figsize=(15, 7))

# Pass the values as `x=` explicitly: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases, which can flip
# the orientation of the box plot relative to the histogram below it.
sns.boxplot(x=text_lens, ax=axes[0])
sns.histplot(x=text_lens, kde=True, ax=axes[1])
axes[0].set_title('Message length (tokens)')
axes[1].set_xlabel('Tokens per message')
plt.show()

In [None]:
# closer look on text lens: ignore extreme outliers so the bulk is visible
MAX_PLOT_LEN = 2000
text_lens = [len(t) for t in token_data if len(t) < MAX_PLOT_LEN]
mean_len = np.mean(text_lens)

fig, axes = plt.subplots(2, 1, figsize=(15, 7))

# `x=` keeps both panels horizontal regardless of the seaborn version.
sns.boxplot(x=text_lens, ax=axes[0])
sns.histplot(x=text_lens, kde=True, ax=axes[1])
axes[1].axvline(x=mean_len, ymax=0.8, c='r')

# Position the label from the actual axis limits instead of hard-coded data
# coordinates, so it stays just above the mean line for any dataset.
label_y = axes[1].get_ylim()[1] * 0.82
axes[1].text(mean_len, label_y, f'mean = {int(round(mean_len, 0))}', c='r', ha='center')
axes[1].set_xlabel('Tokens per message')
plt.show()

We observe many outliers, some of which reach nearly 12k tokens in a single message. The average length, however, is 169 tokens, and 75% of the messages have no more than 186 tokens.

In [None]:
# most frequent tokens
# Flatten with a comprehension (linear) rather than reduce(list + list),
# which copies the accumulated list once per message.
all_tokens = [token for tokens in token_data for token in tokens]
top_tokens = pd.Series(all_tokens).value_counts().head(30)

plt.figure(figsize=(15, 8))
sns.barplot(y=top_tokens.index, x=top_tokens.values, orient='h')
plt.title('Most frequent tokens')
plt.show()

In [None]:
# Join all messages in one pass; building the string pairwise with reduce
# copies the accumulated text for every message.
all_texts_together = ''.join(map(str, train_data))

# Escape the punctuation characters so that ']', '\\', '^' and '-' are always
# taken literally inside the character class.
punctuation_pattern = re.compile(f'[^{re.escape(string.punctuation)}]')

# Strip everything that is NOT punctuation, leaving only punctuation marks.
punctuation = punctuation_pattern.sub('', all_texts_together)

top_punct = pd.Series(list(punctuation)).value_counts()

plt.figure(figsize=(15, 8))
sns.barplot(y=top_punct.index, x=top_punct.values, orient='h')
plt.title('Most frequent punctuations')
plt.show()

For further EDA, some preprocessing is required.

In [None]:
class EDAPreprocesser:
    """Text normaliser used for the exploratory analysis.

    `transform` runs, in order: lower-casing, removal of non-letter
    characters, tokenisation, stop-word removal, stemming, removal of
    whitespace tokens and removal of over-long tokens.

    Parameters
    ----------
    max_word_len : int, default 20
        Tokens with this many characters or more are discarded.
    """

    def __init__(self, max_word_len=20):
        # 'A-z' would also match the ASCII characters between 'Z' and 'a'
        # ([ \ ] ^ _ `), so both letter ranges are spelled out explicitly.
        self.punkt_pattern = re.compile('[^a-zA-Z ]')
        self.tokenizer = nlp.tokenizer
        self.stemmer = nltk.stem.SnowballStemmer('english')
        # A frozenset gives constant-time membership tests (a list is linear).
        self.stopwords = frozenset(stopwords.words('english'))
        self.max_word_len = max_word_len

    def delete_punctuation(self, texts):
        """Replace every character that is not a letter or a space with a space.

        A space (not an empty string) is used so that words separated only by
        punctuation or a line break are not glued together.
        """
        return [self.punkt_pattern.sub(' ', t) for t in tqdm(texts)]

    def stem(self, texts):
        """Stem every token of every tokenised text."""
        return [[self.stemmer.stem(token) for token in text] for text in tqdm(texts)]

    def tokenize(self, texts):
        """Split each text into a list of token strings."""
        return [[str(token) for token in self.tokenizer(text)] for text in tqdm(texts)]

    def delete_stopwords(self, texts):
        """Drop tokens that appear in the English stop-word list."""
        return [[t for t in text if t not in self.stopwords] for text in tqdm(texts)]

    def remove_spaces(self, texts):
        """Drop tokens that consist only of whitespace."""
        return [[t for t in text if not t.isspace()] for text in texts]

    def remove_long_words(self, texts):
        """Drop tokens whose length is `max_word_len` or more."""
        return [[t for t in text if len(t) < self.max_word_len] for text in texts]

    def transform(self, texts):
        """Run the full pipeline; returns one list of tokens per input text."""
        print('Lowering...')
        texts = [*map(str.lower, texts)]  # lower

        print('Removing punctuation...')
        texts = self.delete_punctuation(texts)  # delete punctuation

        print('Tokenization...')
        texts = self.tokenize(texts)

        # Stop words are matched BEFORE stemming: the stop-word list holds
        # unstemmed forms, so stemmed variants would otherwise slip through.
        print('Removing stopwords...')
        texts = self.delete_stopwords(texts)

        print('Stemming...')
        texts = self.stem(texts)

        print('Removing spaces...')
        texts = self.remove_spaces(texts)

        print('Removing long words...')
        texts = self.remove_long_words(texts)

        return texts
        

In [None]:
# Single preprocessing instance reused for the EDA cells below.
eda_prep = EDAPreprocesser()

In [None]:
# Preprocess the spam corpus; each Series element is one message's token list.
spam_token_lists = eda_prep.transform(train_data)
texts_prep = pd.Series(spam_token_lists)

In [None]:
# Flatten the per-message token lists in linear time (reduce(list + list)
# re-copies the accumulated list for every message).
tokens_list = pd.Series([token for tokens in texts_prep for token in tokens])
top_words = tokens_list.value_counts().head(30)

plt.figure(figsize=(15, 8))
sns.barplot(y=top_words.index, x=top_words.values, orient='h')
plt.title('Most frequent tokens')
plt.show()

In [None]:
# Build one space-separated string from all tokens and render it as a word cloud.
cloud_text = ' '.join(tokens_list)
cloud_options = dict(
    background_color='white',
    width=800,
    height=400,
    random_state=42,
    collocations=False,
)
wordcloud = WordCloud(**cloud_options).generate(cloud_text)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Pair every token with its successor inside the same message.
bigrams_list = pd.Series([
    ' '.join(pair)
    for text in texts_prep
    for pair in zip(text, text[1:])
])
top_bigrams = bigrams_list.value_counts().head(30)

plt.figure(figsize=(15, 8))
sns.barplot(y=top_bigrams.index, x=top_bigrams.values, orient='h')
# The chart shows bigrams, not single tokens.
plt.title('Most frequent bigrams')
plt.show()

In [None]:
# key words: preprocess the non-spam reference corpus like the spam texts.
# Missing (NaN) messages are dropped first because `transform` lower-cases
# every item with str.lower, which raises on non-string values.
ref_prep = eda_prep.transform(ref_data.dropna())

In [None]:
# Flatten the reference token lists in linear time instead of reduce(list + list).
ref_list = pd.Series([token for tokens in ref_prep for token in tokens])

In [None]:
# Token frequencies in the spam corpus and in the reference (non-spam) corpus.
freqs_texts = dict(tokens_list.value_counts())
freqs_ref = dict(ref_list.value_counts())

# Score = spam frequency divided by reference frequency; tokens that never
# occur in the reference corpus are divided by 1.
keyword_score = {}
for token, freq in freqs_texts.items():
    ref_freq = freqs_ref.get(token, 1)
    keyword_score[token] = freq / ref_freq

In [None]:
# (token, score) pairs ordered from the highest score to the lowest.
top_keywords = sorted(keyword_score.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Bar chart of the 30 best-scoring keywords.
best_keywords = top_keywords[:30]
keyword_names = [name for name, _score in best_keywords]
keyword_values = [score for _name, score in best_keywords]

plt.figure(figsize=(15, 8))
sns.barplot(y=keyword_names, x=keyword_values, orient='h')
plt.title('Most frequent keywords')
plt.show()

# Create Dataset

In [None]:
import torch

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

In [None]:
class Preprocesser:
    """Light text normaliser for the generation model.

    Unlike the EDA preprocessing, punctuation and digits are kept. `transform`
    lower-cases the text, removes characters outside the allowed set,
    tokenises, and drops whitespace tokens and over-long tokens.

    Parameters
    ----------
    max_word_len : int, default 15
        Tokens with this many characters or more are discarded.
    """

    def __init__(self, max_word_len=15):
        self.tokenizer = nlp.tokenizer
        # Allowed characters: ASCII letters, digits, punctuation and the space.
        # The punctuation is escaped so that ']', '\\', '^' and '-' are taken
        # literally inside the character class.
        allowed = 'a-zA-Z0-9' + re.escape(string.punctuation) + ' '
        self.chars_pattern = re.compile(f'[^{allowed}]')
        self.max_word_len = max_word_len

    def tokenize(self, texts):
        """Split each text into a list of token strings."""
        return [[str(token) for token in self.tokenizer(text)] for text in tqdm(texts)]

    def remove_spaces(self, texts):
        """Drop tokens that consist only of whitespace."""
        return [[t for t in text if not t.isspace()] for text in texts]

    def remove_long_words(self, texts):
        """Drop tokens whose length is `max_word_len` or more."""
        return [[t for t in text if len(t) < self.max_word_len] for text in texts]

    def remove_chars(self, texts):
        """Replace every disallowed character with a single space.

        Line breaks and tabs are outside the allowed set; substituting a space
        (rather than an empty string) keeps the words on either side apart.
        """
        return [self.chars_pattern.sub(' ', text) for text in tqdm(texts)]

    def transform(self, texts):
        """Run the full pipeline; returns one list of tokens per input text."""
        print('Lowering...')
        texts = [*map(str.lower, texts)]  # lower

        print('Removing some characters...')
        texts = self.remove_chars(texts)

        print('Tokenization...')
        texts = self.tokenize(texts)

        print('Removing spaces...')
        texts = self.remove_spaces(texts)

        print('Removing long words...')
        texts = self.remove_long_words(texts)

        return texts

    
class Indexer:
    """Bidirectional mapping between vocabulary tokens and integer ids.

    Parameters
    ----------
    voc : iterable of tokens
        If it is a set or frozenset, ids are assigned in sorted token order so
        that the mapping is the same on every run (set iteration order of
        strings changes between interpreter sessions). Any other iterable
        keeps its own order.
    """

    def __init__(self, voc):
        self.voc = voc
        ordered_tokens = sorted(voc) if isinstance(voc, (set, frozenset)) else list(voc)
        self.token_idx = {tok: i for i, tok in enumerate(ordered_tokens)}
        self.idx_token = {i: tok for tok, i in self.token_idx.items()}

    def encode(self, text):
        """Map a sequence of tokens to ids; raises KeyError for unknown tokens."""
        return [self.token_idx[tok] for tok in text]

    def decode(self, text):
        """Map a sequence of ids back to tokens; raises KeyError for unknown ids."""
        return [self.idx_token[tok] for tok in text]

class Dataset:
    """Map-style dataset of (context window, next token) pairs.

    Parameters
    ----------
    texts : list of list of str
        Tokenised messages.
    context_size : int, default 5
        Number of consecutive tokens used as model input.
    voc : collection of tokens, optional
        Vocabulary used for encoding. When omitted, the distinct tokens found
        in `texts` are used.

    Attributes
    ----------
    token_count : dict mapping each token to its number of occurrences.
    data : list of (context tokens, target token) pairs.
    encoded_data : the same pairs as (list of ids, single-element list of id).
    """

    def __init__(self, texts, context_size=5, voc=None):
        self.context_size = context_size
        self.token_count = self.__count_tokens(texts)
        # Without an explicit vocabulary, fall back to the tokens seen in
        # `texts` instead of failing on len(None).
        self.voc = voc if voc is not None else set(self.token_count)
        self.voc_len = len(self.voc)

        self.data = self.__create_data(texts)
        self.indexer = Indexer(self.voc)
        self.encoded_data = self.__encode_data()

    def __encode_data(self):
        """Encode every (context, target) pair into integer ids."""
        return [(self.indexer.encode(context), self.__ohe_target(target))
                for context, target in self.data]

    def __ohe_target(self, y):
        """Return the target as a single-element list holding its id."""
        return self.indexer.encode([y])

    def __create_data(self, texts):
        """Slide a window over each text and pair it with the token that follows.

        The target is the token immediately after the window, i.e. index
        `start + context_size` (index `start + context_size + 1` would skip
        one token). Texts no longer than `context_size` produce no samples.
        """
        size = self.context_size
        return [(text[start:start + size], text[start + size])
                for text in texts
                for start in range(len(text) - size)]

    def __count_tokens(self, texts):
        """Count token occurrences across all texts (counts start from zero)."""
        token_count = {}

        for text in texts:
            for token in text:
                token_count[token] = token_count.get(token, 0) + 1
        return token_count

    def __getitem__(self, i):
        """Return (context ids, target id) as int32 tensors."""
        context_ids, target_ids = self.encoded_data[i]
        # Build integer tensors directly rather than via a float tensor.
        return (torch.tensor(context_ids, dtype=torch.int32),
                torch.tensor(target_ids, dtype=torch.int32))

    def __len__(self):
        return len(self.data)

In [None]:
# Vocabulary: every distinct token of the preprocessed spam corpus. A set
# comprehension avoids the quadratic list copying of reduce(list + list).
voc = {token for tokens in Preprocesser().transform(train_data) for token in tokens}

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the messages for testing, then 5% of the remainder for
# validation; both splits are shuffled with the same fixed seed.
split_kwargs = dict(shuffle=True, random_state=42)
train_texts, test_texts = train_test_split(train_data, test_size=0.1, **split_kwargs)
train_texts, val_texts = train_test_split(train_texts, test_size=0.05, **split_kwargs)

# One preprocessing instance serves all three splits, which share `voc`.
preprocesser = Preprocesser()
ds_train = Dataset(preprocesser.transform(train_texts), voc=voc)
ds_val = Dataset(preprocesser.transform(val_texts), voc=voc)
ds_test = Dataset(preprocesser.transform(test_texts), voc=voc)

In [None]:
# First training context window next to its encoding by the training indexer.
first_context = ds_train.data[0][0]
first_context, ds_train.indexer.encode(first_context)

In [None]:
# The same training window encoded by the validation dataset's indexer.
train_context = ds_train.data[0][0]
train_context, ds_val.indexer.encode(train_context)

# Modeling

In [None]:
import pytorch_lightning as pl
from torch import optim, nn, utils
from transformers import BertTokenizer, DistilBertModel
import torch

In [None]:
# BATCH_SIZE: samples per training batch.
# SEQ_LEN: tokens per input window (5 is also Dataset's default context_size).
BATCH_SIZE, SEQ_LEN = 16, 5

In [None]:
class TextGenerator(pl.LightningModule):
    """Next-token classifier: embedding -> BiLSTM -> LSTM -> two dense layers.

    The input is a batch of token-id windows of shape (batch, seq_len); the
    output is one row of vocabulary logits per window.

    Parameters
    ----------
    voc_size : int
        Vocabulary size (embedding rows and output logits).
    num_layers : int, default 1
        Number of stacked layers in each LSTM.
    embed_size : int, default 300
        Embedding width, also used as the LSTM hidden size.
    seq_len : int, optional
        Window length the dense head is sized for; defaults to the
        module-level SEQ_LEN.
    """

    def __init__(self, voc_size, num_layers=1, embed_size=300, seq_len=None):
        super().__init__()
        self.voc_size = voc_size
        self.num_layers = num_layers  # honour the argument (was hard-coded to 1)
        self.embed_size = embed_size
        self.seq_len = SEQ_LEN if seq_len is None else seq_len
        self.criterion = nn.CrossEntropyLoss()

        # No explicit device: all sub-modules must live on the same device and
        # the trainer moves the whole module at once.
        self.embedding = nn.Embedding(voc_size, embed_size)

        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only emits a warning.
        lstm_dropout = 0.2 if num_layers > 1 else 0.0

        # batch_first=True: data loader batches are (batch, seq_len), so the
        # recurrence must run along dim 1 rather than across the batch.
        self.bilstm = nn.LSTM(input_size=embed_size,
                              hidden_size=embed_size,
                              num_layers=num_layers,
                              dropout=lstm_dropout,
                              bidirectional=True,
                              batch_first=True)

        self.lstm = nn.LSTM(input_size=embed_size * 2,
                            hidden_size=embed_size,
                            num_layers=num_layers,
                            dropout=lstm_dropout,
                            bidirectional=False,
                            batch_first=True)

        self.fc1 = nn.Linear(self.seq_len * embed_size, self.seq_len * embed_size * 2)
        self.fc2 = nn.Linear(self.seq_len * embed_size * 2, voc_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape (batch, voc_size) for id windows (batch, seq_len)."""
        embed = self.embedding(x)                    # (batch, seq_len, embed)
        # No state is passed in: every window is an independent sample, so the
        # LSTMs start from their default zero state for each batch.
        output, _ = self.bilstm(embed)               # (batch, seq_len, 2 * embed)
        output, _ = self.lstm(output)                # (batch, seq_len, embed)
        # Flatten per sample using the actual batch size, so partial batches
        # and other batch sizes work too.
        output = output.reshape(output.size(0), -1)  # (batch, seq_len * embed)

        output = self.relu(self.fc1(output))
        return self.fc2(output)

    def init_state(self, bi, batch_size=None):
        """Return a zero (h_0, c_0) pair for one of the LSTMs.

        Shape is (num_layers * num_directions, batch, hidden). Kept for
        callers that want an explicit state; `forward` does not need it.
        """
        batch_size = BATCH_SIZE if batch_size is None else batch_size
        shape = (self.num_layers * (2 if bi else 1), batch_size, self.embed_size)
        return (torch.zeros(shape, device=self.device),
                torch.zeros(shape, device=self.device))

    def configure_optimizers(self):
        return optim.Adam(self.parameters())

    def _shared_step(self, batch, stage):
        """Compute and log the cross-entropy loss for one batch."""
        X, y = batch
        logits = self(X)
        # Targets arrive as (batch, 1) integer tensors; the loss expects a
        # flat int64 vector of class ids.
        loss = self.criterion(logits, y.long().reshape(-1))
        self.log(f'{stage}_loss', loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        # Lightning's validation hook; a method named `val_step` is never
        # called by the trainer.
        return self._shared_step(batch, 'val')

    def val_step(self, batch, batch_idx):
        """Backward-compatible alias for `validation_step`."""
        return self.validation_step(batch, batch_idx)

In [None]:
# Shuffle the training windows: consecutive samples are overlapping windows of
# the same message, so unshuffled batches are highly correlated.
# drop_last keeps every training batch at exactly BATCH_SIZE samples.
train_loader = utils.data.DataLoader(ds_train, batch_size=BATCH_SIZE,
                                     shuffle=True, drop_last=True)
# Validation needs no shuffling; batching it avoids one forward pass per sample.
val_loader = utils.data.DataLoader(ds_val, batch_size=BATCH_SIZE)


In [None]:
# Peek at the first (context, target) sample of the training dataset.
sample_iter = iter(train_loader.dataset)
next(sample_iter)

In [None]:
model = TextGenerator(ds_train.voc_len)
# Hardware is selected on the Trainer, not in fit(): Trainer.fit does not
# accept a `gpus` argument. 'auto' picks an available accelerator and falls
# back to the CPU; devices=1 keeps training on a single device.
trainer = pl.Trainer(accelerator='auto', devices=1)
trainer.fit(model, train_loader, val_loader)

In [None]:
# Force a full garbage-collection pass; the cell displays gc.collect()'s
# return value.
import gc
gc.collect()