In [5]:
import pandas as pd
import spacy
import numpy as np
from spacy.lang.en import English
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
en = English()

In [6]:
#!spacy download en_vectors_web_lg

In [7]:
#!pip install -r requirements.txt

Download data from 
http://students.mimuw.edu.pl/~sg385513/sms-spam-collection-dataset.zip

In [8]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=['v1', 'v2'])
df.columns=['label', 'text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
ham,4825
spam,747


### Spliting data

Rember to split data even before any preprocessing, to eliminate your bias of preprocessing params on validation set

In [10]:
train_set, validation_set = train_test_split(df, stratify = df.label, random_state=123)

In [11]:
train_set = train_set[['text', 'label']].get_values()
validation_set = validation_set[['text', 'label']].get_values()

## Tokenization

In [12]:
def unique_words(docs):
    unique_words = set()
    for doc in docs:
        for word in doc:
            unique_words.add(word)
    
    return unique_words 

#### By whitespace

In [13]:
docs = []
labels = []
for text, label in train_set:
    docs.append(text.split())
    labels.append(label)

len(unique_words(docs))

13102

#### With tokenizer

In [14]:
docs = []
labels = []
for tokens, label in en.pipe(train_set, as_tuples=True):
    docs.append(tokens)
    labels.append(label)
    
tokenized_texts = [[token.text for token in doc] for doc in docs]
len(unique_words(tokenized_texts))

9914

### Normalized words
Lowercasing + simple normalization (e.g. hyphens and dashes merged to one symbol)

In [15]:
normalized_texts = [[token.norm_ for token in doc] for doc in docs]

len(unique_words(normalized_texts))

8136

In [16]:
#excercise 1:
unique_normalized = unique_words(normalized_texts)
DICT_SIZE = len(unique_normalized)
WORDS_TO_INDEX = {token: i for i, token in enumerate(unique_normalized)}

def make_BOW_norms(text):
    doc = en(text)
    result_vector = np.zeros(DICT_SIZE + 1)
    for token in doc:
        result_vector[WORDS_TO_INDEX.get(token.norm_, DICT_SIZE)]+=1

    return result_vector

### Dataset class

In [17]:
class SpamData(Dataset):
    def __init__(self, docs, labels, bow_maker):
        self.labels = labels
        self.bows = [bow_maker(doc) for doc in docs]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        bow = self.bows[idx]
        label = self.labels[idx]
        if label =='spam':
            l_id = 1
        else:
            l_id = 0
        sample = (torch.tensor(bow, dtype = torch.float), torch.tensor(l_id))

        return sample

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#excercise 2:

class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.fc2 = nn.Linear(100, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [19]:
#excercise:

def calc_accuracy(model, dataset):
    with torch.no_grad():
        loader = DataLoader(dataset, 1000)
        correct_predictions = 0
        for x, y in loader:
            prediction = model(x)
            prob, pred_label = torch.max(prediction, dim=1)
            assert(len(pred_label) == len(x))
            correct_predictions += int(torch.sum(y==pred_label))
    return correct_predictions/len(dataset)


def calc_f1(model, dataset):
    all_y = []
    all_predictions = []
    with torch.no_grad():
        loader = DataLoader(dataset, 1000)
        for x, y in loader:
            output = model(x)
            prob, pred_label = torch.max(output, dim=1)
            all_y.append(y)
            all_predictions.append(pred_label)
            
    y = np.concatenate(all_y)
    preds = np.concatenate(all_predictions)

    return f1_score(y, preds)

In [20]:
epochs_loss = []
epochs_accuracy = []
epochs_val_accuracy = []

def train(net, train_dataset, validation_dataset, batch_size, epochs):
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.7)
    batches_num = len(train_dataset) // batch_size
    print(f"Will train with {batches_num} batches.")
    
    epochs_loss = []
    epochs_accuracy = []
    epochs_val_accuracy = []
    for epoch in range(epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        correct_predictions = 0
        for inputs, y in (trainloader):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            prob, pred_label = torch.max(outputs, dim=1)
            correct_predictions += int(torch.sum(y==pred_label))

        avg_loss = running_loss/batches_num
        accuracy = correct_predictions/len(train_dataset)
        epochs_loss.append(avg_loss)
        epochs_accuracy.append(accuracy)

        val_accuracy = calc_accuracy(net, validation_dataset)

        epochs_val_accuracy.append(val_accuracy)
        print(f"Loss {avg_loss}, training accuracy {accuracy}, validation accuracy {val_accuracy}")

    print('Finished Training')

In [21]:
batch_size=10

train_x, train_y = train_set.T
val_x, val_y = validation_set.T
train_dataset = SpamData(train_x, train_y, make_BOW_norms)
validation_dataset = SpamData(val_x, val_y, make_BOW_norms)

net = Net(DICT_SIZE + 1)

train(net, train_dataset, validation_dataset, 10, 10)

Will train with 417 batches.


KeyboardInterrupt: 

In [None]:
calc_accuracy(net, validation_dataset)

### Lemmas

In [22]:
lemmatized_texts = [[token.lemma_ for token in doc] for doc in docs]

unique_lemmas = unique_words(lemmatized_texts)
len(unique_lemmas)

8958

In [23]:
LEMMAS_DICT_SIZE = len(unique_lemmas)
LEMMAS_TO_INDEX = {token: i for i, token in enumerate(unique_lemmas)}

def make_BOW_lemmas(text):
    doc = en(text)
    result_vector = np.zeros(LEMMAS_DICT_SIZE + 1)
    for token in doc:
        result_vector[LEMMAS_TO_INDEX.get(token.lemma_, LEMMAS_DICT_SIZE)]+=1

    return result_vector

In [24]:
train_dataset = SpamData(train_x, train_y, make_BOW_lemmas)
validation_dataset = SpamData(val_x, val_y, make_BOW_lemmas)

net = Net(LEMMAS_DICT_SIZE+1)

train(net, train_dataset, validation_dataset, 10, 10)

Will train with 417 batches.
Loss 0.4511007210619444, training accuracy 0.873653984206748, validation accuracy 0.9562096195262024
Loss 0.3553209751487064, training accuracy 0.9674563292653745, validation accuracy 0.9820531227566404
Loss 0.3357818430419163, training accuracy 0.9818138310600623, validation accuracy 0.9842067480258435
Loss 0.3296222898314039, training accuracy 0.9875568317779373, validation accuracy 0.9870782483847811


KeyboardInterrupt: 

In [None]:
calc_accuracy(net, validation_dataset)

### Stems

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stemmed_texts = [[stemmer.stem(token.norm_) for token in doc] for doc in docs]

unique_stems=unique_words(stemmed_texts)
len(unique_stems)

In [None]:
STEM_TO_INDEX = {token: i for i, token in enumerate(unique_stems)}
STEMS_DICT_SIZE = len(unique_stems)

def make_BOW_stems(text):
    doc = en(text)
    result_vector = np.zeros(STEMS_DICT_SIZE+1)
    for token in doc:
        result_vector[STEM_TO_INDEX.get(stemmer.stem(token.norm_), STEMS_DICT_SIZE)]+=1

    return result_vector

In [None]:
train_dataset = SpamData(train_x, train_y, make_BOW_stems)
validation_dataset = SpamData(val_x, val_y, make_BOW_stems)

net = Net(STEMS_DICT_SIZE+1)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

## Hashing Trick

In [None]:
DICT_SIZE = 1000

def make_hashed_BOW_stems(text):
    doc = en(text)
    result_vector = np.zeros(DICT_SIZE)
    for token in doc:
        result_vector[hash(stemmer.stem(token.norm_)) % DICT_SIZE]+=1

    return result_vector

In [None]:
train_dataset = SpamData(train_x, train_y, make_hashed_BOW_stems)
validation_dataset = SpamData(val_x, val_y, make_hashed_BOW_stems)

net = Net(DICT_SIZE)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

## Vectors

In [None]:
en = spacy.load('en')

In [None]:
def make_continous_BOW(text):
    doc = en(text)
    return sum([token.vector for token in doc])

In [None]:
train_dataset = SpamData(train_x, train_y, make_continous_BOW)
validation_dataset = SpamData(val_x, val_y, make_continous_BOW)

net = Net(96)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)