In [1]:
import pandas as pd
import spacy
import numpy as np
from spacy.lang.en import English
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
en = English()

In [None]:
!spacy download en

Collecting en_vectors_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz#egg=en_vectors_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz (661.8MB)
[K    31% |██████████▏                     | 211.4MB 10.6MB/s eta 0:00:43

In [None]:
!pip install -r requirements.txt

Download data from 
http://students.mimuw.edu.pl/~sg385513/sms-spam-collection-dataset.zip

In [2]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=['v1', 'v2'])
df.columns=['label', 'text']
df.head()

FileNotFoundError: [Errno 2] File b'spam.csv' does not exist: b'spam.csv'

In [None]:
df.groupby('label').count()

### Spliting data

Rember to split data even before any preprocessing, to eliminate your bias of preprocessing params on validation set

In [None]:
train_set, validation_set = train_test_split(df, stratify = df.label, random_state=123)

In [None]:
train_set = train_set[['text', 'label']].get_values()
validation_set = validation_set[['text', 'label']].get_values()

## Tokenization

In [None]:
def unique_words(docs):
    unique_words = set()
    for doc in docs:
        for word in doc:
            unique_words.add(word)
    
    return unique_words 

#### By whitespace

In [None]:
docs = []
labels = []
for text, label in train_set:
    docs.append(text.split())
    labels.append(label)

len(unique_words(docs))

#### With tokenizer

In [None]:
docs = []
labels = []
for tokens, label in en.pipe(train_set, as_tuples=True):
    docs.append(tokens)
    labels.append(label)
    
tokenized_texts = [[token.text for token in doc] for doc in docs]
len(unique_words(tokenized_texts))

### Normalized words
Lowercasing + simple normalization (e.g. hyphens and dashes merged to one symbol)

In [None]:
normalized_texts = [[token.norm_ for token in doc] for doc in docs]

len(unique_words(normalized_texts))

In [None]:
#excercise 1:

def make_BOW_norms(text):
    #write here

### Dataset class

In [None]:
class SpamData(Dataset):
    def __init__(self, docs, labels, bow_maker):
        self.labels = labels
        self.bows = [bow_maker(doc) for doc in docs]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        bow = self.bows[idx]
        label = self.labels[idx]
        if label =='spam':
            l_id = 1
        else:
            l_id = 0
        sample = (torch.tensor(bow, dtype = torch.float), torch.tensor(l_id))

        return sample

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#excercise 2:

class Net(nn.Module):
    def __init__(self, input_size):
        #write here

    def forward(self, x):
        #write here

In [3]:
#excercise 3:

def calc_accuracy(model, dataset):
    with torch.no_grad():
        #write here

SyntaxError: unexpected EOF while parsing (<ipython-input-3-a1404e1ecfdd>, line 5)

In [None]:
epochs_loss = []
epochs_accuracy = []
epochs_val_accuracy = []

#excercise 4
def train(net, train_dataset, validation_dataset, batch_size, epochs):
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.7)
    batches_num = len(train_dataset) // batch_size
    print(f"Will train with {batches_num} batches.")
    
    epochs_loss = []
    epochs_accuracy = []
    epochs_val_accuracy = []
    for epoch in range(epochs):  # loop over the dataset multiple times
        
        running_loss = 0.0
        correct_predictions = 0
        for inputs, y in (trainloader):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            
            #write here

        #and here

        val_accuracy = calc_accuracy(net, validation_dataset)

        epochs_val_accuracy.append(val_accuracy)
        print(f"Loss {avg_loss}, training accuracy {accuracy}, validation accuracy {val_accuracy}")

    print('Finished Training')

In [None]:
batch_size=10

train_x, train_y = train_set.T
val_x, val_y = validation_set.T
train_dataset = SpamData(train_x, train_y, make_BOW_norms)
validation_dataset = SpamData(val_x, val_y, make_BOW_norms)

net = Net(DICT_SIZE + 1)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

### Lemmas

In [None]:
lemmatized_texts = [[token.lemma_ for token in doc] for doc in docs]

unique_lemmas = unique_words(lemmatized_texts)
len(unique_lemmas)

In [None]:
#excercise 5

def make_BOW_lemmas(text):
    #write here

In [None]:
train_dataset = SpamData(train_x, train_y, make_BOW_lemmas)
validation_dataset = SpamData(val_x, val_y, make_BOW_lemmas)

net = Net(LEMMAS_DICT_SIZE+1)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

### Stems

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

stemmed_texts = [[stemmer.stem(token.norm_) for token in doc] for doc in docs]

unique_stems=unique_words(stemmed_texts)
len(unique_stems)

In [None]:
#excercise 6

def make_BOW_stems(text):
    #write here

In [None]:
train_dataset = SpamData(train_x, train_y, make_BOW_stems)
validation_dataset = SpamData(val_x, val_y, make_BOW_stems)

net = Net(STEMS_DICT_SIZE+1)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

### Hashing tree

In [None]:
#excercise 7

DICT_SIZE = 1000

def make_hashed_BOW_stems(text):
    #write here

In [None]:
train_dataset = SpamData(train_x, train_y, make_hashed_BOW_stems)
validation_dataset = SpamData(val_x, val_y, make_hashed_BOW_stems)

net = Net(DICT_SIZE)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)

## Vectors

In [None]:
en = spacy.load('en')

In [None]:
#excercise 7

def make_continous_BOW(text):
    #write here

In [None]:
train_dataset = SpamData(train_x, train_y, make_continous_BOW)
validation_dataset = SpamData(val_x, val_y, make_continous_BOW)

net = Net(96)

train(net, train_dataset, validation_dataset, 10, 10)

In [None]:
calc_accuracy(net, validation_dataset)