In [116]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [117]:
!wget -nc https://raw.githubusercontent.com/highcansavci/nlp-notebooks/master/spam.csv

File ‘spam.csv’ already there; not retrieving.



In [118]:
!head spam.csv

v1,v2,,,
ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
ham,Ok lar... Joking wif u oni...,,,
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
ham,U dun say so early hor... U c already then say...,,,
ham,"Nah I don't think he goes to usf, he lives around here though",,,
spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, �1.50 to rcv",,,
ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,
ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,,,
spam,WINNER!! As a valued network customer you have been selected to receivea �900 prize reward! To claim call 0906170146

In [119]:
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")

In [120]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [121]:
df = df[["v1", "v2"]]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [122]:
df.columns = ["labels", "data"]

In [123]:
# create binary labels
df["b_labels"] = df["labels"].map({"ham": 0, "spam": 1})

In [124]:
df_train, df_test = train_test_split(df, test_size=0.33)

In [125]:
df_train.shape, df_test.shape

((3733, 3), (1839, 3))

In [126]:
# 0 = padding
# 1 = unknown
idx = 2
word2idx = {"<PAD>": 0, "<UNK>": 1}

In [127]:
for i, row in df_train.iterrows():
  tokens = row["data"].lower().split() # simple tokenization
  for token in tokens:
    if token not in word2idx:
      word2idx[token] = idx
      idx += 1

In [128]:
word2idx

{'<PAD>': 0,
 '<UNK>': 1,
 'hello.': 2,
 'sort': 3,
 'of': 4,
 'out': 5,
 'in': 6,
 'town': 7,
 'already.': 8,
 'that': 9,
 '.': 10,
 'so': 11,
 'dont': 12,
 'rush': 13,
 'home,': 14,
 'i': 15,
 'am': 16,
 'eating': 17,
 'nachos.': 18,
 'will': 19,
 'let': 20,
 'you': 21,
 'know': 22,
 'eta.': 23,
 'or': 24,
 'better': 25,
 'still': 26,
 'can': 27,
 'catch': 28,
 'her': 29,
 'and': 30,
 'ask': 31,
 'if': 32,
 'she': 33,
 'sell': 34,
 '&lt;#&gt;': 35,
 'for': 36,
 'me.': 37,
 'should': 38,
 'now.': 39,
 "how's": 40,
 'anthony.': 41,
 'are': 42,
 'bringing': 43,
 'money.': 44,
 "i've": 45,
 'school': 46,
 'fees': 47,
 'to': 48,
 'pay': 49,
 'rent': 50,
 'stuff': 51,
 'like': 52,
 'that.': 53,
 'thats': 54,
 'why': 55,
 'need': 56,
 'your': 57,
 'help.': 58,
 'a': 59,
 'friend': 60,
 'need....|': 61,
 "there'll": 62,
 'be': 63,
 'minor': 64,
 'shindig': 65,
 'at': 66,
 'my': 67,
 'place': 68,
 'later': 69,
 'tonight,': 70,
 'interested?': 71,
 'ìï': 72,
 'go': 73,
 'home': 74,
 'liao?': 7

In [129]:
len(word2idx)

10685

In [130]:
train_sentences_as_int = []
for i, row in df_train.iterrows():
  tokens = row["data"].lower().split() # simple tokenization
  sentence_as_int = [word2idx[token] for token in tokens]
  train_sentences_as_int.append(sentence_as_int)

In [131]:
test_sentences_as_int = []
for i, row in df_test.iterrows():
  tokens = row["data"].lower().split() # simple tokenization
  sentence_as_int = [word2idx[token] if token in word2idx else 1 for token in tokens]
  test_sentences_as_int.append(sentence_as_int)

In [132]:
len(train_sentences_as_int), len(test_sentences_as_int)

(3733, 1839)

In [133]:
def data_generator(X, Y, batch_size=32):
  X, Y = shuffle(X, Y)
  n_batches = int(np.ceil(len(Y) / batch_size))
  
  for i in range(n_batches):
    end = min((i + 1) * n_batches, len(Y))
    X_batch = X[i * n_batches: end]
    Y_batch = Y[i * n_batches: end]

    # pad X_batch to be N x T
    max_len = np.max([len(x) for x in X_batch])
    for j in range(len(X_batch)):
      x = X_batch[j]
      pad = [0] * (max_len - len(x))
      X_batch[j] = pad + x
    
    # convert to Tensor
    X_batch = torch.from_numpy(np.array(X_batch)).long()
    Y_batch = torch.from_numpy(np.array(Y_batch)).long()

    yield X_batch, Y_batch

In [134]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [135]:
# N x T x D
embed = nn.Embedding(len(word2idx), 20)
for inputs, targets in data_generator(train_sentences_as_int, df_train.b_labels):
  out = embed(inputs)
  print(f"Input Shape: {inputs.shape}, Output Shape: {out.shape}")
  break

Input Shape: torch.Size([117, 39]), Output Shape: torch.Size([117, 39, 20])


In [136]:
# Define the model
class CNN(nn.Module):
  def __init__(self, n_vocab, embed_dim, n_outputs):
    super(CNN, self).__init__()
    self.V = n_vocab
    self.D = embed_dim
    self.K = n_outputs

    # if input is T words
    # then output is (T, D) matrix
    self.embed = nn.Embedding(self.V, self.D)

    # conv layers 
    self.conv1 = nn.Conv1d(self.D, 32, 3, padding=1)
    self.pool1 = nn.MaxPool1d(2)
    self.conv2 = nn.Conv1d(32, 64, 3, padding=1)
    self.pool2 = nn.MaxPool1d(2)
    self.conv3 = nn.Conv1d(64, 128, 3, padding=1)
    
    self.fc = nn.Linear(128, self.K)
  
  def forward(self, X):
    # embedding layer
    # turns word indexes into word vectors
    out = self.embed(X)

    # Note: Output of the embedding is always N x T x D,
    # but conv1d expects as N x D x T.

    # conv layers
    out = out.permute(0, 2, 1)
    out = self.conv1(out)
    out = F.relu(out)
    out = self.pool1(out)
    out = self.conv2(out)
    out = F.relu(out)
    out = self.pool2(out)
    out = self.conv3(out)
    out = F.relu(out)

    # change it back
    out = out.permute(0, 2, 1)

    # max pool
    out, _ = torch.max(out, 1)

    # final dense layer
    out = self.fc(out)
    return out

In [137]:
model = CNN(len(word2idx), 20, 1)
model.to(device)

CNN(
  (embed): Embedding(10685, 20)
  (conv1): Conv1d(20, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [138]:
# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

In [139]:
# Create generators
train_gen = lambda: data_generator(train_sentences_as_int, df_train.b_labels)
test_gen = lambda: data_generator(test_sentences_as_int, df_test.b_labels)

In [140]:
def batch_gd(model, criterion, optimizer, epochs):
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for i in range(epochs):
    t0 = datetime.now()
    train_loss = []
    for inputs, targets in train_gen():
      targets = targets.view(-1, 1).float()
      # move data to GPU
      inputs, targets = inputs.to(device), targets.to(device)

      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward Pass
      outputs = model(inputs)
      loss = criterion(outputs, targets)

      # Backward and Optimize
      loss.backward()
      optimizer.step()

      train_loss.append(loss.item())
    
    # Get train loss and test loss
    train_losses[i] = np.mean(train_loss)

    test_loss = []
    for inputs, targets in test_gen():
      inputs, targets = inputs.to(device), targets.to(device)
      targets = targets.view(-1, 1).float()
      outputs = model(inputs)
      loss = criterion(outputs, targets)
      test_loss.append(loss.item())
    
    test_losses[i] = np.mean(test_loss)
    dt = datetime.now() - t0

    print(f"Epoch: {i+1} / {epochs}, Train Loss: {train_losses[i]:.4f}, Test Loss: {test_losses[i]:.4f}, Duration: {dt}")
  
  return train_losses, test_losses

In [141]:
train_losses, test_losses = batch_gd(model, criterion, optimizer, epochs=8)

ValueError: ignored