<a href="https://colab.research.google.com/github/levimcclenny/gurus/blob/master/Task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import string

# Load raw data
texts = []
labels = []
with open("drive/MyDrive/lp_data/SMSSpamCollection.txt", "r") as infile:
  for l in infile:
    label, text = l.strip().split("\t")
    if label == "ham":
      labels.append(0)
    else:
      labels.append(1)
    text = "".join([ch.lower() for ch in text if ch not in string.punctuation])
    texts.append(text)

labels = np.asarray(labels)

tokenizer = Tokenizer(num_words=5000) # voccab size is 9661 -> 5000 should be enough
tokenizer.fit_on_texts(texts)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(texts, labels, random_state=42)

X_train_emb = tokenizer.texts_to_sequences(X_train_raw)
X_test_emb = tokenizer.texts_to_sequences(X_test_raw)

X_train_emb = tokenizer.texts_to_sequences(X_train_raw)
X_test_emb = tokenizer.texts_to_sequences(X_test_raw)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train_emb, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test_emb, padding='post', maxlen=maxlen)


# numeric labels
def relabel(label):
  if label == "spam":
    return 0
  else:
    return 1

y_train = np.asarray([relabel(e) for e in y_train])
y_test = np.asarray([relabel(e) for e in y_test])

# convert them into tensors
X_train_tensor = torch.from_numpy(X_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
Y_train_tensor = torch.from_numpy(np.array(y_train))
Y_test_tensor = torch.from_numpy(np.array(y_test))

In [3]:
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader

train_data = TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_data,batch_size=16, shuffle=True)

test_data = TensorDataset(X_test_tensor, Y_test_tensor)
test_loader = DataLoader(test_data,batch_size=16, shuffle=True)

In [4]:

class Network(torch.nn.Module):
  def __init__(self,vocab_size,hidden_units,num_classes): 
    super().__init__()
    #First fully connected layer
    self.fc1 = torch.nn.Linear(vocab_size,hidden_units)
    #Second fully connected layer
    self.fc2 = torch.nn.Linear(hidden_units,num_classes)
    #Final output of sigmoid function      
    self.output = torch.nn.Sigmoid()

  def forward(self,x):
    fc1 = self.fc1(x)
    fc2 = self.fc2(fc1)
    output = self.output(fc2)
    return output[:, -1]

In [5]:

import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("logs")

NUM_EPOCHS = 200
VOCAB_SIZE = 8110
HIDDEN_UNITS = 5
OUT_CLASSES = 1
LEARNING_RATE = 0.001




#Initialize model
model = Network(VOCAB_SIZE,HIDDEN_UNITS,OUT_CLASSES)
model.cuda()
print(model)
#Initialize optimizer
optimizer =torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
#Initialize loss function
loss_fun = torch.nn.BCELoss()
for i in range(NUM_EPOCHS):
  # print(i)
  PATH = f"/content/drive/My Drive/Colab Notebooks/01_models/pytorch/pytorch_model_epoch{i}.pt"
  for x_batch, y_batch in train_loader:
    model.train()
    y_pred = model(x_batch.cuda())
    loss = loss_fun(y_pred.cuda(), y_batch.float().cuda())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  writer.add_scalar("Loss/train", loss, i)
  torch.save({"epoch": i,
              "model_state_dict": model.state_dict(),
              "optimizer_state_dict": optimizer.state_dict(),
              "loss": loss}, PATH)
print('After {} epoch training loss is {}'.format(i,loss.item()))

Network(
  (fc1): Linear(in_features=8110, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=1, bias=True)
  (output): Sigmoid()
)


RuntimeError: ignored

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:

from sklearn.metrics import classification_report

y_pred_tensor = model(X_test_tensor.cuda())
y_pred_probs = y_pred_tensor.cpu().detach().numpy()
# we want high recall & precision for spam
y_pred = np.asarray([(lambda x: 1 if x > 0.80 else 0) (x) for x in y_pred_probs])

print(classification_report(y_test, y_pred, target_names=["spam", "ham"]))