<a href="https://colab.research.google.com/github/josepeon/python_dad_class/blob/main/text_intro_to_rnn_dele.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification with Neural Networks

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### Classifying Spam

In [3]:
#read in data
spam = pd.read_csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/refs/heads/master/sms_spam.csv')

In [4]:
#take a peek
spam.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#create a tokenizer
tokenizer = Tokenizer(num_words = 500)

In [6]:
#fit the tokenizer -- learns the vocabulary
tokenizer.fit_on_texts(spam['text'].values)

In [7]:
#look at tokenizer
tokenizer.num_words

500

In [8]:
#create document term matrix (binarized)
dtm = tokenizer.texts_to_matrix(spam['text'].values)

In [9]:
#take a peek
dtm

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [10]:
tokenizer.index_word[2]

'to'

In [11]:
spam['text'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [12]:
[tokenizer.index_word[i] for i in range(1, 500)]

['i',
 'to',
 'you',
 'a',
 'the',
 'u',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'it',
 'of',
 'call',
 'have',
 'on',
 '2',
 'that',
 'now',
 'are',
 'so',
 'but',
 'not',
 'or',
 'do',
 'can',
 'at',
 "i'm",
 'ur',
 'get',
 'will',
 'if',
 'be',
 'with',
 'just',
 'no',
 'we',
 'this',
 '4',
 'gt',
 'lt',
 'up',
 'when',
 'ok',
 'free',
 'from',
 'go',
 'how',
 'all',
 'out',
 'what',
 'know',
 'like',
 'good',
 'then',
 'got',
 'come',
 'was',
 'its',
 'am',
 'time',
 'only',
 'day',
 'love',
 'there',
 'send',
 'he',
 'want',
 'text',
 'as',
 'txt',
 'one',
 'going',
 'by',
 'ü',
 "i'll",
 'need',
 'home',
 'about',
 'r',
 'lor',
 'sorry',
 'stop',
 'still',
 'see',
 'n',
 'back',
 'today',
 'da',
 'our',
 'dont',
 'reply',
 'k',
 "don't",
 'she',
 'mobile',
 'take',
 'hi',
 'tell',
 'new',
 'please',
 'later',
 'her',
 'pls',
 'any',
 'think',
 'been',
 'they',
 'phone',
 'here',
 'week',
 'did',
 'dear',
 'some',
 'well',
 'has',
 '1',
 'night',
 'much',
 'd',
 'gre

In [13]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [14]:
X = dtm
y = np.where(spam['type'] == 'ham', 0, 1)

In [17]:
Xt = torch.tensor(X, dtype = torch.float32)
yt = torch.tensor(y, dtype = torch.float32)

In [18]:
from torch.utils.data import TensorDataset, DataLoader

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(Xt, yt, test_size=.2)

In [21]:
X_train

tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.]])

In [22]:
#create data class
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

  self.x = torch.tensor(X, dtype = torch.float)
  self.y = torch.tensor(y, dtype = torch.float)


In [24]:
#dataset and loader -- making batches of our bigger dataset
trainloader = DataLoader(train_dataset, batch_size = 32)
#dataset and loader
testloader = DataLoader(test_dataset, batch_size = 32)

In [None]:
model = nn.Sequential(nn.Linear)

In [25]:
#loss and optimizer
class TextModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(in_features = 500, out_features = 100)
    self.lin2 = nn.Linear(100, 100)
    self.lin3 = nn.Linear(100, 1)
    self.sigmoid = nn.Sigmoid()
    self.act = nn.ReLU()

  def forward(self, x):
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    return self.sigmoid(self.lin3(x))




In [None]:
#training function
model = TextModel()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#torch.save(model, 'textmodel.pt')

In [None]:
from tqdm import tqdm

In [None]:
#evaluate
for epoch in tqdm(range(100)):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

  2%|▏         | 2/100 [00:00<00:22,  4.44it/s]

Epoch 0 Loss: 17.001398101157974


 12%|█▏        | 12/100 [00:02<00:18,  4.83it/s]

Epoch 10 Loss: 0.30588380191235665


 22%|██▏       | 22/100 [00:04<00:14,  5.29it/s]

Epoch 20 Loss: 0.21212174605816655


 31%|███       | 31/100 [00:06<00:14,  4.88it/s]

Epoch 30 Loss: 0.20318245815315306


 42%|████▏     | 42/100 [00:08<00:11,  5.03it/s]

Epoch 40 Loss: 0.17618561931041407


 52%|█████▏    | 52/100 [00:10<00:09,  5.26it/s]

Epoch 50 Loss: 0.17469061255728138


 62%|██████▏   | 62/100 [00:12<00:07,  5.16it/s]

Epoch 60 Loss: 51.92242291022191


 72%|███████▏  | 72/100 [00:14<00:04,  5.60it/s]

Epoch 70 Loss: 6.431578095463443


 82%|████████▏ | 82/100 [00:16<00:03,  5.69it/s]

Epoch 80 Loss: 6.425228226558245


 91%|█████████ | 91/100 [00:18<00:01,  4.88it/s]

Epoch 90 Loss: 6.420938703532891


100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)

In [None]:

output = model(Xt) #model predictions

In [None]:
output

tensor([[1.0000e+00],
        [4.3710e-09],
        [0.0000e+00],
        ...,
        [9.7863e-23],
        [7.6954e-32],
        [1.6741e-13]], grad_fn=<SigmoidBackward0>)

In [None]:
#Converting probabilities to prediction
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [None]:
preds.shape

(1115, 1)

In [None]:
y = np.where(spam['type'] == 'ham', 0, 1)

In [None]:
sum(preds[:, 0] == y_test)/len(y_test)

np.float64(0.9865470852017937)

In [None]:
1 - sum(y_test)/len(y_test)

np.float64(0.8780269058295964)

### Basic RNN

![](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Recurrent_neural_network_unfold.svg/440px-Recurrent_neural_network_unfold.svg.png)

In [None]:
#create sequences
sequences = tokenizer.texts_to_sequences(spam['text'].values)

In [None]:
#look at first sequence
sequences[0]

[49, 471, 64, 8, 88, 123, 351, 148, 67, 58, 145]

In [None]:
X_train[0]

array([0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0.

In [None]:
#compare to text
spam['text'].values[1]

'Ok lar... Joking wif u oni...'

In [None]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=100)

In [None]:
#examine results
sequences[1].shape

(100,)

In [None]:
sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  46, 336, 472,   6], dtype=int32)

In [None]:
#example rnn
rnn = nn.RNN(input_size = 100,
             hidden_size = 30,
             num_layers = 1,
             batch_first = True)

In [None]:
#pass data through
sample_sequence = torch.tensor(sequences[1],
                               dtype = torch.float,
                               ).reshape(1, -1)
sample_sequence.shape

torch.Size([1, 100])

In [None]:
#output
output, hidden = rnn(sample_sequence)

In [None]:
#hidden
hidden.shape

torch.Size([1, 30])

In [None]:
#linear layer
output.shape

torch.Size([1, 30])

In [None]:
#pass through linear
lin1 = nn.Linear(in_features = 30, out_features = 1)

In [None]:
lin1(output)

tensor([[-0.3631]], grad_fn=<AddmmBackward0>)

In [None]:
for x, y in trainloader:
  print(x.shape)
  break

torch.Size([32, 500])


In [None]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [None]:
#class
class BasicRNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.RNN(input_size = 100,
                    hidden_size = 50,
                    num_layers = 3,
                    batch_first = True)
    self.lin1 = nn.Linear(in_features = 50, out_features=1000)
    self.lin2 = nn.Linear(1000, 100)
    self.lin3 = nn.Linear(100, 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x) #extracting important information
    x = self.act(self.lin1(x)) #multilayer perceptron -- to predict
    x = self.act(self.lin2(x))
    x = self.sigmoid(self.lin3(x))
    return x


In [None]:
#data
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
traindata = TextDataset(X_train, y_train)
trainloader = DataLoader(traindata, batch_size = 32)

In [None]:
#optimizer and loss
model = BasicRNN()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in tqdm(range(100)):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

  1%|          | 1/100 [00:01<01:42,  1.04s/it]

Epoch 0 Loss: 53.466545045375824


 11%|█         | 11/100 [00:13<01:47,  1.21s/it]

Epoch 10 Loss: 48.20322820544243


 21%|██        | 21/100 [00:25<01:35,  1.21s/it]

Epoch 20 Loss: 46.3197163939476


 31%|███       | 31/100 [00:38<01:20,  1.17s/it]

Epoch 30 Loss: 46.69928106665611


 41%|████      | 41/100 [00:50<01:10,  1.20s/it]

Epoch 40 Loss: 48.800177067518234


 51%|█████     | 51/100 [01:04<01:06,  1.36s/it]

Epoch 50 Loss: 48.15520025789738


 61%|██████    | 61/100 [01:16<00:51,  1.31s/it]

Epoch 60 Loss: 48.796837240457535


 71%|███████   | 71/100 [01:29<00:37,  1.28s/it]

Epoch 70 Loss: 49.21878685057163


 81%|████████  | 81/100 [01:42<00:24,  1.29s/it]

Epoch 80 Loss: 46.78614544868469


 91%|█████████ | 91/100 [01:54<00:11,  1.23s/it]

Epoch 90 Loss: 47.94999946653843


100%|██████████| 100/100 [02:05<00:00,  1.26s/it]


In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)

In [None]:
output = model(Xt)

In [None]:
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [None]:
#preds = output.argmax(axis = 1)

In [None]:
y_test

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
# y = np.where(spam['type'] == 'ham', 0, 1)

In [None]:
# y.shape

In [None]:
sum(preds.reshape(1115,) == y_test)/len(y_test)

np.float64(0.863677130044843)

#### LSTM

In [None]:
# nn.LSTM()
class BasicLSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.LSTM(input_size = 100,
                    hidden_size = 100,
                    num_layers = 1,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 100, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.lin2(x)
    return self.sigmoid(x)

In [None]:
model = BasicLSTM()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(10):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
sum(preds[:, 0] == y_test)/len(y_test)

In [None]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=30)

In [None]:
sequences[0]

In [None]:
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
data = TextDataset(X, y)
loader = DataLoader(data, batch_size = 32)

In [None]:
class RNN2(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.GRU(input_size = 30,
                    hidden_size = 30,
                    num_layers = 2,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 30, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.lin1(x)
    x = self.lin2(x)
    return self.sigmoid(x)

In [None]:
model = RNN2()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(100):
  losses = 0
  for x,y in loader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

In [None]:
Xt = torch.tensor(sequences, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
y = np.where(spam['type'] == 'ham', 0, 1)
sum(preds[:, 0] == y)/len(y)