<a href="https://colab.research.google.com/github/josepeon/python_dad_class/blob/main/text_intro_to_rnn_dele.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification with Neural Networks

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### Classifying Spam

In [3]:
#read in data
spam = pd.read_csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/refs/heads/master/sms_spam.csv')

In [4]:
#take a peek
spam.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#create a tokenizer
tokenizer = Tokenizer(num_words = 500)

In [6]:
#fit the tokenizer -- learns the vocabulary
tokenizer.fit_on_texts(spam['text'].values)

In [7]:
#look at tokenizer
tokenizer.num_words

500

In [8]:
#create document term matrix (binarized)
dtm = tokenizer.texts_to_matrix(spam['text'].values)

In [9]:
#take a peek
dtm

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [10]:
tokenizer.index_word[2]

'to'

In [11]:
spam['text'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [12]:
[tokenizer.index_word[i] for i in range(1, 500)]

['i',
 'to',
 'you',
 'a',
 'the',
 'u',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'it',
 'of',
 'call',
 'have',
 'on',
 '2',
 'that',
 'now',
 'are',
 'so',
 'but',
 'not',
 'or',
 'do',
 'can',
 'at',
 "i'm",
 'ur',
 'get',
 'will',
 'if',
 'be',
 'with',
 'just',
 'no',
 'we',
 'this',
 '4',
 'gt',
 'lt',
 'up',
 'when',
 'ok',
 'free',
 'from',
 'go',
 'how',
 'all',
 'out',
 'what',
 'know',
 'like',
 'good',
 'then',
 'got',
 'come',
 'was',
 'its',
 'am',
 'time',
 'only',
 'day',
 'love',
 'there',
 'send',
 'he',
 'want',
 'text',
 'as',
 'txt',
 'one',
 'going',
 'by',
 'ü',
 "i'll",
 'need',
 'home',
 'about',
 'r',
 'lor',
 'sorry',
 'stop',
 'still',
 'see',
 'n',
 'back',
 'today',
 'da',
 'our',
 'dont',
 'reply',
 'k',
 "don't",
 'she',
 'mobile',
 'take',
 'hi',
 'tell',
 'new',
 'please',
 'later',
 'her',
 'pls',
 'any',
 'think',
 'been',
 'they',
 'phone',
 'here',
 'week',
 'did',
 'dear',
 'some',
 'well',
 'has',
 '1',
 'night',
 'much',
 'd',
 'gre

In [13]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [14]:
X = dtm
y = np.where(spam['type'] == 'ham', 0, 1)

In [15]:
Xt = torch.tensor(X, dtype = torch.float32)
yt = torch.tensor(y, dtype = torch.float32)

In [16]:
from torch.utils.data import TensorDataset, DataLoader

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(Xt, yt, test_size=.2)

In [19]:
X_train

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [20]:
#create data class
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

  self.x = torch.tensor(X, dtype = torch.float)
  self.y = torch.tensor(y, dtype = torch.float)


In [21]:
#dataset and loader -- making batches of our bigger dataset
trainloader = DataLoader(train_dataset, batch_size = 32)
#dataset and loader
testloader = DataLoader(test_dataset, batch_size = 32)

In [22]:
model = nn.Sequential(nn.Linear(in_features=500, out_features=1000),
                      nn.ReLU(),
                      nn.Linear(1000, 100),
                      nn.ReLU(),
                      nn.Linear(100, 1),
                      nn.Sigmoid()
                      )

In [23]:
model(Xt)

tensor([[0.4825],
        [0.4844],
        [0.4775],
        ...,
        [0.4843],
        [0.4790],
        [0.4842]], grad_fn=<SigmoidBackward0>)

In [24]:
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)

In [25]:
from tqdm import tqdm

In [26]:
#keep track of the losses
losses = []
#train it for 20 epochs
for epoch in tqdm(range(20)):
  #iterate over the batches
  for x, y in trainloader:
    #feeds data into the model
    yhat = model(x)
    #evaluate the predictions
    loss =loss_fn(yhat, y.unsqueeze(1))
    #update the weights/params
    optimizer.zero_grad() #pytorch house cleaning
    loss.backward() #pass info backward
    optimizer.step() #step toward less loss
    losses.append(loss.item()) #tracking the loss

100%|██████████| 20/20 [00:10<00:00,  1.83it/s]


In [27]:
train_predictions = model(X_train)
ytrain_preds = torch.where(train_predictions> .5, 1, 0)

In [28]:
torch.sum(ytrain_preds.squeeze(1) == y_train)/len(y_train)

tensor(0.9998)

In [29]:
ytest_preds = torch.where(model(X_test) > .5, 1, 0)
torch.sum(ytest_preds.squeeze(1) == y_test)/len(y_test)

tensor(0.9892)

In [30]:
#loss and optimizer
class TextModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(in_features = 500, out_features = 100)
    self.lin2 = nn.Linear(100, 100)
    self.lin3 = nn.Linear(100, 1)
    self.sigmoid = nn.Sigmoid()
    self.act = nn.ReLU()

  def forward(self, x):
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    return self.sigmoid(self.lin3(x))




In [31]:
#training function
model = TextModel()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [32]:
#torch.save(model, 'textmodel.pt')

In [33]:
from tqdm import tqdm

In [34]:
#evaluate
for epoch in tqdm(range(100)):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

  1%|          | 1/100 [00:00<00:27,  3.59it/s]

Epoch 0 Loss: 18.899994273670018


 11%|█         | 11/100 [00:03<00:27,  3.18it/s]

Epoch 10 Loss: 0.23885840699071537


 21%|██        | 21/100 [00:06<00:25,  3.11it/s]

Epoch 20 Loss: 0.20972507091549578


 31%|███       | 31/100 [00:09<00:22,  3.10it/s]

Epoch 30 Loss: 0.18468718037175533


 41%|████      | 41/100 [00:13<00:19,  3.09it/s]

Epoch 40 Loss: 0.1655140457420987


 51%|█████     | 51/100 [00:16<00:16,  3.04it/s]

Epoch 50 Loss: 0.15762542309866615


 61%|██████    | 61/100 [00:19<00:12,  3.19it/s]

Epoch 60 Loss: 6.431761816027574


 71%|███████   | 71/100 [00:22<00:09,  3.16it/s]

Epoch 70 Loss: 0.18006479145404075


 81%|████████  | 81/100 [00:25<00:06,  3.12it/s]

Epoch 80 Loss: 0.17419339651812143


 91%|█████████ | 91/100 [00:29<00:03,  2.79it/s]

Epoch 90 Loss: 0.17720576164739454


100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


In [35]:
Xt = torch.tensor(X_test, dtype = torch.float)

  Xt = torch.tensor(X_test, dtype = torch.float)


In [36]:

output = model(Xt) #model predictions

In [37]:
output

tensor([[1.3712e-14],
        [4.4512e-08],
        [2.3378e-15],
        ...,
        [3.6454e-22],
        [0.0000e+00],
        [1.0000e+00]], grad_fn=<SigmoidBackward0>)

In [38]:
#Converting probabilities to prediction
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [39]:
preds.shape

(1115, 1)

In [40]:
y = np.where(spam['type'] == 'ham', 0, 1)

In [41]:
sum(preds[:, 0] == y_test)/len(y_test)

tensor(0.9910)

In [42]:
1 - sum(y_test)/len(y_test)

tensor(0.8771)

### Basic RNN

![](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Recurrent_neural_network_unfold.svg/440px-Recurrent_neural_network_unfold.svg.png)

In [43]:
#create sequences
sequences = tokenizer.texts_to_sequences(spam['text'].values)

In [44]:
#look at first sequence
sequences[0]

[49, 471, 64, 8, 88, 123, 351, 148, 67, 58, 145]

In [45]:
X_train[0]

tensor([0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [46]:
#compare to text
spam['text'].values[1]

'Ok lar... Joking wif u oni...'

In [47]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=100)

In [48]:
#examine results
sequences[1].shape

(100,)

In [49]:
sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  46, 336, 472,   6], dtype=int32)

In [50]:
#example rnn
rnn = nn.RNN(input_size = 100,
             hidden_size = 30,
             num_layers = 1,
             batch_first = True)

In [51]:
#pass data through
sample_sequence = torch.tensor(sequences[1],
                               dtype = torch.float,
                               ).reshape(1, -1)
sample_sequence.shape

torch.Size([1, 100])

In [52]:
#output
output, hidden = rnn(sample_sequence)

In [53]:
#hidden
hidden.shape

torch.Size([1, 30])

In [54]:
#linear layer
output.shape

torch.Size([1, 30])

In [55]:
#pass through linear
lin1 = nn.Linear(in_features = 30, out_features = 1)

In [56]:
lin1(output)

tensor([[0.6376]], grad_fn=<AddmmBackward0>)

In [57]:
for x, y in trainloader:
  print(x.shape)
  break

torch.Size([32, 500])


In [58]:
X_train, X_test, y_train, y_test = train_test_split(sequences, yt)

In [59]:
train_dataset = TensorDataset(torch.tensor(X_train, dtype = torch.float32), y_train)
test_dataset = TensorDataset(torch.tensor(X_test, dtype = torch.float32), y_test)

In [60]:
trailoader = DataLoader(train_dataset, batch_size = 32)
testloader = DataLoader(test_dataset, batch_size = 32)

In [61]:
model = nn.Sequential(nn.RNN(input_size = 100, hidden_size  =50, num_layers=2),
                      nn.Linear(in_features = 50, out_features=1),
                      nn.Sigmoid())

In [62]:
ex_rnn = nn.RNN(input_size = 100, hidden_size  =50, num_layers=2)
ex_rnn(train_dataset[0][0].unsqueeze(0))

(tensor([[-0.3110, -0.7454,  0.6211,  0.5374, -0.2276, -0.0009,  0.6834, -0.1295,
          -0.1470,  0.5089,  0.7928, -0.7095, -0.6525, -0.3739, -0.8083, -0.7040,
          -0.6949,  0.2339, -0.5524, -0.3887,  0.0822,  0.5426, -0.5781,  0.5967,
           0.7469,  0.2546,  0.5624,  0.2531,  0.2213,  0.4610,  0.1441, -0.3841,
          -0.6865, -0.6703,  0.1129,  0.3458,  0.8523,  0.0295, -0.6390, -0.0526,
           0.5500,  0.8270,  0.0543,  0.3549, -0.4583, -0.1386, -0.0867,  0.4372,
          -0.0756,  0.1814]], grad_fn=<SqueezeBackward1>),
 tensor([[ 1.0000e+00,  1.0000e+00,  1.0000e+00, -1.0000e+00,  1.0000e+00,
          -1.8407e-01, -1.0000e+00, -1.0000e+00,  1.0000e+00, -1.0000e+00,
           1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
           1.0000e+00,  1.0000e+00, -1.0000e+00,  1.0000e+00, -1.0000e+00,
          -1.0000e+00,  1.0000e+00, -1.0000e+00,  1.0000e+00, -1.0000e+00,
          -1.0000e+00,  1.0000e+00, -1.0000e+00, -1.0000e+00,  1.0000e+00,

In [63]:
model(train_dataset[0][0].unsqueeze(0))

TypeError: linear(): argument 'input' (position 1) must be Tensor, not tuple

In [64]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [65]:
#class
class BasicRNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.RNN(input_size = 100,
                    hidden_size = 50,
                    num_layers = 3,
                    batch_first = True)
    self.lin1 = nn.Linear(in_features = 50, out_features=1000)
    self.lin2 = nn.Linear(1000, 100)
    self.lin3 = nn.Linear(100, 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x) #extracting important information
    x = self.act(self.lin1(x)) #multilayer perceptron -- to predict
    x = self.act(self.lin2(x))
    x = self.sigmoid(self.lin3(x))
    return x


In [66]:
#data
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
traindata = TextDataset(X_train, y_train)
trainloader = DataLoader(traindata, batch_size = 32)

In [67]:
#optimizer and loss
model = BasicRNN()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [68]:
#train
for epoch in tqdm(range(100)):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

  1%|          | 1/100 [00:01<02:53,  1.75s/it]

Epoch 0 Loss: 54.57173338532448


 11%|█         | 11/100 [00:19<02:35,  1.75s/it]

Epoch 10 Loss: 48.813565865159035


 21%|██        | 21/100 [00:37<02:20,  1.78s/it]

Epoch 20 Loss: 49.457532711327076


 31%|███       | 31/100 [00:55<02:01,  1.77s/it]

Epoch 30 Loss: 51.289592899382114


 41%|████      | 41/100 [01:13<01:46,  1.81s/it]

Epoch 40 Loss: 50.05160667002201


 51%|█████     | 51/100 [01:31<01:28,  1.81s/it]

Epoch 50 Loss: 50.9603927731514


 61%|██████    | 61/100 [01:49<01:09,  1.79s/it]

Epoch 60 Loss: 53.3241548165679


 71%|███████   | 71/100 [02:07<00:53,  1.84s/it]

Epoch 70 Loss: 54.86578965187073


 81%|████████  | 81/100 [02:25<00:34,  1.82s/it]

Epoch 80 Loss: 54.866721376776695


 91%|█████████ | 91/100 [02:43<00:16,  1.83s/it]

Epoch 90 Loss: 54.8669597953558


100%|██████████| 100/100 [03:00<00:00,  1.80s/it]


In [69]:
Xt = torch.tensor(X_test, dtype = torch.float)

In [70]:
output = model(Xt)

In [71]:
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [72]:
#preds = output.argmax(axis = 1)

In [73]:
y_test

array([0, 0, 0, ..., 1, 0, 1])

In [74]:
# y = np.where(spam['type'] == 'ham', 0, 1)

In [75]:
# y.shape

In [76]:
sum(preds.reshape(1115,) == y_test)/len(y_test)

np.float64(0.8618834080717489)

#### LSTM

In [77]:
# nn.LSTM()
class BasicLSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.LSTM(input_size = 100,
                    hidden_size = 100,
                    num_layers = 1,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 100, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.lin2(x)
    return self.sigmoid(x)

In [78]:
model = BasicLSTM()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [79]:
#train
for epoch in range(10):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 50.86337895691395


In [80]:
Xt = torch.tensor(X_test, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
sum(preds[:, 0] == y_test)/len(y_test)

np.float64(0.8618834080717489)

In [81]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=30)

In [82]:
sequences[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,  49, 471,  64,   8,  88, 123, 351,
       148,  67,  58, 145], dtype=int32)

In [83]:
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
data = TextDataset(X, y)
loader = DataLoader(data, batch_size = 32)

In [84]:
class RNN2(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.GRU(input_size = 30,
                    hidden_size = 30,
                    num_layers = 2,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 30, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.lin1(x)
    x = self.lin2(x)
    return self.sigmoid(x)

In [85]:
model = RNN2()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [86]:
#train
for epoch in range(100):
  losses = 0
  for x,y in loader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 66.51339219510555
Epoch 10 Loss: 60.15635181218386
Epoch 20 Loss: 60.79907284677029
Epoch 30 Loss: 57.837385669350624
Epoch 40 Loss: 57.53713543340564
Epoch 50 Loss: 58.36953868344426
Epoch 60 Loss: 58.41395051777363
Epoch 70 Loss: 57.9004732593894
Epoch 80 Loss: 58.0925337523222
Epoch 90 Loss: 57.60244653373957


In [87]:
Xt = torch.tensor(sequences, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
y = np.where(spam['type'] == 'ham', 0, 1)
sum(preds[:, 0] == y)/len(y)

np.float64(0.8631144599928239)