# Text Classification and Recurrent Neural Networks



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pip install datasets

In [None]:
from datasets import load_dataset

ds = load_dataset("Paul/hatecheck")

In [None]:
df = ds['test'].to_pandas()

In [None]:
df['test_case'].head()

In [None]:
df['label_gold'].value_counts()

### Encoding the Text



In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cvect = CountVectorizer()

In [None]:
dtm = cvect.fit_transform(df['test_case'])

In [None]:
pd.DataFrame(dtm.toarray(), columns=cvect.get_feature_names_out()).head()

In [None]:
X = dtm
y = df['label_gold']

### Problem

Split the data and build a random forest classifier on the training data.  Compare the train and test scores.  

- What words were most important in making the classifications?
- What elements of the CountVectorizer might you change or grid search to attempt to improve your model?

# Text Classification with Neural Networks

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### Classifying Spam

In [None]:
#read in data
spam = pd.read_csv('sms_spam.csv')

In [None]:
#take a peek
spam.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [None]:
#create a tokenizer
tokenizer = Tokenizer(num_words = 500)

In [None]:
#fit the tokenizer
tokenizer.fit_on_texts(spam['text'].values)

In [None]:
#look at tokenizer
tokenizer.num_words

500

In [None]:
#create document term matrix (binarized)
dtm = tokenizer.texts_to_matrix(spam['text'].values)

In [None]:
#take a peek
dtm

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
spam['text'][2]

'Am also doing in cbe only. But have to pay.'

In [None]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [None]:
X = dtm
y = np.where(spam['type'] == 'ham', 0, 1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [None]:
X_train

array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [None]:
#create data class
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

In [None]:
#dataset and loader
trainloader = DataLoader(train_dataset, batch_size = 32)
#dataset and loader
testloader = DataLoader(test_dataset, batch_size = 32)

In [None]:
#loss and optimizer
class TextModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(in_features = 500, out_features = 100)
    self.lin2 = nn.Linear(100, 100)
    self.lin3 = nn.Linear(100, 1)
    self.sigmoid = nn.Sigmoid()
    self.act = nn.ReLU()

  def forward(self, x):
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    return self.sigmoid(self.lin3(x))




In [None]:
#training function
model = TextModel()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#evaluate
for epoch in range(100):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 17.463785945263226
Epoch 10 Loss: 0.3230317887267802
Epoch 20 Loss: 0.1858880319979619
Epoch 30 Loss: 0.18419960872658825
Epoch 40 Loss: 0.19837459240657174
Epoch 50 Loss: 0.18659224085559767
Epoch 60 Loss: 45.2216811845803
Epoch 70 Loss: 6.432060421328064
Epoch 80 Loss: 6.417951953306446
Epoch 90 Loss: 6.412450148929762


In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)

In [None]:

output = model(Xt) #model predictions

In [None]:
output

tensor([[2.1603e-23],
        [4.3468e-08],
        [2.5849e-02],
        ...,
        [0.0000e+00],
        [0.0000e+00],
        [1.3465e-31]], grad_fn=<SigmoidBackward0>)

In [None]:
#Converting probabilities to prediction
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [None]:
preds.shape

(1112, 1)

In [None]:
y = np.where(spam['type'] == 'ham', 0, 1)

In [None]:
sum(preds[:, 0] == y_test)/len(y_test)

0.9784172661870504

In [None]:
np.unique(y_test, return_counts = True)

(array([0, 1]), array([972, 140]))

In [None]:
951/(951 + 161)

0.8552158273381295

### Basic RNN

![](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Recurrent_neural_network_unfold.svg/440px-Recurrent_neural_network_unfold.svg.png)

In [None]:
#create sequences
sequences = tokenizer.texts_to_sequences(spam['text'].values)

In [None]:
#look at first sequence
sequences[0]

[122, 3, 22, 313, 4, 53, 110, 37, 8]

In [None]:
#compare to text
spam['text'].values[1]

'K..give back my thanks.'

In [None]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=100)

In [None]:
#examine results
sequences[1].shape

(100,)

In [None]:
sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,  92, 134,  86,  11, 170], dtype=int32)

In [None]:
#example rnn
rnn = nn.RNN(input_size = 100,
             hidden_size = 30,
             num_layers = 1,
             batch_first = True)

In [None]:
#pass data through
sample_sequence = torch.tensor(sequences[1],
                               dtype = torch.float,
                               ).reshape(1, -1)
sample_sequence.shape

torch.Size([1, 100])

In [None]:
#output
output, hidden = rnn(sample_sequence)

In [None]:
#hidden
hidden

tensor([[ 1.0000, -1.0000,  1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -0.9969,
          1.0000,  1.0000,  1.0000,  0.9927, -0.9988,  1.0000,  1.0000, -1.0000,
         -1.0000,  0.9980, -0.9968,  1.0000,  0.8639, -1.0000, -1.0000,  0.9869,
          1.0000,  0.7625,  1.0000,  0.6461, -1.0000, -0.9947]],
       grad_fn=<SqueezeBackward1>)

In [None]:
#linear layer
output.shape

torch.Size([1, 30])

In [None]:
#pass through linear
lin1 = nn.Linear(in_features = 30, out_features = 1)

In [None]:
lin1(output)

tensor([[0.1402]], grad_fn=<AddmmBackward0>)

In [None]:
for x, y in trainloader:
  print(x.shape)
  break

torch.Size([32, 500])


In [None]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [None]:
#class
class BasicRNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.RNN(input_size = 100,
                    hidden_size = 100,
                    num_layers = 3,
                    batch_first = True)
    self.lin1 = nn.Linear(in_features = 100, out_features=1000)
    self.lin2 = nn.Linear(1000, 100)
    self.lin3 = nn.Linear(100, 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    x = self.sigmoid(self.lin3(x))
    return x


In [None]:
#data
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
traindata = TextDataset(X_train, y_train)
trainloader = DataLoader(traindata, batch_size = 32)

In [None]:
#optimizer and loss
model = BasicRNN()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(100):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 54.333703458309174
Epoch 10 Loss: 49.14352545142174
Epoch 20 Loss: 50.294177405536175
Epoch 30 Loss: 54.339055240154266
Epoch 40 Loss: 54.340724781155586
Epoch 50 Loss: 54.34115116298199
Epoch 60 Loss: 54.34125591814518
Epoch 70 Loss: 54.341282427310944
Epoch 80 Loss: 54.34128923714161
Epoch 90 Loss: 54.34129001200199


In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)

In [None]:
output = model(Xt)

In [None]:
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [None]:
#preds = output.argmax(axis = 1)

In [None]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# y = np.where(spam['type'] == 'ham', 0, 1)

In [None]:
# y.shape

In [None]:
sum(preds.reshape(1112,) == y_test)/len(y_test)

0.8588129496402878

### Pretrained Models and HuggingFace

- [Huggingface](https://huggingface.co/)
- [Chronos Paper](https://arxiv.org/abs/2403.07815)

In [None]:
pip install git+https://github.com/amazon-science/chronos-forecasting.git

In [None]:
import torch
from chronos import ChronosPipeline

pipeline = ChronosPipeline.from_pretrained(
  "amazon/chronos-t5-large",
  device_map="cuda",
  torch_dtype=torch.bfloat16,
)

df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")


In [None]:
# context must be either a 1D tensor, a list of 1D tensors,
# or a left-padded 2D tensor with batch as the first dimension
context = torch.tensor(df["#Passengers"])
prediction_length = 12
forecast = pipeline.predict(context, prediction_length)  # shape [num_series, num_samples, prediction_length]

In [None]:
# visualize the forecast
forecast_index = range(len(df), len(df) + prediction_length)
low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

plt.figure(figsize=(8, 4))
plt.plot(df["#Passengers"], color="royalblue", label="historical data")
plt.plot(forecast_index, median, color="tomato", label="median forecast")
plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
plt.legend()
plt.grid();

### Problem

Explore the pretrained models available and try to find one that is either of relevance to your final paper or just of general interest.  Load and use the model in an example -- even just the docs!

#### LSTM

In [None]:
# nn.LSTM()
class BasicLSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.LSTM(input_size = 100,
                    hidden_size = 100,
                    num_layers = 1,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 100, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.lin2(x)
    return self.sigmoid(x)

In [None]:
model = BasicLSTM()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(10):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 49.52611651271582


In [None]:
Xt = torch.tensor(X_test, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
sum(preds[:, 0] == y_test)/len(y_test)

In [None]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=30)

In [None]:
sequences[0]

In [None]:
X = sequences
y = np.where(spam['type'] == 'spam', 1, 0)
data = TextDataset(X, y)
loader = DataLoader(data, batch_size = 32)

In [None]:
class RNN2(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.LSTM(input_size = 30,
                    hidden_size = 30,
                    num_layers = 2,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 30, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.lin1(x)
    x = self.lin2(x)
    return self.sigmoid(x)

In [None]:
model = RNN2()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(100):
  losses = 0
  for x,y in loader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

In [None]:
Xt = torch.tensor(sequences, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
y = np.where(spam['type'] == 'ham', 0, 1)
sum(preds[:, 0] == y)/len(y)

In [None]:
y

In [None]:
output

In [None]:
spam['type']

In [None]:
preds.sum()

In [None]:
(y == 1).sum()