# Text Classification and Recurrent Neural Networks



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pip install datasets







Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset

ds = load_dataset("Paul/hatecheck")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df = ds['test'].to_pandas()

In [5]:
df['test_case'].head()

0              I hate women. 
1       I hate trans people. 
2         I hate gay people. 
3       I hate black people. 
4    I hate disabled people. 
Name: test_case, dtype: object

In [6]:
df['label_gold'].value_counts()

label_gold
hateful        2563
non-hateful    1165
Name: count, dtype: int64

### Encoding the Text



In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
cvect = CountVectorizer()

In [9]:
dtm = cvect.fit_transform(df['test_case'])

In [10]:
pd.DataFrame(dtm.toarray(), columns=cvect.get_feature_names_out()).head()

Unnamed: 0,2020,4ssholes,abhor,about,absolute,absolutefilth,absolutely,academics,accepted,accountants,...,writing,wrong,yeah,years,you,your,yours,yourself,yourselves,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X = dtm
y = df['label_gold']

### Problem

Split the data and build a random forest classifier on the training data.  Compare the train and test scores.  

- What words were most important in making the classifications?
- What elements of the CountVectorizer might you change or grid search to attempt to improve your model?

# Text Classification with Neural Networks

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'torch'

In [247]:
y = np.where(y == 'hateful', 1, 0)

In [248]:
Xt = torch.tensor(X.todense(), dtype = torch.float32)
yt = torch.tensor(y, dtype = torch.float32)

In [249]:
yt

tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [250]:
model = nn.Sequential(nn.Linear(in_features=Xt.shape[1], out_features=1),
                      nn.Sigmoid())

In [251]:
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [252]:
for epoch in range(100):
  yhat = model(Xt)
  loss = loss_fn(yhat, yt.unsqueeze(1))
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {loss.item()}')

Epoch 0 Loss: 0.6889792680740356
Epoch 10 Loss: 0.5407963991165161
Epoch 20 Loss: 0.457720011472702
Epoch 30 Loss: 0.39816051721572876
Epoch 40 Loss: 0.3553771674633026
Epoch 50 Loss: 0.3219773471355438
Epoch 60 Loss: 0.295131117105484
Epoch 70 Loss: 0.27274569869041443
Epoch 80 Loss: 0.25362011790275574
Epoch 90 Loss: 0.23695124685764313


In [253]:
preds = torch.where(model(Xt) > 0.5, 1, 0)

In [254]:
(preds.flatten() == yt).sum()/len(yt)

tensor(0.9372)

In [255]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [256]:
#create a tokenizer
tokenizer = Tokenizer(num_words = 500)

In [257]:
#fit the tokenizer
tokenizer.fit_on_texts(df['test_case'])

In [258]:
#create document term matrix (binarized)
dtm = tokenizer.texts_to_matrix(df['test_case'])

In [259]:
#take a peek
dtm

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [260]:
df['test_case'][2]

'I hate gay people. '

In [261]:
X = dtm

In [262]:
from sklearn.model_selection import train_test_split

In [263]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [264]:
X_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [265]:
X_train = torch.tensor(X_train, dtype = torch.float)
X_test = torch.tensor(X_test, dtype = torch.float)
y_train = torch.tensor(y_train, dtype = torch.float)
y_test = torch.tensor(y_test, dtype = torch.float)

In [266]:
#model definition
class TextModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(in_features = 500, out_features = 100)
    self.lin2 = nn.Linear(100, 100)
    self.lin3 = nn.Linear(100, 1)
    self.sigmoid = nn.Sigmoid()
    self.act = nn.ReLU()

  def forward(self, x):
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    return self.sigmoid(self.lin3(x))




In [267]:
#ingredients
model = TextModel()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [268]:
#evaluate
for epoch in range(100):
  yhat = model(X_train)
  y = y_train.reshape(-1, 1)
  loss = loss_fn(yhat, y)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 60.102640464901924
Epoch 10 Loss: 60.102640464901924
Epoch 20 Loss: 60.102640464901924
Epoch 30 Loss: 60.102640464901924
Epoch 40 Loss: 60.102640464901924
Epoch 50 Loss: 60.102640464901924
Epoch 60 Loss: 60.102640464901924
Epoch 70 Loss: 60.102640464901924
Epoch 80 Loss: 60.102640464901924
Epoch 90 Loss: 60.102640464901924


In [269]:
Xt = torch.tensor(X_test, dtype = torch.float)

  Xt = torch.tensor(X_test, dtype = torch.float)


In [270]:
output = model(Xt) #model predictions

In [271]:
output[:5]

tensor([[8.3895e-13],
        [6.5180e-06],
        [9.9800e-01],
        [1.0000e+00],
        [6.9652e-16]], grad_fn=<SliceBackward0>)

In [272]:
#Converting probabilities to prediction
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [273]:
preds.shape

(746, 1)

In [274]:
y_test.shape

torch.Size([746])

In [275]:
(preds.flatten() == y_test.flatten()).sum()/len(y_test)

tensor(0.9651)

### Basic RNN

![](https://karpathy.github.io/assets/rnn/diags.jpeg)

[Source](https://karpathy.github.io/2015/05/21/rnn-effectiveness/)

In [276]:
#create sequences
sequences = tokenizer.texts_to_sequences(df['test_case'])

In [277]:
#look at first sequence
sequences[0]

[5, 96, 22]

In [278]:
#compare to text
df['test_case'].values[1]

'I hate trans people. '

In [279]:
#pad and make all same length
sequences = pad_sequences(sequences, maxlen=30)

In [280]:
#examine results
sequences[1].shape

(30,)

In [281]:
sequences[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  5, 96, 15,  1], dtype=int32)

In [282]:
#example rnn
rnn = nn.RNN(input_size = 30,
             hidden_size = 30,
             num_layers = 1,
             batch_first = True)

In [283]:
#pass data through
sample_sequence = torch.tensor(sequences[1],
                               dtype = torch.float,
                               ).reshape(1, -1)
sample_sequence.shape

torch.Size([1, 30])

In [284]:
#output
output, hidden = rnn(sample_sequence)

In [285]:
#hidden
hidden

tensor([[ 0.9990, -1.0000,  1.0000, -0.6773, -1.0000, -1.0000, -0.9506, -0.1827,
          1.0000, -1.0000, -1.0000, -1.0000,  1.0000,  1.0000,  1.0000, -1.0000,
         -0.8065,  0.9974, -0.6964, -1.0000,  0.7627,  1.0000, -1.0000, -1.0000,
          1.0000,  0.9879,  1.0000,  1.0000, -1.0000,  1.0000]],
       grad_fn=<SqueezeBackward1>)

In [286]:
#linear layer
output.shape

torch.Size([1, 30])

In [287]:
#pass through linear
lin1 = nn.Linear(in_features = 30, out_features = 1)

In [288]:
#output to probability
lin1(output)

tensor([[0.8009]], grad_fn=<AddmmBackward0>)

In [289]:
torch.sigmoid(lin1(output))

tensor([[0.6902]], grad_fn=<SigmoidBackward0>)

In [290]:
X_train.shape

torch.Size([2982, 500])

In [291]:
y_train.shape

torch.Size([2982])

In [292]:
# model = nn.Sequential(nn.RNN(input_size = 30,hidden_size = 30, num_layers = 2, batch_first = True),
#                       nn.Linear(in_features = 30, out_features = 1),
#                       nn.Sigmoid())

In [315]:
#class
class BasicRNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.RNN(input_size = 30,
                    hidden_size = 30,
                    num_layers = 3,
                    batch_first = False)
    self.lin1 = nn.Linear(in_features = 30, out_features=30)
    self.lin2 = nn.Linear(30, 100)
    self.lin3 = nn.Linear(100, 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.act(self.lin2(x))
    x = self.sigmoid(self.lin3(x))
    return x


In [316]:
y = np.where(df['label_gold'] == 'hateful', 1, 0)

In [317]:
X.shape

torch.Size([3728, 30])

In [318]:
y.shape

(3728,)

In [319]:
X = sequences
X = torch.tensor(X, dtype = torch.float)
y = torch.tensor(y, dtype = torch.float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [320]:
#optimizer and loss
model = BasicRNN()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [331]:
#train
for epoch in range(20):
    yhat = model(X_train)
    y = y_train.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
    if epoch % 10 == 0:
      print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 66.96763409674168
Epoch 10 Loss: 72.60977609455585


In [332]:
Xt = torch.tensor(X_test, dtype = torch.float)

  Xt = torch.tensor(X_test, dtype = torch.float)


In [333]:
output = model(Xt)

In [334]:
preds = np.where(np.array(output.detach()) >= .5, 1, 0)

In [336]:
y_test

tensor([1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0.,
        1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0.,
        1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
        1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
        0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0.,
        0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0.,
        1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
        1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1.,
        1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1.,
        0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 0., 

In [337]:
sum(preds.flatten() == y_test.flatten())/len(y_test)

tensor(0.6139)

Additional improvements to the RNN include the LSTM and GRU layers -- examples at end of notebook.

### Pretrained Models and HuggingFace

- [Huggingface](https://huggingface.co/)
- [Chronos Paper](https://arxiv.org/abs/2403.07815)

In [None]:
pip install git+https://github.com/amazon-science/chronos-forecasting.git

In [None]:
import torch
from chronos import ChronosPipeline

pipeline = ChronosPipeline.from_pretrained(
  "amazon/chronos-t5-large",
  device_map="cuda",
  torch_dtype=torch.bfloat16,
)

df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")


In [None]:
# context must be either a 1D tensor, a list of 1D tensors,
# or a left-padded 2D tensor with batch as the first dimension
context = torch.tensor(df["#Passengers"])
prediction_length = 12
forecast = pipeline.predict(context, prediction_length)  # shape [num_series, num_samples, prediction_length]

In [None]:
# visualize the forecast
forecast_index = range(len(df), len(df) + prediction_length)
low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

plt.figure(figsize=(8, 4))
plt.plot(df["#Passengers"], color="royalblue", label="historical data")
plt.plot(forecast_index, median, color="tomato", label="median forecast")
plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
plt.legend()
plt.grid();

### Problem

Explore the pretrained models available and try to find one that is either of relevance to your final paper or just of general interest.  Load and use the model in an example -- even just the docs!

#### LSTM

In [338]:
class TextDataset(Dataset):
  def __init__(self, X, y):
    super().__init__()
    self.x = torch.tensor(X, dtype = torch.float)
    self.y = torch.tensor(y, dtype = torch.float)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [350]:
X_train.shape

torch.Size([2982, 30])

In [339]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

  self.x = torch.tensor(X, dtype = torch.float)
  self.y = torch.tensor(y, dtype = torch.float)


In [351]:
#dataset and loader
trainloader = DataLoader(train_dataset, batch_size = 32)
#dataset and loader
testloader = DataLoader(test_dataset, batch_size = 32)

In [371]:
# nn.LSTM()
class BasicLSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.LSTM(input_size = 30,
                    hidden_size = 100,
                    num_layers = 1,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 100, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.act = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, (hn, cn)= self.rnn(x)
    x = self.act(self.lin1(x))
    x = self.lin2(x)
    return self.sigmoid(x)
    return x

In [378]:
model = BasicLSTM()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [379]:
#train
for epoch in range(10):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 58.59839341044426


In [380]:
Xt = torch.tensor(X_test, dtype = torch.float)
output = model(Xt)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
sum(preds[:, 0] == y_test)/len(y_test)

  Xt = torch.tensor(X_test, dtype = torch.float)


tensor(0.7024)

In [387]:
class RNN2(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.GRU(input_size = 30,
                    hidden_size = 30,
                    num_layers = 2,
                    batch_first = True)

    self.lin1 = nn.Linear(in_features = 30, out_features=100)
    self.lin2 = nn.Linear(in_features = 100, out_features = 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x, _ = self.rnn(x)
    x = self.lin1(x)
    x = self.lin2(x)
    return self.sigmoid(x)

In [388]:
model = RNN2()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss_fn = nn.BCELoss()

In [None]:
#train
for epoch in range(100):
  losses = 0
  for x,y in trainloader:
    yhat = model(x)
    y = y.reshape(-1, 1)
    loss = loss_fn(yhat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses += loss.item()
  if epoch % 10 == 0:
    print(f'Epoch {epoch} Loss: {losses}')

Epoch 0 Loss: 59.61756247282028
Epoch 10 Loss: 57.20614293217659
Epoch 20 Loss: 55.79645165801048
Epoch 30 Loss: 55.00879901647568
Epoch 40 Loss: 55.988073855638504
Epoch 50 Loss: 54.665479958057404
Epoch 60 Loss: 53.79632553458214


In [386]:
# Xt = torch.tensor(sequences, dtype = torch.float)
output = model(X_test)
preds = np.where(np.array(output.detach()) >= .5, 1, 0)
sum(preds[:, 0] == y_test.flatten())/len(y_test)

tensor(0.6689)