# Text Processing Pipeline 

Raw data → preprocessing → encoding → Dataset and DataLoader



In [None]:
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from icecream import ic

In [None]:
%conda install nltk

# Preprocessing 

Preprocessing reduces the number of features, providing cleaner and more representative datasets.

## Tokenization 

Tokens can be words, parts of words or punctuation


In [None]:


# Build the 'basic_english' tokenizer from torchtext and split a sample
# sentence into tokens.
tokenizer = get_tokenizer('basic_english') 
tokens = tokenizer('I am reading a book now. I love to read books!') 
# Show the resulting token list.
ic(tokens)

## Stop word removal 

Eliminate common words that do not contribute to the meaning.

Stop words: 'a', 'the', 'or' and more

In [None]:
import nltk 

nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stop word.
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)

ic(filtered_tokens)

## Stemming

Reduce words to their base form 

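For example: *running* and *runs* become **run** (an irregular form such as *ran* is not reduced by a stemmer; mapping it to **run** requires lemmatization)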

In [None]:
import nltk 
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to every token that survived stop-word removal.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

print(stemmed_tokens)



## Rare word removal 

Removing infrequent words that don't add value

In [None]:
from nltk.probability import FreqDist 

# A token is kept only when it occurs more than `threshold` times.
threshold = 1
freq_dist = FreqDist(stemmed_tokens)

common_tokens = list(
    filter(lambda tok: freq_dist[tok] > threshold, stemmed_tokens)
)

print(common_tokens)

# Encoding Text Data

Converts text into machine-readable numbers 

Enables analysis and modeling 

## One Hot Encoding 

Mapping each word to a distinct vector 

Binary Vector: 
- 1 for the presence of a word
- 0 for the absence of a word

In [None]:
import torch 

vocab = ['cat', 'dog', 'rabbit']
vocab_size = len(vocab)

# Each row of the identity matrix is the one-hot vector of one word.
one_hot_vectors = torch.eye(vocab_size)
ic(one_hot_vectors)

# Pair every word with the matrix row at the same position.
one_hot_dict = dict(zip(vocab, one_hot_vectors))
ic(one_hot_dict)

## Bag of words

Treats the text as an unordered collection of words and takes into account the frequency of each word, not the order in which it appears.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents used by the encoding examples.
corpus = [
    'Hey, look at me! I am a document',
    'So you think you are special, first document? I am a document too',
    'Guys, please. We are all documents, ok? Calm down.',
]

# Learn the vocabulary and count each word per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Dense count matrix (one row per document), then the column names.
print(X.toarray())
print(vectorizer.get_feature_names_out())

## TF-IDF (Term Frequency - Inverse Document Frequency) 

Scores the importance of words in a document, taking into account the presence of those words in other documents: words specific to a document are scored higher, and common words present in every document are scored lower.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Rebinds `vectorizer` and `X` to the TF-IDF variant, fitted on the same
# `corpus` list used by the bag-of-words example.
vectorizer = TfidfVectorizer() 

X = vectorizer.fit_transform(corpus)

# Dense TF-IDF matrix, followed by the feature (column) names.
print(X.toarray()) 
print(vectorizer.get_feature_names_out())

## Preprocessing in PyTorch

In [None]:
class TextDataset(Dataset):
    """Minimal map-style dataset wrapping an indexable collection of samples.

    Args:
        data: Any object supporting ``len()`` and integer indexing, such as
            a list or a 2-D array of encoded sentences.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of samples.

        DataLoader's default sampler calls ``len()`` on the dataset to
        enumerate indices, so iteration fails without this method.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]


In [None]:
def preprocess_sentences(sentences):
    """Normalise each sentence and return the cleaned sentences as strings.

    Every sentence is lowercased, tokenized with the module-level
    ``tokenizer``, stripped of stop words, stemmed, filtered by token
    frequency and finally re-joined with single spaces.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        list[str]: One cleaned, space-joined string per input sentence.
    """

    def _clean(sentence):
        words = tokenizer(sentence.lower())
        # Drop stop words first, then stem whatever is left.
        stems = [stemmer.stem(w) for w in words if w not in stop_words]
        counts = FreqDist(stems)
        # With a threshold of 0 every token that is present passes.
        threshold = 0
        return ' '.join(s for s in stems if counts[s] > threshold)

    return [_clean(sentence) for sentence in sentences]

In [None]:
# Try the preprocessing on a one-element list holding a multi-sentence string.
preprocess_sentences(['This is the first text data. And here is another one. What do you think about being just data. This is not the first time i think about it. Just being data.'])

In [None]:
def encode_sentences(sentences):
    """Encode sentences as bag-of-words count vectors.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        tuple: ``(encoded_sentences, vectorizer)`` where the first item is
        the dense count array and the second is the fitted
        ``CountVectorizer``.
    """
    bow = CountVectorizer()
    sparse_counts = bow.fit_transform(sentences)
    return sparse_counts.toarray(), bow

In [None]:
import re

def extract_sentences(data):
    """Split raw text into sentences.

    A sentence is a run of characters that starts with an uppercase ASCII
    letter and ends at the first '.', '!' or '?' that follows it.

    Args:
        data: The text to split.

    Returns:
        list[str]: The matched sentences in order of appearance.
    """
    sentence_pattern = re.compile(r'[A-Z][^.!?]*[.!?]')
    return sentence_pattern.findall(data)

In [None]:
def text_preprocessing_pipeline(text):
    """Clean, encode and batch a collection of sentences.

    Args:
        text: Iterable of raw sentence strings.

    Returns:
        tuple: ``(dataloader, vectorizer)`` - a DataLoader serving the
        encoded sentences in batches of two, and the fitted vectorizer
        that produced the encoding.
    """
    cleaned = preprocess_sentences(text)
    bow_matrix, fitted_vectorizer = encode_sentences(cleaned)
    # Echo the encoded matrix so the notebook shows the intermediate result.
    print(bow_matrix)
    loader = DataLoader(TextDataset(bow_matrix), batch_size=2)
    return loader, fitted_vectorizer


In [None]:
text_data = 'This is the first text data. And here is another one.'

# Split the raw text into individual sentences.
sentences = extract_sentences(text_data)

# The pipeline takes the whole list of sentences and returns a single
# (dataloader, vectorizer) pair, so it is called exactly once. Calling it
# inside a per-sentence comprehension would produce a list of pairs and
# unpack those pairs into the two names instead.
dataloader, vectorizer = text_preprocessing_pipeline(sentences)

# Each item is one batch of encoded sentences.
for data in dataloader:
    print(data)

## Embeddings 

In [None]:
import torch 

from torch import nn 

words = ['The', 'cat', 'sat', 'on', 'the', 'mat']

# Map each word to its position in the list.
word_to_idx = dict(zip(words, range(len(words))))

# Integer (int64) indices are what nn.Embedding looks up.
indices = [word_to_idx[w] for w in words]
inputs = torch.tensor(indices, dtype=torch.long)

# One learnable 10-dimensional vector per word.
embedding = nn.Embedding(num_embeddings=len(words), embedding_dim=10)
output = embedding(inputs)

print(output)

## Recurrent Neural Networks 


In [None]:
from torch.utils.data import Dataset, DataLoader 

class TextDataset(Dataset):
    """Map-style dataset exposing an indexable collection of text items."""

    def __init__(self, text):
        # Stored as-is; no copy or validation is performed.
        self.text = text

    def __len__(self):
        """Number of items in the wrapped collection."""
        n_items = len(self.text)
        return n_items

    def __getitem__(self, idx):
        """Item stored at position ``idx``."""
        item = self.text[idx]
        return item

class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer on the final hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of outputs produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last hidden state for ``x``."""
        # nn.LSTM returns (outputs, (h_n, c_n)); only h_n is needed here.
        _outputs, state = self.lstm(x)
        last_hidden = state[0]
        # Remove the leading layer axis (size 1 for this one-layer LSTM).
        return self.fc(last_hidden.squeeze(0))

class GRUModel(nn.Module):
    """Single-layer GRU followed by a linear layer on the final hidden state.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the GRU hidden state.
        output_size: Number of outputs produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the last hidden state for ``x``."""
        # nn.GRU returns (outputs, h_n). A GRU has no cell state, so h_n is
        # a plain tensor and must not be unpacked like an LSTM state tuple.
        _, hidden = self.gru(x)
        # Remove the leading layer axis (size 1 for this one-layer GRU).
        output = self.fc(hidden.squeeze(0))
        return output

# Text Generation

In [None]:
import torch 
import torch.nn as nn 

data = 'Hello, how are you?'
# Sort the unique characters so the character <-> index mapping is the same
# on every run; the iteration order of a set of strings is not stable
# across interpreter sessions.
chars = sorted(set(data))
ic(chars)

# Lookup tables in both directions between characters and integer ids.
char_to_ix = {char: i for i, char in enumerate(chars)}
ix_to_char = dict(enumerate(chars))

class RNNmodel(nn.Module):
    """Single-layer RNN that maps a batch of sequences to one output per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
        output_size: Number of outputs produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the last time step.

        Args:
            x: Batched input of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Create the initial hidden state on the same device and with the
        # same dtype as the input so the model also works off-CPU or in
        # non-default precision.
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, dtype=x.dtype, device=x.device
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the final time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out

# The network receives one-hot encoded characters and emits one logit per
# character, so both its input and output width must equal the vocabulary
# size rather than 1.
model = RNNmodel(len(chars), 16, len(chars))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [None]:
# Next-character pairs: every character except the last is an input, and
# the character that follows it is the corresponding target.
inputs = list(map(char_to_ix.__getitem__, data[:-1]))
ic(inputs)
targets = list(map(char_to_ix.__getitem__, data[1:]))
ic(targets)

In [None]:
# Reshape to (num_chars, 1) so each character is its own length-1 sequence;
# after one-hot encoding this gives the 3-D (batch, seq, features) tensor
# that the batch_first RNN and its `out[:, -1, :]` indexing require.
inputs = torch.tensor(inputs, dtype=torch.long).view(-1, 1)

In [None]:
# One-hot encode the character ids (one class per vocabulary character) and
# convert to float.
inputs = nn.functional.one_hot(inputs, num_classes=len(chars)).float()
# Targets stay as integer class ids in a long tensor.
targets = torch.tensor(targets, dtype=torch.long)

In [None]:
# Instantiate the loss function
criterion = nn.CrossEntropyLoss()
# Instantiate the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
num_epochs = 100
log_every = 10
model.train()  # the mode does not change inside the loop, so set it once
for epoch in range(num_epochs):
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % log_every == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Test the model
model.eval()
test_char = 'r'
test_input = char_to_ix[test_char]
test_input = nn.functional.one_hot(torch.tensor(test_input).view(-1, 1), num_classes=len(chars)).float()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    predicted_output = model(test_input)
predicted_char_ix = torch.argmax(predicted_output, 1).item()
print(f"Test Input: '{test_char}', Predicted Output: '{ix_to_char[predicted_char_ix]}'")
        
    

In [None]:
# Display the shape of the `inputs` tensor.
inputs.shape

In [None]:
# Scratch cell: an all-zeros tensor of shape (18, 16).
h0 = torch.zeros(18, 16) 

In [None]:
# Display the current value of `h0`.
h0

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel 
# Prompt the language model will be asked to continue.
seed_text = 'Once upon a time'

# Load the pretrained GPT-2 tokenizer and language model; this rebinds the
# notebook-level names `tokenizer` and `model`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Encode the prompt as a PyTorch tensor of token ids.
input_ids = tokenizer.encode(seed_text, return_tensors='pt')

In [None]:
# `temperature` only affects generation when sampling is enabled; without
# `do_sample=True` decoding is greedy and the temperature is ignored.
output = model.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    temperature=0.7,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)
# Turn the first (and only) generated sequence back into text.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

# Evaluating Text Generation 

Text generation tasks create human-like text.

Standard classification metrics such as accuracy and F1 fall short for these tasks.

We need metrics that evaluate the quality of generated text 

BLEU (Bilingual Evaluation Understudy) and ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

## BLEU

Compares the generated text and the reference text.

Checks for the occurrence of n-grams (sequences of n words)

The more matches, the better the score. A perfect score is 1.



In [None]:
from torchmetrics.text import BLEUScore

# One candidate sentence and, for it, a list of two reference sentences.
generated_text = ['The cat is on the mat']
real_text = [['there is a cat on the mat', 'a cat is on the mat']]

# Score the candidate against its references with the default BLEUScore.
bleu = BLEUScore() 
bleu_metric = bleu(generated_text, real_text)
# `.item()` extracts the score as a plain Python number for display.
ic(bleu_metric.item())

## ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

Compares a generated text to a reference text in two ways

ROUGE-N: considers overlapping n-grams in both texts 

ROUGE-L: looks at the longest common subsequence LCS between the texts

ROUGE Metrics: 
- F-measure : Harmonic mean of precision and recall
- Precision: Share of the n-grams in the generated text that also appear in the reference text
- Recall: Share of the n-grams in the reference text that also appear in the generated text

In [None]:
from torchmetrics.text import ROUGEScore 

# Candidate and reference sentences to compare.
generated_text = 'Hello, how are you doing?' 
real_text = 'Hello, how are you?' 

# Default ROUGEScore metric.
rouge = ROUGEScore() 

# The candidate is passed as a one-element list and the reference as a
# one-element list of reference lists.
rouge_score = rouge([generated_text],[[real_text]])
ic(rouge_score)

## Considerations and limitations 

These metrics evaluate word presence, not semantic understanding

Sensitive to the length of the generated text

Quality of reference text is crucial

In [None]:
# Transfer Learning for text classification 



In [None]:
# Tiny labelled dataset for the fine-tuning example.
texts = [
    "I love this!", 
    "This is terrible.", 
    "Amazing experience!", 
    "Not my cup of tea." 
]

# One label per text; presumably 1 = positive and 0 = negative, matching the
# later prediction cell that maps a non-zero argmax to 'positive'.
labels = [1,0,1,0]

import torch 
from transformers import BertTokenizer, BertForSequenceClassification 

# Pretrained BERT tokenizer and a two-label sequence classifier; this
# rebinds the notebook-level names `tokenizer` and `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize all texts as one padded/truncated batch of PyTorch tensors and
# attach the labels to the same mapping.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=32) 
inputs['labels'] = torch.tensor(labels)

In [None]:
# Fine-tune for a single epoch on the whole batch at once.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001)
model.train()
for epoch in range(1): 
    # `inputs` also carries the labels; the loss is read from the model output.
    outputs = model(**inputs) 
    loss = outputs.loss 
    loss.backward() 
    optimizer.step() 
    # Clear the gradients after the update so they do not accumulate.
    optimizer.zero_grad()
    print(f'Epoch:{epoch+1}, Loss: {loss.item()}')


In [None]:
text = 'I had an awesome day!'

# Tokenize the single sentence as a batch of one.
input_eval = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

# The fine-tuning loop left the model in training mode, where dropout makes
# predictions non-deterministic. Switch to evaluation mode and disable
# gradient tracking for inference.
model.eval()
with torch.no_grad():
    outputs_eval = model(**input_eval)

In [None]:
# Display the tokenized evaluation input.
input_eval

In [None]:
# Display the raw model output for the evaluation sentence.
outputs_eval

In [None]:
# Turn the logits into probabilities over the labels.
predictions = torch.nn.functional.softmax(outputs_eval.logits, dim=-1)

# A non-zero argmax is reported as 'positive', index 0 as 'negative'.
if torch.argmax(predictions) > 0:
    predicted_label = 'positive'
else:
    predicted_label = 'negative'

print(f'text: {text} \nSentiment: {predicted_label}')
