# Preprocessing

## Tokenization

In [None]:
from torchtext.data.utils import get_tokenizer

# Build torchtext's 'basic_english' tokenizer and apply it to a sample text.
tokenizer = get_tokenizer('basic_english')
sample_text = 'I am reading a book now. I love to read books!'
tokens = tokenizer(sample_text)
print(tokens)

## Removing stopwords

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# A set keeps the membership test below cheap.
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not an English stop word.
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)
print(filtered_tokens)

## Stemming

In [None]:
from nltk.stem import PorterStemmer 

# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)

## Rare word removal

In [None]:
from nltk.probability import FreqDist 

# Count how many times each stem occurs.
freq_dist = FreqDist(stemmed_tokens)

# Stems occurring fewer than `threshold` times are considered rare and dropped.
threshold = 2
common_tokens = list(
    filter(lambda stem: freq_dist[stem] >= threshold, stemmed_tokens)
)
print(common_tokens)

## Encoding text data

### One Hot Encoding

In [None]:
import torch 

# Toy vocabulary: each word is paired with one row of an identity matrix.
vocab = ['cat', 'dog', 'rabbit']
vocab_size = len(vocab)

# Row i of the identity matrix is the one-hot vector of the i-th word.
one_hot_vectors = torch.eye(vocab_size)
one_hot_dict = dict(zip(vocab, one_hot_vectors))
print(one_hot_dict)

### Bag of Words 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents used to demonstrate bag-of-words counts.
corpus = [
    'Primer documento.',
    'Otro documento mas, y ya van dos documentos. Cuantos documentos mas necesitamos?',
    'Un ultimo documento para terminar la coleccion de documentos',
]

# Learn the vocabulary and build the document-term count matrix in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Densify the matrix for display, then show which term each column stands for.
dense_counts = X.toarray()
print(dense_counts)
print(vectorizer.get_feature_names_out())


### TF-IDF 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 

# Same three documents as in the bag-of-words example, now weighted by TF-IDF.
corpus = [
    'Primer documento.',
    'Otro documento mas, y ya van dos documentos. Cuantos documentos mas necesitamos?',
    'Un ultimo documento para terminar la coleccion de documentos',
]

# Learn the vocabulary and compute the TF-IDF matrix in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Round to two decimals purely for readable output.
tfidf_matrix = X.toarray()
print(np.round(tfidf_matrix, 2))
print(vectorizer.get_feature_names_out())


### Embeddings

In [None]:
# Read the whole Shakespeare corpus into memory as one string.
with open('../data/shakespeare.txt', mode='r') as corpus_file:
    raw_text = corpus_file.read()

import re


def extract_sentences(data):
    """Return the sentence-like spans found in *data*.

    A span starts at an uppercase ASCII letter and runs up to and including
    the next '.', '!' or '?'. Text before the first capital letter, and any
    trailing text without closing punctuation, is skipped. Because the
    pattern only excludes the three terminators, a span may contain newlines.

    Args:
        data: The text to scan.

    Returns:
        A list of matched substrings in order of appearance (empty when
        nothing matches).
    """
    return re.findall(r'[A-Z][^.!?]*[.!?]', data)

# Split the raw corpus text into a list of sentence strings.
shakespeare = extract_sentences(data=raw_text)

In [None]:
# Shared resources read by preprocess_sentences: a Porter stemmer, the
# 'basic_english' tokenizer and the set of English stop words.
stemmer = PorterStemmer()
tokenizer = get_tokenizer("basic_english")
stop_words = set(stopwords.words("english"))

# Sentence-level cleaning: lowercase, tokenize, drop stop words, stem, re-join
def preprocess_sentences(sentences):
    """Clean every sentence and return the cleaned strings in order.

    Relies on the module-level ``tokenizer``, ``stop_words`` and ``stemmer``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one string per input sentence: its stemmed,
        stop-word-free tokens joined by single spaces.
    """
    def _clean(text):
        # Lowercase first so the stop-word lookup is case-insensitive.
        words = tokenizer(text.lower())
        stems = (stemmer.stem(word) for word in words if word not in stop_words)
        return ' '.join(stems)

    return [_clean(text) for text in sentences]

# Clean every extracted sentence, then preview the first five results.
processed_shakespeare = preprocess_sentences(shakespeare)
preview = processed_shakespeare[:5]
print(preview)

In [None]:
# Define your Dataset class
from torch.utils.data import Dataset


class ShakespeareDataset(Dataset):
    """Map-style dataset that exposes an indexable collection unchanged.

    Args:
        data: Any object supporting ``len()`` and integer indexing; in this
            notebook it is the encoded sentence matrix.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item stored at position *idx*."""
        return self.data[idx]

# Bag-of-words encoding of the cleaned sentences
def encode_sentences(sentences):
    """Fit a fresh CountVectorizer on *sentences* and encode them.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A ``(counts, vectorizer)`` tuple: the dense document-term count
        array and the fitted vectorizer that produced it.
    """
    bow = CountVectorizer()
    dense_counts = bow.fit_transform(sentences).toarray()
    return dense_counts, bow
    
# Complete the text processing pipeline
def text_processing_pipeline(sentences, *, batch_size=2, shuffle=True):
    """Clean, encode and batch *sentences*.

    The sentences are passed through ``preprocess_sentences``, encoded with
    ``encode_sentences``, wrapped in a ``ShakespeareDataset`` and served by a
    ``DataLoader``.

    Args:
        sentences: Iterable of raw sentence strings. Cleaning happens here,
            so callers should not clean them beforehand.
        batch_size: Number of encoded sentences per batch (default 2).
        shuffle: Whether the loader reshuffles the data on every pass
            (default True).

    Returns:
        A ``(dataloader, vectorizer)`` tuple: the loader over the encoded
        sentences and the fitted vectorizer used to encode them.
    """
    # Imported here because the notebook never imports DataLoader elsewhere.
    from torch.utils.data import DataLoader

    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = ShakespeareDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader, vectorizer

# Pass the raw sentences: text_processing_pipeline calls preprocess_sentences
# itself, so feeding it processed_shakespeare would filter and stem twice.
dataloader, vectorizer = text_processing_pipeline(shakespeare)


In [None]:
# Show the first 500 feature names of the vectorizer, then the first 50
# components of the first item in one batch drawn from the dataloader.
feature_names = vectorizer.get_feature_names_out()
print(feature_names[:500])
first_batch = next(iter(dataloader))
print(first_batch[0, :50])