# NLTK
___


#### 📦 One-Time Setup (NLTK Resources)

##### Load Dependencies

In [8]:
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
import docx
from gensim.models import Word2Vec, KeyedVectors
import html
import ipywidgets as widgets, IPython, platform, ipywidgets, jupyterlab
from importlib import reload
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
# Import the local processing module and reload it, so the module's code is
# re-executed even if an older copy is already loaded in this kernel.
import processing as pc
reload( pc )
# Import Text only after the reload so the name binds to the reloaded class.
from processing import Text

In [10]:
# fp  : single text file loaded by the cleaning cell.
# src : folder passed as the first argument to tx.chunk_datasets.
# dst : folder passed as the second argument to tx.chunk_datasets.
# NOTE(review): these are absolute paths on one Windows machine; they must be
# edited before the notebook can run anywhere else.
fp = r'C:\Users\terry\Desktop\Test\Text\Balanced Budget and Emergency Deficit Control Act of 1985.txt'
src = r'C:/Users/terry/Desktop/Test/Text/'
dst = r'C:/Users/terry/Desktop/Test/Corpora/'
# Text helper from the local processing module; every tx.* call below uses it.
tx = Text( )

In [5]:
# Run the file at `fp` through the Text helper's steps in sequence. Each step
# takes the previous step's result, so the order of these lines matters.
# What each helper does is defined in processing.Text and is not visible here.
# NOTE(review): compress_whitespace is called twice, once right after
# collapse_whitespace and again after remove_fragments; confirm both are needed.
# NOTE(review): the result of split_sentences is named `dataframe`; presumably
# a pandas DataFrame, but verify against processing.Text.split_sentences.
text = tx.load_text( fp )
collapsed = tx.collapse_whitespace( text )
compressed = tx.compress_whitespace( collapsed )
normalized = tx.normalize_text( compressed )
encoded = tx.remove_encodings( normalized )
special = tx.remove_special( encoded )
cleaned = tx.remove_fragments( special )
recompress = tx.compress_whitespace( cleaned )
dataframe = tx.split_sentences( recompress )

In [11]:
# Hand the source and destination folders to the Text helper. The return value
# is discarded, so the call is made for its side effects.
# NOTE(review): presumably reads files under `src` and writes chunked output
# under `dst`; confirm in processing.Text.chunk_datasets.
tx.chunk_datasets( src, dst )

### 🧮 1. Bag of Words (BoW) using CountVectorizer

In [None]:
# Two toy documents to vectorize.
corpus = [
    'Bro loves clean code.',
    'Code is life.',
]

# Learn the vocabulary and build the document-term count matrix in one call.
vectorizer = CountVectorizer( )
X = vectorizer.fit_transform( corpus )

# Show the learned vocabulary, then the dense count matrix on the next line.
print( vectorizer.get_feature_names_out( ), X.toarray( ), sep='\n' )


### 📊 2. TF-IDF using TfidfVectorizer

In [None]:
# Two toy documents to vectorize.
corpus = [
    'Bro writes awesome code.',
    'Code must be clean and clear.',
]

# Learn the vocabulary and IDF weights, and build the TF-IDF matrix in one call.
vectorizer = TfidfVectorizer( )
X = vectorizer.fit_transform( corpus )

# Show the learned vocabulary, then the dense TF-IDF matrix on the next line.
print( vectorizer.get_feature_names_out( ), X.toarray( ), sep='\n' )


### 🧠 3. Word2Vec using gensim

In [None]:
# Tokenized toy corpus: one list of tokens per sentence.
sentences = [ [ 'bro', 'loves', 'python' ], [ 'clean', 'code', 'rocks' ] ]

# Train with a fixed seed and a single worker thread so the printed vector is
# repeatable from run to run. Multi-threaded training adds thread-scheduling
# jitter that the seed alone cannot remove. (Reproducibility across separate
# interpreter launches additionally needs PYTHONHASHSEED to be set.)
model = Word2Vec( sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42 )

# Vector for the word 'bro'
vector = model.wv[ 'bro' ]
print( vector )


### 🌍 4. GloVe using gensim (with pre-trained vectors)

In [None]:
# Load pre-trained GloVe vectors from their plain-text file.
glove_file = r'C:\Users\terry\source\llm\glove\glove.6B.100d.txt'

# By default load_word2vec_format expects a "<vocab_size> <vector_size>" header
# as the first line. Raw GloVe downloads do not have one, so peek at the first
# line and tell gensim whether a header is present. This lets both a raw GloVe
# file and one already converted to word2vec format load without manual steps.
with open( glove_file, 'r', encoding='utf-8', errors='ignore' ) as handle:
    first_fields = handle.readline( ).split( )
has_header = len( first_fields ) == 2 and all( f.isdigit( ) for f in first_fields )

# no_header requires gensim 4.x, which this notebook already targets.
model = KeyedVectors.load_word2vec_format(
    glove_file,
    binary=False,
    no_header=not has_header,
    unicode_errors='ignore',
)

# Vector for the word 'code'
vector = model[ 'code' ]
print( vector )

### 🤖 5. BERT / Transformer-based Embeddings using transformers + torch


In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained tokenizer and encoder weights for bert-base-uncased.
tokenizer = BertTokenizer.from_pretrained( 'bert-base-uncased' )
model = BertModel.from_pretrained( 'bert-base-uncased' )

sentence = "Bro's code always works."
inputs = tokenizer( sentence, return_tensors='pt' )

# This is inference only, so disable autograd: no computation graph is built
# and the resulting embedding does not carry requires_grad.
with torch.no_grad( ):
    outputs = model( **inputs )

# Get the vector for [CLS] token (sentence embedding): position 0 of the
# sequence axis, taken for every item in the batch.
sentence_embedding = outputs.last_hidden_state[ :, 0, : ]
print( sentence_embedding.shape )
