# Text Representation Techniques

# Traditional Techniques

## 1. One Hot Encoder

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Toy vocabulary: the three words that get one-hot encoded in this section.
vocabulary = [
    "cat",
    "dog",
    "bird",
]

In [None]:
# Reshape the vocabulary into a single column (one row per word, one feature),
# which is the 2-D layout the encoder is fitted on in the next cell.
words = np.array(vocabulary).reshape(-1, 1)
# Print the reshaped array itself. The original printed `text`, which is not
# defined until the BERT section and raises NameError on a fresh kernel.
print(words)

[['cat']
 ['dog']
 ['bird']]


In [None]:
# Create a dense-output encoder and learn the categories from the word column.
encoder = OneHotEncoder(sparse_output=False).fit(words)
# Encode the very same words: one row per word, one column per learned category.
one_hot_encoded = encoder.transform(words)

In [None]:
# Pair each word with its encoded row and print one "word: vector" line per word.
# The hot column follows the encoder's category order rather than the order of
# `vocabulary` (the output shows bird taking the first column).
encoded_lines = [f"{word}: {encoding}" for word, encoding in zip(vocabulary, one_hot_encoded)]
for line in encoded_lines:
    print(line)

cat: [0. 1. 0.]
dog: [0. 0. 1.]
bird: [1. 0. 0.]


## 2. Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Three short sentences used as the Bag of Words demo corpus.
corpus = ["I love elephants", "Elephants are great", "Elephants are the best"]

In [None]:
# initialize CountVectorizer (no arguments, so its default settings apply)
vectorizer = CountVectorizer()

In [None]:
# fit the vectorizer on the corpus and transform the corpus into its
# Bag of Words representation; X is a sparse matrix (densified further down)
X = vectorizer.fit_transform(corpus)

In [None]:
# get feature names: the vocabulary learned from the corpus.
# The printed output shows the terms lowercased and with no entry for "I".
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['are' 'best' 'elephants' 'great' 'love' 'the']


In [None]:
# convert the sparse matrix to a dense array so the counts can be printed;
# row i belongs to corpus[i] and the columns line up with feature_names
bow_representation = X.toarray()
print(bow_representation)

[[0 0 1 0 1 0]
 [1 0 1 1 0 0]
 [1 1 1 0 0 1]]


In [None]:
# Show the learned vocabulary, then every sentence next to its count vector.
print("Vocabulary:", feature_names)
print("\nBag of Words Representation:")
for i in range(len(corpus)):
    sentence = corpus[i]
    print(f"{sentence} → {bow_representation[i]}")

Vocabulary: ['are' 'best' 'elephants' 'great' 'love' 'the']

Bag of Words Representation:
I love elephants → [0 0 1 0 1 0]
Elephants are great → [1 0 1 1 0 0]
Elephants are the best → [1 1 1 0 0 1]


## 3. Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Demo corpus for TF-IDF: the same three sentences used in the Bag of Words section.
corpus = ["I love elephants", "Elephants are great", "Elephants are the best"]

In [None]:
# initialize TfidfVectorizer (no arguments, so its default settings apply);
# this rebinds `vectorizer`, replacing the CountVectorizer from the BoW section
vectorizer = TfidfVectorizer()

In [None]:
# fit the vectorizer on the corpus and transform the corpus into TF-IDF weights;
# X is a sparse matrix (densified further down) and replaces the BoW matrix
X = vectorizer.fit_transform(corpus)

In [None]:
# get feature names: the vocabulary learned from the corpus
# (the printed terms are the same six as in the Bag of Words section)
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['are' 'best' 'elephants' 'great' 'love' 'the']


In [None]:
# convert the sparse matrix to a dense array so the weights can be printed;
# row i belongs to corpus[i] and the columns line up with feature_names.
# In the output, "elephants" (present in every sentence) has the lowest
# non-zero weight in each row.
tfidf_representation = X.toarray()
print(tfidf_representation)

[[0.         0.         0.50854232 0.         0.861037   0.        ]
 [0.54783215 0.         0.42544054 0.72033345 0.         0.        ]
 [0.44451431 0.5844829  0.34520502 0.         0.         0.5844829 ]]


In [None]:
# Show the learned vocabulary, then every sentence next to its TF-IDF vector.
print("Vocabulary:", feature_names)
print("\nTF-IDF Representation:")
for i in range(len(corpus)):
    sentence = corpus[i]
    print(f"{sentence} → {tfidf_representation[i]}")

Vocabulary: ['are' 'best' 'elephants' 'great' 'love' 'the']

TF-IDF Representation:
I love elephants → [0.         0.         0.50854232 0.         0.861037   0.        ]
Elephants are great → [0.54783215 0.         0.42544054 0.72033345 0.         0.        ]
Elephants are the best → [0.44451431 0.5844829  0.34520502 0.         0.         0.5844829 ]


# Modern Techniques

## Word Embeddings

## 1. Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Download NLTK's 'punkt_tab' tokenizer data (the log below shows it being unzipped
# under tokenizers/).
# NOTE(review): presumably this is the resource word_tokenize relies on — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample corpus
# Note the mixed casing ("Machine" vs "machine"): nothing lowercases the text
# before training, so both spellings show up as separate tokens in the
# tokenizer output and in the similarity results.
corpus = [
    "I love NLP and NLP is fun",
    "Machine learning is amazing",
    "NLP and machine learning are part of AI"
]

In [None]:
# Split every sentence into word tokens with NLTK, keeping the original casing.
tokenized_corpus = list(map(word_tokenize, corpus))
tokenized_corpus

[['I', 'love', 'NLP', 'and', 'NLP', 'is', 'fun'],
 ['Machine', 'learning', 'is', 'amazing'],
 ['NLP', 'and', 'machine', 'learning', 'are', 'part', 'of', 'AI']]

In [None]:
# Train a small Word2Vec model on the tokenized sentences.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,  # length of each word vector
    window=2,        # maximum distance between the target word and a context word
    min_count=1,     # keep every token, even those seen only once
    sg=1,            # 1 selects the skip-gram training algorithm
)

In [None]:
# Get the word embedding for 'NLP'. The key must match the token's casing:
# the corpus was not lowercased, so the vocabulary holds 'NLP', not 'nlp'.
print("Vector representation for 'NLP':")
print(model.wv['NLP'])

Vector representation for 'NLP':
[-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]


In [None]:
# Find the words most similar to 'NLP'; the result is a list of
# (word, similarity score) pairs, highest score first in the output below
print("Words similar to 'NLP':")
print(model.wv.most_similar('NLP'))

Words similar to 'NLP':
[('learning', 0.5436005592346191), ('Machine', 0.43182313442230225), ('machine', 0.3792896568775177), ('is', 0.3004249036312103), ('fun', 0.22743143141269684), ('and', 0.10494352877140045), ('are', -0.13091355562210083), ('part', -0.18975044786930084), ('AI', -0.2243170291185379), ('I', -0.23748816549777985)]


## 2. GloVe (Global Vectors for Word Representation)

In [None]:
import gensim.downloader as api

In [None]:
# load pre-trained GloVe word embeddings through gensim's downloader API
# NOTE(review): the name suggests 100-dimensional vectors trained on
# Wikipedia + Gigaword, fetched over the network on first use — confirm.
glove_model = api.load("glove-wiki-gigaword-100")



In [None]:
# get the vector for word "king" by indexing the model with the word string,
# then print the raw embedding values
king_vector = glove_model['king']
print(king_vector)

[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -0.98878

In [None]:
# find words similar to "rohit"; the result is a list of
# (word, similarity score) pairs, as the printed output shows
similar_words = glove_model.most_similar('rohit')
print(similar_words)

[('raina', 0.7623310685157776), ('yuvraj', 0.7188029289245605), ('virat', 0.714231014251709), ('suresh', 0.709095299243927), ('sharma', 0.6950598359107971), ('gambhir', 0.677060067653656), ('dinesh', 0.6754335761070251), ('karthik', 0.6737046837806702), ('gautam', 0.6715025901794434), ('kohli', 0.661833643913269)]


## Contextualized Word Embeddings

## 1. ELMo (Embeddings from Language Models)

In [None]:
# NOTE(review): `allennlp.commands.elmo` appears to exist only in older AllenNLP
# releases — confirm the installed allennlp version before running this cell
# (no captured output is stored for it in this notebook).
from allennlp.commands.elmo import ElmoEmbedder

# Load ELMo pre-trained model
# NOTE(review): constructing the embedder with no arguments presumably fetches
# default pre-trained weights — confirm.
elmo = ElmoEmbedder()

# Get word embeddings for a sentence (passed in already tokenized, one string per word)
sentence = ["I", "love", "NLP"]
embedding = elmo.embed_sentence(sentence)

# Print shape of embeddings (3 layers, words, embedding size)
print(embedding.shape)  # (3, 3, 1024)

# Extract the final ELMo embedding for the word "NLP":
# index -1 picks the last layer, index 2 picks the third token of `sentence`
nlp_vector = embedding[-1][2]  # Last layer's embedding for 'NLP'
print(nlp_vector.shape)  # (1024,)


## 2. BERT (Bidirectional Encoder Representations from Transformers)

In [1]:
from transformers import BertTokenizer, BertModel

In [2]:
# Load the pre-trained BERT tokenizer and encoder from one shared checkpoint name,
# so the two can never drift apart.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# example sentence that is tokenized and passed through BERT in the next cells
text = 'Rohit Sharma is great Indian captain India ever had.'

In [8]:
# tokenize the text and convert into tensors (return_tensors='pt');
# the printed result holds input_ids, token_type_ids and attention_mask,
# each with a leading batch dimension of 1 and 13 positions for this sentence
inputs = tokenizer(text, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101, 20996, 16584, 14654,  2003,  2307,  2796,  2952,  2634,  2412,
          2018,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [9]:
# Get embeddings from BERT.
# This is inference only, so run the forward pass with autograd disabled: it
# avoids building a gradient graph (the original output carried grad_fn=...) and
# matches how the GPT-2 section runs its model.
# torch is imported here because the notebook's only other `import torch` sits in
# the later GPT-2 section, which has not run yet on a fresh kernel.
import torch

with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5845, -0.1094, -0.0036,  ..., -0.5498,  1.3696,  0.0747],
         [ 0.2960, -0.9703, -0.0999,  ...,  0.1414,  0.8264, -1.2552],
         [ 0.5845, -1.2327, -0.0246,  ..., -0.7282,  0.0314, -1.6449],
         ...,
         [-0.3901, -0.4430, -0.6670,  ..., -0.3343, -0.1105, -0.0956],
         [-0.3747, -0.8162, -0.0424,  ...,  0.5324,  0.9964, -0.8633],
         [ 0.8201,  0.1030, -0.1315,  ..., -0.0158, -0.3773, -0.2751]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8340, -0.6524, -0.9825,  0.8098,  0.8601, -0.1653,  0.6097,  0.4162,
         -0.9247, -1.0000, -0.7535,  0.9737,  0.9636,  0.8156,  0.7490, -0.7862,
         -0.6177, -0.5976,  0.5597,  0.3024,  0.7904,  1.0000, -0.4806,  0.3569,
          0.5712,  0.9958, -0.8670,  0.8367,  0.9108,  0.7398, -0.6173,  0.4795,
         -0.9896, -0.2559, -0.9894, -0.9876,  0.6727, -0.4899,  0.0371, -0.2646,
         -0.7660,  0.4151,  1.00

In [10]:
# extract last hidden state: one vector per input position;
# the printed shape is (batch, positions, hidden size) = (1, 13, 768) here
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

torch.Size([1, 13, 768])


## 3. GPT (Generative Pre-trained Transformer)

In [16]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [12]:
# Load the GPT-2 tokenizer and model from one shared checkpoint name.
# These rebind `tokenizer` and `model`, replacing the BERT objects loaded earlier.
GPT2_CHECKPOINT = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
model = GPT2Model.from_pretrained(GPT2_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [14]:
# tokenize input: encode() yields just the token ids as a tensor with a batch
# dimension (11 ids for this sentence), not the dict of tensors seen in the
# BERT section
tokens = tokenizer.encode('Rohit Sharma is my favourite cricketer.', return_tensors='pt')
print(tokens)

tensor([[   49,  1219,   270, 40196,   318,   616, 12507,  1067,   624,  2357,
            13]])


In [17]:
# get embeddings; the forward pass runs under no_grad() because this is
# inference only and no gradients are needed
with torch.no_grad():
    outputs = model(tokens)

In [18]:
# extract last hidden state: one vector per input token;
# the printed shape is (batch, tokens, hidden size) = (1, 11, 768) here
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

torch.Size([1, 11, 768])
