In [None]:
# Text representation is to convert the text into the numbers
# the different techniques are: 
# 1. One hot encoding
# 2. Bag of words - Unigram, Bigram, Trigram, n-gram
# 3. TF-IDF - Term Frequency-Inverse Document Frequency - used to reflect the importance/weightage of a word in a document relative to a corpus.
# 4. Word embeddings (Word2Vec, GloVe, FastText)

In [None]:
# 2. Bag of Words - frequency of words in the text matter, but the order of words does not matter.
# Uni-gram: single word
# Author: Muhammad Humayun Khan

import pandas as pd
import numpy as np

# Sample DataFrame - Create a DataFrame with some text data
df = pd.DataFrame({
    'text': ["People watch Youtube", "Youtube watch People", "People write comment", "Youtube write comment"]
})

df

Unnamed: 0,text
0,People watch Youtube
1,Youtube watch People
2,People write comment
3,Youtube write comment


In [None]:
# At first the object of the CountVectorizer class is created, which will be used to convert the text data into a matrix of token counts.
# Now to get the unique words from the text data, I print the vocabulary of the CountVectorizer object.
# The vocabulary displayed all the unique words in order of the index number and index number is shown against each word.
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'])

# Print the vocabulary
print("Vocabulary:", vectorizer.vocabulary_)

# the ouput below showing comment as index 0, People as index 1, watch as index 2 and Youtube as index 3

Vocabulary: {'people': 1, 'watch': 2, 'youtube': 4, 'write': 3, 'comment': 0}


In [6]:
print("Shape of the matrix:", X.shape)

Shape of the matrix: (4, 5)


In [7]:
# now want to print the matrix in array format
print("Matrix in array format:\n", X.toarray())

Matrix in array format:
 [[0 1 1 0 1]
 [0 1 1 0 1]
 [1 1 0 1 0]
 [1 0 0 1 1]]


In [None]:
# lets predict a new text
new_text = ["People watch Youtube and write of comment"]
# Transform the new text using the same vectorizer
vectorizer.transform(new_text).toarray()

# the output below showing that the words "and", "of" are not in the vocabulary, so they are not counted.

array([[1, 1, 1, 1, 1]])

In [None]:
# 2. Bag of Words - bigram: two words together, trigram: three words together, n-gram: n words together
#
# Sample text data
text_data = [
    "People watch Youtube",
    "Youtube watch People",
    "People write comment",
    "Youtube write comment"
]   

# Initialize CountVectorizer with ngram_range for bigrams
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2)) 

# Fit and transform the text data for bigrams
X_bigram = vectorizer_bigram.fit_transform(text_data)

# Print the vocabulary for bigrams
print("Bigram Vocabulary:", vectorizer_bigram.vocabulary_)

Bigram Vocabulary: {'people watch': 0, 'watch youtube': 3, 'youtube watch': 5, 'watch people': 2, 'people write': 1, 'write comment': 4, 'youtube write': 6}


In [11]:
# Trigram example
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3))

# Fit and transform the text data for trigrams
X_trigram = vectorizer_trigram.fit_transform(text_data)

# Print the vocabulary for trigrams
print("Trigram Vocabulary:", vectorizer_trigram.vocabulary_)


Trigram Vocabulary: {'people watch youtube': 0, 'youtube watch people': 2, 'people write comment': 1, 'youtube write comment': 3}


In [12]:
# n-gram example
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))

# Fit and transform the text data for n-grams
X_ngram = vectorizer_ngram.fit_transform(text_data)

# Print the vocabulary for n-grams
print("N-gram Vocabulary:", vectorizer_ngram.vocabulary_)

N-gram Vocabulary: {'people': 1, 'watch': 4, 'youtube': 9, 'people watch': 2, 'watch youtube': 6, 'youtube watch': 10, 'watch people': 5, 'write': 7, 'comment': 0, 'people write': 3, 'write comment': 8, 'youtube write': 11}


In [None]:
# 3. TF-IDF - Term Frequency-Inverse Document Frequency

# Sample text data
text_data = [
    "People watch Youtube",
    "Youtube watch People",
    "People write comment",
    "Youtube write comment"
]

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
X_tfidf = tfidf_vectorizer.fit_transform(text_data).toarray()

# Print the Term Frequencey Matrix for each word
print("TF-IDF Matrix:\n", X_tfidf)  

TF-IDF Matrix:
 [[0.         0.53256952 0.65782931 0.         0.53256952]
 [0.         0.53256952 0.65782931 0.         0.53256952]
 [0.61366674 0.49681612 0.         0.61366674 0.        ]
 [0.61366674 0.         0.         0.61366674 0.49681612]]


In [None]:
# print the IDF and feature names
print("IDF values:", tfidf_vectorizer.idf_)
print("Feature names:", tfidf_vectorizer.get_feature_names_out())

IDF values: [1.51082562 1.22314355 1.51082562 1.51082562 1.22314355]
Feature names: ['comment' 'people' 'watch' 'write' 'youtube']


In [18]:
# Now create the TF-IDF matrix for the same text data
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Print the shape of the TF-IDF matrix
print("Shape of the TF-IDF matrix:", tfidf_matrix.shape)



Shape of the TF-IDF matrix: (4, 5)


In [7]:
# 4. Word2Vec - Word2Vec is a technique to convert words into vectors of fixed size, capturing semantic meaning.

import gensim
import os
import pandas as pd
import numpy as np

In [3]:
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk # Import nltk


path = 'datasets/gameofthronesbooks/'
story = []
for filename in os.listdir(path):
    # Ensure processing files, not directories
    if os.path.isfile(os.path.join(path, filename)):
        try:
            # Use 'with' statement for proper file handling
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
                corpus = f.read()
                raw_sent = sent_tokenize(corpus)
                for sent in raw_sent:
                    story.append(simple_preprocess(sent))
        except UnicodeDecodeError:
            print(f"Skipping {filename} due to UnicodeDecodeError. Try different encoding if needed.")
            continue
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print(f"Successfully processed {len(story)} sentences.")

Skipping 004ssb.txt due to UnicodeDecodeError. Try different encoding if needed.
Skipping 005ssb.txt due to UnicodeDecodeError. Try different encoding if needed.
Successfully processed 94740 sentences.


In [4]:
len(story) # length of total sentences

94740

In [5]:
story # tokenization of each sentence

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [8]:
# model the story
# I kept the window as 10 for slide and minimum words in the sentence should be two
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
)

In [9]:
# Build the vocabulary from the tokenized sentences
model.build_vocab(story)

In [10]:
# train the Word2Vec model
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(3826815, 5045030)

In [11]:
# predict the vector for a word
model.wv.most_similar('jon', topn=10)

[('sam', 0.8194167613983154),
 ('bran', 0.7877230644226074),
 ('theon', 0.7440493702888489),
 ('tyrion', 0.7268936634063721),
 ('davos', 0.7126198410987854),
 ('he', 0.708119809627533),
 ('ygritte', 0.7041485905647278),
 ('ned', 0.6543896794319153),
 ('chett', 0.6432512998580933),
 ('qhorin', 0.6404738426208496)]

In [12]:
model.wv.doesnt_match(['jon', 'snow', 'tyrion', 'lannister', 'stark'])

'lannister'

In [13]:
# now i want my all words vectors
model.wv.get_normed_vectors()


array([[-0.09193305,  0.08078799, -0.09936025, ..., -0.08791839,
        -0.09165169,  0.08794969],
       [-0.14372875,  0.24407156,  0.1717603 , ..., -0.04477134,
        -0.00274225, -0.13736151],
       [-0.04041661, -0.02263945, -0.06969877, ..., -0.01212117,
         0.16388576, -0.13783574],
       ...,
       [-0.14129306,  0.02563207, -0.03849892, ...,  0.01760021,
        -0.05107817,  0.08165814],
       [-0.09054715,  0.11219621,  0.0053741 , ...,  0.01451441,
        -0.04676858, -0.11470304],
       [ 0.12161145,  0.05343146,  0.02303997, ..., -0.11276472,
        -0.06477016, -0.21190847]], dtype=float32)

In [14]:
# Get the words
y = model.wv.index_to_key
print("Words in the vocabulary:", y)



In [15]:
# now i want to display all words in 3 dimension and not in the 100 dimension
# for this the process PCA works
from sklearn.decomposition import PCA

# Create a PCA model to reduce the dimensions to 3
pca = PCA(n_components=3)

# Fit the PCA model to the word vectors
X = pca.fit_transform(model.wv.get_normed_vectors())

In [16]:
X.shape  # Check the shape of the PCA result - it should be (number of words, 3)

(13774, 3)

In [17]:
# now plot the PCA result
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()