In [None]:
#ASCII Encoding
word = "Hello"
for char in word:
  print(f"The ASCII value of '{char}' is {ord(char)}")

The ASCII value of 'H' is 72
The ASCII value of 'e' is 101
The ASCII value of 'l' is 108
The ASCII value of 'l' is 108
The ASCII value of 'o' is 111


In [None]:
# Label encoding: map each distinct label to an integer id.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
words = ['Apple','Ball','Cat','Ball','Cat']
# fit_transform learns the distinct labels and returns one integer per input item
# (the printed result for this list is [0 1 2 1 2], i.e. Apple=0, Ball=1, Cat=2).
encoded_word = le.fit_transform(words)
print("\nLabel Encoded Word:", encoded_word)


Label Encoded Word: [0 1 2 1 2]


In [None]:
import pandas as pd
# Single-column frame of colour names, used as input for the encoders in the following cells.
df = pd.DataFrame({'Colors':['Red','Green','Yellow','Green','Yellow','Black']})
df

Unnamed: 0,Colors
0,Red
1,Green
2,Yellow
3,Green
4,Yellow
5,Black


In [None]:
# LabelEncoder expects a 1-D sequence of labels, so pass the 'Colors' column (a Series)
# instead of the whole single-column DataFrame.
# NOTE: this refits the shared `le` encoder, replacing the classes it learned from `words`.
print(le.fit_transform(df['Colors']))

[2 1 3 1 3 0]


In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
# One-hot encoding: one binary column per distinct value in 'Colors'.
# The encoded result is converted with .toarray() so it prints as a dense 0/1 matrix.
# In the printed result, columns 0-3 correspond to Black, Green, Red, Yellow.
print(ohe.fit_transform(df).toarray())

[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]


In [None]:
# One-hot encoding whole sentences: every distinct sentence becomes its own category, so the
# six sentences produce six columns and sentences that share words still share no features.
# NOTE: this refits the `ohe` encoder created earlier.
df1 = pd.DataFrame({'Sentences':['Cat ate Dog','Dog ate Cat','Apple is red','Apple is red in color','Apples are good','Apple is tasty']})
print(ohe.fit_transform(df1).toarray())

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [None]:
!pip install nltk



In [None]:
# For one-hot encoding to work on sentences, each sentence must first be broken into words (tokens).
# A corpus (vocabulary) of all words is then built, and each sentence's vector is created
# based on which of those words it contains.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # fallback in case downloading 'punkt' alone isn't enough
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
dataset = "Hello Everyone. Welcome to this course. We are Studying NLP! We are good."
# Split the text into sentences; the resulting list is displayed as the cell output.
sent_tokenize(dataset,language = 'english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['Hello Everyone.',
 'Welcome to this course.',
 'We are Studying NLP!',
 'We are good.']

In [None]:
# preserve_line=True skips the sentence-splitting step and tokenizes the whole text as one line,
# which is why only the final full stop is split off while 'Everyone.' and 'course.' keep theirs.
word_tokenize(dataset,language='english',preserve_line=True)

['Hello',
 'Everyone.',
 'Welcome',
 'to',
 'this',
 'course.',
 'We',
 'are',
 'Studying',
 'NLP',
 '!',
 'We',
 'are',
 'good',
 '.']

#### For encoding or any further processing to work efficiently, the data needs to be cleaned/preprocessed.

Preprocessing involves:
1. Removing tense, suffixes, prefixes and degree from each word to reduce it to its root word (stemming/lemmatization)
2. Converting all words to the same case so that the text is uniform
3. Removing punctuation marks
4. Removing stop words
5. Any other cleaning using regex patterns. Regular expressions (regex) are available in Python through the built-in `re` module, which lets you search, find, substitute and split a sentence/document based on a pattern.

In [None]:
# Stemming: reduce a word to a base form by stripping suffixes and prefixes.
# The resulting stems are not guaranteed to be real words.
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

words_to_stem = ['running', 'runner', 'ran', 'runs', 'easily', 'fairly']

# Heading -> stemmer, in display order. Every heading after the first starts with a
# newline so that the sections are separated by a blank line in the output.
stemmers_by_heading = {
    "Porter Stemmer:": porter,
    "\nLancaster Stemmer:": lancaster,
    "\nSnowball Stemmer (English):": snowball,
}

for heading, stemmer in stemmers_by_heading.items():
    print(heading)
    for word in words_to_stem:
        print(f"{word} -> {stemmer.stem(word)}")



Porter Stemmer:
running -> run
runner -> runner
ran -> ran
runs -> run
easily -> easili
fairly -> fairli

Lancaster Stemmer:
running -> run
runner -> run
ran -> ran
runs -> run
easily -> easy
fairly -> fair

Snowball Stemmer (English):
running -> run
runner -> runner
ran -> ran
runs -> run
easily -> easili
fairly -> fair


In [None]:
# Lemmatization: reduce words to their dictionary root forms (lemmas).
# Unlike stems, lemmas are real English words.
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words_to_lemmatize = ['running', 'runner', 'ran', 'runs', 'easily', 'fairly', 'better', 'geese']

print("\nWordNet Lemmatizer:")

# WordNet lemmatization is more accurate when given a part-of-speech tag.
# Each entry pairs a heading with the keyword arguments passed to lemmatize();
# the empty dict exercises the lemmatizer's default, which treats the word as a noun.
lemmatizer_runs = [
    ("Default (Noun) Lemmatization:", {}),
    ("\nVerb Lemmatization:", {'pos': 'v'}),
    ("\nAdjective Lemmatization:", {'pos': 'a'}),
]

for heading, pos_kwargs in lemmatizer_runs:
    print(heading)
    for word in words_to_lemmatize:
        print(f"{word} -> {lemmatizer.lemmatize(word, **pos_kwargs)}")


WordNet Lemmatizer:
Default (Noun) Lemmatization:
running -> running
runner -> runner
ran -> ran
runs -> run
easily -> easily
fairly -> fairly
better -> better
geese -> goose

Verb Lemmatization:
running -> run
runner -> runner
ran -> run
runs -> run
easily -> easily
fairly -> fairly
better -> better
geese -> geese

Adjective Lemmatization:
running -> running
runner -> runner
ran -> ran
runs -> runs
easily -> easily
fairly -> fairly
better -> good
geese -> geese


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
sentence = "The quick brown foxes are jumping over the lazy dogs."
tokens = word_tokenize(sentence)

print(f"Original sentence: {sentence}")
print("Lemmatized tokens (guessing POS):")
for token in tokens:
    # Crude, non-robust POS guess based only on suffix and length:
    #   '...ing'                              -> verb
    #   longer than 3 letters, alphabetic,
    #   and not ending in 's'                 -> adjective
    #   everything else (incl. plurals)       -> noun
    pos = 'n'
    if token.endswith('ing'):
        pos = 'v'
    elif not token.endswith('s') and len(token) > 3 and token.isalpha():
        pos = 'a'

    print(f"{token} -> {lemmatizer.lemmatize(token, pos=pos)}")

# For comparison, lemmatize every token with one fixed POS at a time.
fixed_pos_runs = (
    ("\nLemmatized tokens (Verb POS):", 'v'),
    ("\nLemmatized tokens (Adjective POS):", 'a'),
)
for heading, fixed_pos in fixed_pos_runs:
    print(heading)
    for token in tokens:
        print(f"{token} -> {lemmatizer.lemmatize(token, pos=fixed_pos)}")

Original sentence: The quick brown foxes are jumping over the lazy dogs.
Lemmatized tokens (guessing POS):
The -> The
quick -> quick
brown -> brown
foxes -> fox
are -> are
jumping -> jump
over -> over
the -> the
lazy -> lazy
dogs -> dog
. -> .

Lemmatized tokens (Verb POS):
The -> The
quick -> quick
brown -> brown
foxes -> fox
are -> be
jumping -> jump
over -> over
the -> the
lazy -> lazy
dogs -> dog
. -> .

Lemmatized tokens (Adjective POS):
The -> The
quick -> quick
brown -> brown
foxes -> foxes
are -> are
jumping -> jumping
over -> over
the -> the
lazy -> lazy
dogs -> dogs
. -> .


In [None]:
# POS (part-of-speech) tagging: download the tagger resources used by nltk's pos_tag.
# Both resource names are requested; the value of the last call is shown as the cell output.
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
# Tag every token of the sentence tokenized earlier; the result is a list of (token, tag) pairs.
post_tagging = pos_tag(tokens)
post_tagging

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('foxes', 'NNS'),
 ('are', 'VBP'),
 ('jumping', 'VBG'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dogs', 'NNS'),
 ('.', '.')]

In [None]:
# Alternative resource name kept for reference (presumably for other NLTK versions — verify):
#nltk.download('tagsets')
nltk.download('tagsets_json')
# Print the description and examples of every tag in the Penn Treebank tag set.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!


In [None]:
# The tags produced by nltk's pos_tag are finer-grained than the POS codes the WordNet
# lemmatizer accepts, so they are collapsed to WordNet's single-letter codes here.
def nltk_tag_to_wordnet_pos(nltk_tag):
  """Map an NLTK POS tag onto a WordNet POS code.

  Only the first letter of the tag is inspected:
  'J...' -> 'a' (adjective), 'V...' -> 'v' (verb),
  'N...' -> 'n' (noun), 'R...' -> 'r' (adverb).

  Returns None when the tag starts with any other character (or is empty).
  """
  wordnet_code_by_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
  return wordnet_code_by_initial.get(nltk_tag[:1])

lemmatizer = WordNetLemmatizer()
tagged_tokens = pos_tag(tokens)

print("\nLemmatized tokens using POS tags:")
lemmatized_tokens = []
for word, tag in tagged_tokens:
    wntag = nltk_tag_to_wordnet_pos(tag)

    # Pass `pos` only when the tag maps to a WordNet code; otherwise call lemmatize()
    # without it so the lemmatizer falls back to its default (noun) behaviour.
    pos_kwargs = {} if wntag is None else {'pos': wntag}
    lemmatized_token = lemmatizer.lemmatize(word, **pos_kwargs)

    print(f"{word}/{tag} -> {lemmatized_token}")
    lemmatized_tokens.append(lemmatized_token)

print("\nLemmatized Sentence:", " ".join(lemmatized_tokens))


Lemmatized tokens using POS tags:
The/DT -> The
quick/JJ -> quick
brown/NN -> brown
foxes/NNS -> fox
are/VBP -> be
jumping/VBG -> jump
over/IN -> over
the/DT -> the
lazy/JJ -> lazy
dogs/NNS -> dog
./. -> .

Lemmatized Sentence: The quick brown fox be jump over the lazy dog .


In [None]:
#StopWords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Calling stopwords.words() with no argument presumably returns the words of every
# available language — verify; here only the English list is requested.
stop_words = stopwords.words('english')
# The list is displayed in full as the cell output; every entry is lowercase.
stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
# Cleaning data by removing stop words can be done with a simple membership test.
# The NLTK English stop-word list is all lowercase, so each token is lower-cased for the
# lookup; otherwise capitalised stop words such as "The" would be kept.
for i in tokens:
  if i.lower() not in stop_words:
    print(i)

The
quick
brown
foxes
jumping
lazy
dogs
.


In [None]:
import string
# string.punctuation is a single str holding the ASCII punctuation characters.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Cleaning data by removing both stop words and punctuation tokens.
for i in tokens:
  # The stop-word list is lowercase, so compare the lower-cased token.
  is_stop_word = i.lower() in stop_words
  # `i in string.punctuation` would be a substring test against one long string, which
  # misses multi-character punctuation tokens such as '...' or '``'. Checking every
  # character treats any token made up solely of punctuation as punctuation.
  is_punctuation = all(ch in string.punctuation for ch in i)
  if not is_stop_word and not is_punctuation:
    print(i)

The
quick
brown
foxes
jumping
lazy
dogs


In [None]:
#Regex
# `re` is part of the Python standard library, so there is nothing to pip install.
import re
sen = 'Machine learning is a growing technology which enables machines to learn automatically from past data.'

# search: first match only (case-sensitive).
first_match = re.search('Machine', sen)
# findall: every match, first case-sensitive, then ignoring case.
exact_case_hits = re.findall('machine', sen)
any_case_hits = re.findall('machine', sen, re.IGNORECASE)
# For split() and sub() the flags MUST be passed by keyword: the next positional
# parameter is maxsplit / count, so a positional re.IGNORECASE would be read as a
# number and the match would silently stay case-sensitive.
split_parts = re.split('machine', sen, flags=re.IGNORECASE)
substituted = re.sub('machine', 'AI', sen, flags=re.IGNORECASE)

first_match, exact_case_hits, any_case_hits, split_parts, substituted

(<re.Match object; span=(0, 7), match='Machine'>,
 ['machine'],
 ['Machine', 'machine'])

In [None]:
# Real-world patterns are rarely as simple as a literal word; they can get complicated quickly.
sen = 'She is 20. She learnt driving when she was 18. I only taught her.'
# \b anchors the match at word boundaries, so only whole, purely alphabetic words are found.
pattern = r'\b[A-Za-z]+\b'
# Exercise: try the patterns ., [a-z], [a-z]*, [a-z]+, [a-zA-Z0-9]+ and [a-zA-Z0-9]*,
# each time with and without \b at the start and end of the pattern.
re.compile(pattern).findall(sen)

['She',
 'is',
 'She',
 'learnt',
 'driving',
 'when',
 'she',
 'was',
 'I',
 'only',
 'taught',
 'her']

In [None]:
sen = 'ma Ma mac machine bmac cmacd macd m'
# {3} requires exactly three lowercase letters between the two word boundaries.
pattern = r'\b[a-z]{3}\b'
# Exercise: try r'\b[a-z]{3,5}\b', r'\bm[a-z]+\b', r'\bm[a-z]*\b' and r'\bm[a-z]?\b'.

re.compile(pattern).findall(sen)

['mac']

In [None]:
sen = 'I have list of email ids here. url@connect.com, apple@berry.com, web@browser.com. @Navin, @Pravin, please trace the mails.'

# Find the people who are tagged in this message.
# (?<!\w) requires that '@' is NOT preceded by a word character, which excludes the '@'
# inside email ids and, unlike a literal leading space, also matches a tag at the very
# start of the message.
tagged_people = re.findall(r'(?<!\w)@([A-Za-z]+)', sen)
print(tagged_people)

# Find the email ids listed in the message
email_ids = re.findall(r'[a-zA-Z_0-9]+@[a-z]+\.com', sen)
print(email_ids)

# Check if the message ends with 'mails.'
# The dot is escaped so it matches a literal '.' rather than any character.
ends_with_mails = re.findall(r'mails\.$', sen)
print(ends_with_mails)

# Check if the message starts with the word I or We.
# ^ anchors at the beginning, | means 'or', and \b stops e.g. "It ..." from matching as "I".
starts_with_i_or_we = re.findall(r'^(I|We)\b', sen)
print(starts_with_i_or_we)

In [None]:
a = 'https://docs.google.com/spreadsheets/d/1PcuXPW88fBeEUGwgAmdfE5LJn_i7AHK32IRMJc1hDVKiM/edit?usp=sharing'
# Extract the spreadsheet id: the path segment that follows '/d/'.
# The id may contain '-' in addition to letters, digits and '_', and anchoring on the
# full '/d/' segment avoids matching a stray 'd/' elsewhere in the URL.
sheet_id_re = re.compile(r'/d/([a-zA-Z0-9_-]+)')
sheet_id_re.findall(a)

['1PcuXPW88fBeEUGwgAmdfE5LJn_i7AHK32IRMJc1hDVKiM']

In [None]:
repo = 'https://github.com/Haritha-Vedam/nginx_repo.git'
# Owner: a path segment enclosed by slashes. Digits are allowed in addition to letters
# and '-'; the host is skipped because it contains a '.'.
username_re = re.compile(r'/([a-zA-Z0-9-]+)/')
# Repository: the final path segment up to the '.git' suffix. Repository names may also
# contain digits, '-' and '.'.
reponame_re = re.compile(r'/([a-zA-Z0-9_.-]+)\.git')
username = username_re.findall(repo)
reponame = reponame_re.findall(repo)
username,reponame

(['Haritha-Vedam'], ['nginx_repo'])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents (sentences)
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog is lazy.",
    "The fox is quick and brown.",
    "A quick brown dog.",
    "The quick fox is quick."
]

# Initialize CountVectorizer
# CountVectorizer converts a collection of text documents to a matrix of token counts.
# It tokenizes the text and builds a vocabulary of known words, then encodes each document
# as a vector where each element is the count of one vocabulary word.
# Useful optional parameters:
#   - stop_words='english': Remove common English stop words like 'the', 'is', 'a', etc.
#   - lowercase=True: Convert all text to lowercase before tokenizing. (This is the default)
# This first vectorizer uses the defaults only, so no stop words are removed here.
# Note that the single-letter word 'a' does not appear in the printed vocabulary below.
vectorizer = CountVectorizer()

# Fit and transform the documents
# The fit step learns the vocabulary from the documents; the transform step then converts
# the documents into a matrix of token counts.
# The output is a sparse matrix, which can be converted to a dense array using .toarray()
X = vectorizer.fit_transform(documents)

# Get the vocabulary learned by the vectorizer
# This is a dictionary where keys are the words and values are their column indices.
vocabulary = vectorizer.vocabulary_

# Get the feature names (words in the vocabulary), ordered by column index
feature_names = vectorizer.get_feature_names_out()

print("Vocabulary:")
print(vocabulary)
print("\nFeature Names (Words in Vocabulary):")
print(feature_names)

print("\nDocument-Term Matrix (Sparse Matrix):")
print(X)

print("\nDocument-Term Matrix (Dense Array):")
print(X.toarray())

# Details of the matrix:
# - Each row represents a document.
# - Each column represents a word in the vocabulary (corresponding to feature_names).
# - The value in cell (i, j) is the count of word j in document i.

print("\nRerunning with stop_words='english'")
vectorizer_with_stopwords = CountVectorizer(stop_words='english')
X1 = vectorizer_with_stopwords.fit_transform(documents)
feature_names_with_stopwords = vectorizer_with_stopwords.get_feature_names_out()

print("\nFeature Names (with stop_words='english'):")
print(feature_names_with_stopwords) # e.g., ['brown' 'dog' 'fox' 'jumps' 'lazy' 'quick']

print("\nDocument-Term Matrix (with stop_words='english'):")
print(X1.toarray())

# Key points:
# - Converts text data into numerical feature vectors.
# - Creates a vocabulary of unique words from the corpus.
# - Each document's vector contains the frequency count of each word in the vocabulary.
# - Can remove stop words and normalize capitalization.
# - The output is a sparse matrix to save memory, especially for large vocabularies and many documents.


Vocabulary:
{'the': 9, 'quick': 8, 'brown': 1, 'fox': 3, 'jumps': 5, 'over': 7, 'lazy': 6, 'dog': 2, 'is': 4, 'and': 0}

Feature Names (Words in Vocabulary):
['and' 'brown' 'dog' 'fox' 'is' 'jumps' 'lazy' 'over' 'quick' 'the']

Document-Term Matrix (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 25 stored elements and shape (5, 10)>
  Coords	Values
  (0, 9)	2
  (0, 8)	1
  (0, 1)	1
  (0, 3)	1
  (0, 5)	1
  (0, 7)	1
  (0, 6)	1
  (0, 2)	1
  (1, 9)	1
  (1, 6)	1
  (1, 2)	1
  (1, 4)	1
  (2, 9)	1
  (2, 8)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 0)	1
  (3, 8)	1
  (3, 1)	1
  (3, 2)	1
  (4, 9)	1
  (4, 8)	2
  (4, 3)	1
  (4, 4)	1

Document-Term Matrix (Dense Array):
[[0 1 1 1 0 1 1 1 1 2]
 [0 0 1 0 1 0 1 0 0 1]
 [1 1 0 1 1 0 0 0 1 1]
 [0 1 1 0 0 0 0 0 1 0]
 [0 0 0 1 1 0 0 0 2 1]]

Rerunning with stop_words='english'

Feature Names (with stop_words='english'):
['brown' 'dog' 'fox' 'jumps' 'lazy' 'quick']

Document-Term Matrix (with stop_words='english'):
[[1 1 1 1 1 1]


In [None]:
import pandas as pd
# Label the count-matrix columns with the vocabulary terms so each row (document) is easy to read.
pd.DataFrame(X1.toarray(), columns= feature_names_with_stopwords)

Unnamed: 0,brown,dog,fox,jumps,lazy,quick
0,1,1,1,1,1,1
1,0,1,0,0,1,0
2,1,0,1,0,0,1
3,1,1,0,0,0,1
4,0,0,1,0,0,2


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer calculates the Term Frequency-Inverse Document Frequency for each word in each document.
# TF-IDF is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.
# It is often used as a weighting factor in information retrieval and text mining.
# Formula: TF-IDF(t, d, D) = TF(t, d) * IDF(t, D)
# TF(t, d): Term Frequency - the number of times term t appears in document d. Here the raw count is used;
#           length effects are handled by normalizing each document vector at the end.
# IDF(t, D): Inverse Document Frequency. The textbook form is log_e(N / df(t)), where N is the total number of
#            documents and df(t) is the number of documents containing t. With the default smooth_idf=True,
#            scikit-learn computes log_e((1 + N) / (1 + df(t))) + 1 instead, which is what the IDF values
#            printed below follow (e.g. 'jumps': N = 5, df = 1 -> ln(6/2) + 1 = 2.0986).
# IDF measures how common or rare a word is across all documents. Words that appear in many documents have a lower IDF.
# Parameters are similar to CountVectorizer, with added options for weighting schemes.
#   - stop_words='english': Remove common English stop words.
#   - lowercase=True: Convert all text to lowercase.
#   - use_idf=True: Enable inverse-document-frequency reweighting. (This is the default)
#   - smooth_idf=True: Add 1 to both the document count and every document frequency, as if one extra document
#     contained every term exactly once; this prevents division by zero. (This is the default)
#   - sublinear_tf=False: Apply sublinear term frequency scaling, i.e., replace tf with 1 + log(tf). (Default is False)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the documents
# The fit step learns the vocabulary and IDF values from the documents.
# The transform step converts the documents into a matrix of TF-IDF scores.
Y = tfidf_vectorizer.fit_transform(documents)

# Get the vocabulary learned by the vectorizer
tfidf_vocabulary = tfidf_vectorizer.vocabulary_

# Get the feature names (words in the vocabulary)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the IDF values for each term in the vocabulary
idf_values = tfidf_vectorizer.idf_

print("\nTF-IDF Vocabulary:")
print(tfidf_vocabulary)
print("\nTF-IDF Feature Names (Words in Vocabulary):")
print(tfidf_feature_names)
print("\nIDF values for each term:")
# Pair feature names with their IDF values for better understanding
for word, idf in zip(tfidf_feature_names, idf_values):
    print(f"{word}: {idf:.4f}")


print("\nDocument-TF-IDF Matrix (Sparse Matrix):")
print(Y)

print("\nDocument-TF-IDF Matrix (Dense Array):")
print(Y.toarray())

# Details of the matrix:
# - Each row represents a document.
# - Each column represents a word in the vocabulary (corresponding to tfidf_feature_names).
# - The value in cell (i, j) is the TF-IDF score of word j in document i.
# - Higher TF-IDF scores indicate that the word is more important to that specific document, considering its
#   frequency within the document and its rarity across the entire corpus.

# Convert to a DataFrame for better readability
tfidf_df = pd.DataFrame(Y.toarray(), columns=tfidf_feature_names)
print("\nDocument-TF-IDF DataFrame:")
# Last expression of the cell: the DataFrame is rendered as the cell output.
tfidf_df

# In summary, TF-IDF vectorization:
# 1. Counts word occurrences within each document (Term Frequency - TF).
# 2. Weighs down words that appear frequently across the entire set of documents (Inverse Document Frequency - IDF).
# 3. Multiplies TF and IDF to get a score for each word in each document.
# 4. Normalizes the resulting vectors (L2 norm here, so each non-empty row has unit length).
# It produces a numerical representation of text documents that emphasizes words that are important to a
# specific document but not overly common across the corpus.




TF-IDF Vocabulary:
{'quick': 5, 'brown': 0, 'fox': 2, 'jumps': 3, 'lazy': 4, 'dog': 1}

TF-IDF Feature Names (Words in Vocabulary):
['brown' 'dog' 'fox' 'jumps' 'lazy' 'quick']

IDF values for each term:
brown: 1.4055
dog: 1.4055
fox: 1.4055
jumps: 2.0986
lazy: 1.6931
quick: 1.1823

Document-TF-IDF Matrix (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (5, 6)>
  Coords	Values
  (0, 5)	0.30948279390742617
  (0, 0)	0.3678925296568199
  (0, 2)	0.3678925296568199
  (0, 3)	0.5493297408757738
  (0, 4)	0.4431957760772619
  (0, 1)	0.3678925296568199
  (1, 4)	0.7694470729725092
  (1, 1)	0.6387105775654869
  (2, 5)	0.5112315343529393
  (2, 0)	0.6077179931033552
  (2, 2)	0.6077179931033552
  (3, 5)	0.5112315343529393
  (3, 0)	0.6077179931033552
  (3, 1)	0.6077179931033552
  (4, 5)	0.8596219362398577
  (4, 2)	0.5109306476766079

Document-TF-IDF Matrix (Dense Array):
[[0.36789253 0.36789253 0.36789253 0.54932974 0.44319578 0.30948279]
 [0

Unnamed: 0,brown,dog,fox,jumps,lazy,quick
0,0.367893,0.367893,0.367893,0.54933,0.443196,0.309483
1,0.0,0.638711,0.0,0.0,0.769447,0.0
2,0.607718,0.0,0.607718,0.0,0.0,0.511232
3,0.607718,0.607718,0.0,0.0,0.0,0.511232
4,0.0,0.0,0.510931,0.0,0.0,0.859622


In [None]:
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
# Keep the raw sentences under their own name so that `corpus` only ever refers to the
# TextCollection (it was previously bound to this list first and then rebound).
raw_sentences = ["I am thinking to buy a new phone","You are very good","I like you","This world is beautiful"]
# One token list per sentence; TextCollection treats each list as a separate text.
sents = [word_tokenize(raw_sentence) for raw_sentence in raw_sentences]
corpus = TextCollection(sents)
print(corpus)

<Text: I am thinking to buy a new phone...>


In [None]:
# Term frequency of 'thinking' in the first tokenized sentence: 1 occurrence out of 8 tokens = 0.125.
corpus.tf('thinking',sents[0])

0.125

In [None]:
# idf values printed here equal ln(number of sentences / number of sentences containing the term).
print(corpus.idf("I"))  # "I" occurs in 2 of the 4 sentences -> ln(4/2)
print(corpus.idf("phone"))  # "phone" occurs in 1 of the 4 sentences -> ln(4/1)
print(corpus.tf("I",sents[0]))  # 1 of the 8 tokens in the first sentence
print(corpus.tf("You",sents[1]))  # 1 of the 4 tokens in the second sentence
print(corpus.tf_idf("eat",sents[0]))  # "eat" never occurs in the sentence, so the score is 0.0

0.6931471805599453
1.3862943611198906
0.125
0.25
0.0


In [None]:
# N-grams are contiguous sequences of N items (words or characters) taken from a text.
# Bigrams (N=2) capture pairs of consecutive words; trigrams (N=3) capture triples.
from nltk.util import ngrams

sentence_for_ngrams = "The quick brown fox jumps over the lazy dog."
# Lowercase first so that 'The' and 'the' yield the same token.
lowercased_sentence = sentence_for_ngrams.lower()
tokens_for_ngrams = word_tokenize(lowercased_sentence)

print(f"\nOriginal Sentence: {sentence_for_ngrams}")
print(f"Tokens: {tokens_for_ngrams}")

# ngrams() returns a lazy iterator; unpack it into a list so it can be printed and reused.
bigrams_list = [*ngrams(tokens_for_ngrams, 2)]
trigrams_list = [*ngrams(tokens_for_ngrams, 3)]

print("\nBigrams list: ", bigrams_list)
print("\nTrigrams:", trigrams_list, sep="\n")


Original Sentence: The quick brown fox jumps over the lazy dog.
Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']

Bigrams list:  [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog'), ('dog', '.')]

Trigrams:
[('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog'), ('lazy', 'dog', '.')]


In [None]:
# Generating N-grams with CountVectorizer's ngram_range parameter: the vectorizer can count n-grams directly.
print("\nGenerating N-grams using CountVectorizer:")

# Example documents
ngram_documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first, second or third document?"
]
print("\nCountVectorizer for Bigrams (ngram_range=(2, 2)):")
# Alternatives to try: the built-in English stop-word list, or no stop-word removal at all.
#bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
#bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words=None)
custom_stop_words = ['the', 'is', 'this', 'and', 'or']
# Stop words are dropped before the bigrams are formed, so a bigram can join words that were
# not adjacent in the original text (e.g. 'document second' from "document is the second").
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words=custom_stop_words)
bigram_matrix = bigram_vectorizer.fit_transform(ngram_documents)
bigram_feature_names = bigram_vectorizer.get_feature_names_out()

print("Bigram Feature Names:")
print(bigram_feature_names)
print("\nBigram Document-Term Matrix:")
print(bigram_matrix.toarray())
# Same matrix with the bigrams as column labels.
print(pd.DataFrame(bigram_matrix.toarray(), columns=bigram_feature_names))

# Using CountVectorizer for Unigrams and Bigrams (ngram_range=(1, 2))
print("\n\nCountVectorizer for Unigrams and Bigrams (ngram_range=(1, 2)):")
unigram_bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english',min_df=1)
unigram_bigram_matrix = unigram_bigram_vectorizer.fit_transform(ngram_documents)
unigram_bigram_feature_names = unigram_bigram_vectorizer.get_feature_names_out()

print("Unigram and Bigram Feature Names:")
print(unigram_bigram_feature_names)
print("\nUnigram and Bigram Document-Term Matrix:")
print(unigram_bigram_matrix.toarray())
print(pd.DataFrame(unigram_bigram_matrix.toarray(), columns=unigram_bigram_feature_names))

# Key points about N-grams:
# - They capture local word order and context that single words (unigrams) miss.
# - Bigrams (N=2) are the most common type of n-gram used in text processing.
# - They can be generated with NLTK's `ngrams` function or built directly into vectorization via
#   `ngram_range` in scikit-learn's CountVectorizer/TfidfVectorizer.
# - The feature space grows significantly as N increases.



Generating N-grams using CountVectorizer:

CountVectorizer for Bigrams (ngram_range=(2, 2)):
Bigram Feature Names:
['document second' 'first document' 'first second' 'second document'
 'second third' 'third document' 'third one']

Bigram Document-Term Matrix:
[[0 1 0 0 0 0 0]
 [1 0 0 1 0 0 0]
 [0 0 0 0 0 0 1]
 [0 0 1 0 1 1 0]]
   document second  first document  first second  second document  \
0                0               1             0                0   
1                1               0             0                1   
2                0               0             0                0   
3                0               0             1                0   

   second third  third document  third one  
0             0               0          0  
1             0               0          0  
2             0               0          1  
3             1               1          0  


CountVectorizer for Unigrams and Bigrams (ngram_range=(1, 2)):
Unigram and Bigram Feature Names:
