#    Aim:  Text preprocessing level two operations-Feature engineering of textual data


In [1]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

In [2]:
def vocab_adder(vocab, new_tokens):
    """
    Add new tokens to the vocabulary and sort it.

    Args:
        vocab (list): The existing vocabulary.
        new_tokens (list): New tokens to add to the vocabulary.

    Returns:
        list: Updated and sorted vocabulary.
    """
    if len(vocab) == 0:
        #print("IN IF")
        vocab = new_tokens
        vocab.sort()
    else:
        #print("IN ELSE")
        temp = [wrd for wrd in new_tokens if wrd not in vocab]
        #print(temp)
        vocab.extend(temp)
        vocab.sort()
        #print(vocab)
        
    return(vocab)

In [3]:
def preprocess(string, vocab = None):
    """
    Preprocess a string by removing stopwords and optionally updating vocabulary.

    Args:
        string (str): Input text string to preprocess.
        vocab (list, optional): Existing vocabulary to update.

    Returns:
        list or None: Processed tokens if no vocabulary provided, else updated vocabulary.
    """
    if vocab is not None:
        print(string, vocab)
        new_tokens = remove_stopword(string)
        #print(f"Length Of Tokens After Stop Word Removal: {len(new_tokens)}")
        #print(type(vocab))
        vocab = vocab_adder(vocab, new_tokens)
        return(vocab)
    else:
        new_tokens = remove_stopword(string)
        return new_tokens

In [4]:
def freq_idf(word, doc):
    """
    Calculate the frequency of a word in a document.

    Args:
        word (str): The word to calculate frequency for.
        doc (list): The list of documents.

    Returns:
        int: Frequency of the word in the document.
    """
    cnt = 0
    for i in doc:
        for wrd in i.split(" "):
            if wrd == word:
                cnt += 1
    return cnt

In [5]:
def remove_stopword(string):
    """
    Remove stopwords from a string.

    Args:
        string (str): Input text string.

    Returns:
        list: List of tokens after removing stopwords.
    """
    string_without_punctuation = remove_punctuation(string)
    #print("String Without Punctuation: ", string_without_punctuation)
    stp = stopwords.words("english")
    new_tokens = [wrd for wrd in string_without_punctuation if wrd not in stp]
    new_tokens.sort()
    #print(new_tokens)
    return new_tokens

In [6]:
def freq(word, corpus):
    """
    Calculate the frequency of a word in a corpus.

    Args:
        word (str): The word to calculate frequency for.
        corpus (list): The list of words.

    Returns:
        int: Frequency of the word in the corpus.
    """
    cnt = 0
    for wrd in corpus:
        if wrd == word:
            cnt =+ 1
    #print(word, corpus)
    return cnt

In [7]:
def remove_punctuation(string, vocab = None):
    """
    Remove punctuation from a string and update vocabulary if provided.

    Args:
        string (str): Input text string.
        vocab (list, optional): Existing vocabulary to update.

    Returns:
        list or None: List of tokens after removing punctuation if no vocabulary provided, else updated vocabulary.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    string_without_punctuation = tokenizer.tokenize(string)
    lst = [wrd.lower() for wrd in string_without_punctuation]
    lst.sort()
    
    if vocab is not None:
        vocab = vocab_adder(vocab, lst)
        return vocab
    
    else:
        return lst

In [8]:
def init_dic(vocab):
    """
    Initialize a dictionary with vocabulary words as keys and initial counts as values.

    Args:
        vocab (list): Vocabulary list.

    Returns:
        dict: Dictionary with vocabulary words as keys and initial counts as values.
    """
    dic = {}
    for i in vocab:
        dic[i] = 0
    return dic

In [9]:
def bow(rev1, vocab):
    """
    Create a Bag of Words representation for a given text using a specified vocabulary.

    Args:
        rev1 (str): Input text string.
        vocab (list): Vocabulary list.

    Returns:
        dict: Bag of Words representation as a dictionary with words as keys and counts as values.
    """
    cnt = init_dic(vocab)

    for wrd in remove_punctuation(rev1):
        if wrd in vocab:
            x = cnt[wrd]
            cnt[wrd] = x+1

        else:
            cnt[wrd] = 0
    
    return cnt

In [10]:
# Input sentences
sent1= "India, country that occupies the greater part of South Asia."
sent2= "Its capital is New Delhi."

In [11]:
# Remove stopwords from a string
remove_stopword(sent2)

['capital', 'delhi', 'new']

##  a. To implement label encoding 


### Algorithm


    -Create a vocabulary from the given corpus. 
    -Assign a number to each word in the vocabulary. 

In [13]:
# Initialize an empty vocabulary list
vocab = []
# Preprocess sent1 and update vocabulary
vocab = preprocess(sent1, vocab)
vocab

India, country that occupies the greater part of South Asia. []


['asia', 'country', 'greater', 'india', 'occupies', 'part', 'south']

In [14]:
# Preprocess sent2 and update vocabulary
vocab = preprocess(sent2, vocab)
vocab

Its capital is New Delhi. ['asia', 'country', 'greater', 'india', 'occupies', 'part', 'south']


['asia',
 'capital',
 'country',
 'delhi',
 'greater',
 'india',
 'new',
 'occupies',
 'part',
 'south']

In [15]:
print(vocab)

['asia', 'capital', 'country', 'delhi', 'greater', 'india', 'new', 'occupies', 'part', 'south']


In [16]:
# Create an index dictionary for words in the vocabulary
ind = {}
for i in range(len(vocab)):
    ind[vocab[i]] = i+1
ind

{'asia': 1,
 'capital': 2,
 'country': 3,
 'delhi': 4,
 'greater': 5,
 'india': 6,
 'new': 7,
 'occupies': 8,
 'part': 9,
 'south': 10}

In [17]:
# Tokenize and preprocess sent1
ls_1 = remove_stopword(sent1)

# Perform label encoding for sent1
le_1 = []
for i in ls_1:
    le_1.append(ind[i])

# Print the tokenized sentence and label encoded result for sent1
print(ls_1)
le_1

['asia', 'country', 'greater', 'india', 'occupies', 'part', 'south']


[1, 3, 5, 6, 8, 9, 10]

In [18]:
# Tokenize and preprocess sent2
ls_2 = remove_stopword(sent2)

# Perform label encoding for sent2
le_2 = []
for i in ls_2:
    le_2.append(ind[i])

# Print the tokenized sentence and label encoded result for sent2
print(ls_2)
le_2

['capital', 'delhi', 'new']


[2, 4, 7]

##  b. To implement one hot encoding


### Algorithm

    -Create a vocabulary from the given corpus. 
    -Assign binary vector to each word in the vocabulary.

In [19]:
vocab = []
vocab = preprocess(sent1,vocab)
vocab = preprocess(sent2,vocab)
vocab.sort()
vocab

India, country that occupies the greater part of South Asia. []
Its capital is New Delhi. ['asia', 'country', 'greater', 'india', 'occupies', 'part', 'south']


['asia',
 'capital',
 'country',
 'delhi',
 'greater',
 'india',
 'new',
 'occupies',
 'part',
 'south']

In [20]:
# Initialize a dictionary to store one-hot encoded representation for sent1
label_sent1 = {}
vocab_sent1=[]
vocab_sent1 = preprocess(sent1, vocab_sent1)
vocab_sent1.sort()
print(f"Vocab Of Sentence 1: {vocab_sent1}")

# Generate one-hot encoded representation for each word in vocab_sent1
for wrd in vocab_sent1:
    label_sent1[wrd] = [1 if x==wrd else 0 for x in vocab]

# Create a DataFrame to store one-hot encoded representation for sent1
df_sent1 = pd.DataFrame(columns = vocab, index=label_sent1.keys(), data=label_sent1.values()) 
df_sent1

India, country that occupies the greater part of South Asia. []
Vocab Of Sentence 1: ['asia', 'country', 'greater', 'india', 'occupies', 'part', 'south']


Unnamed: 0,asia,capital,country,delhi,greater,india,new,occupies,part,south
asia,1,0,0,0,0,0,0,0,0,0
country,0,0,1,0,0,0,0,0,0,0
greater,0,0,0,0,1,0,0,0,0,0
india,0,0,0,0,0,1,0,0,0,0
occupies,0,0,0,0,0,0,0,1,0,0
part,0,0,0,0,0,0,0,0,1,0
south,0,0,0,0,0,0,0,0,0,1


In [21]:
label_sent2 = {}
vocab_sent2=[]
vocab_sent2 = preprocess(sent2, vocab_sent2)
vocab_sent2.sort()
print(f"Vocab Of Sentence 2: {vocab_sent2}")

for wrd in vocab_sent2:
    label_sent2[wrd] = [1 if x==wrd else 0 for x in vocab]

df_sent2 = pd.DataFrame(columns = vocab, index=label_sent2.keys(), data=label_sent2.values()) 
df_sent2

Its capital is New Delhi. []
Vocab Of Sentence 2: ['capital', 'delhi', 'new']


Unnamed: 0,asia,capital,country,delhi,greater,india,new,occupies,part,south
capital,0,1,0,0,0,0,0,0,0,0
delhi,0,0,0,1,0,0,0,0,0,0
new,0,0,0,0,0,0,1,0,0,0


## c. To implement BoW


A bag-of-words model, or BoW for short, is a way of extracting features from text for use in modeling, such as with machine learning algorithms. A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

    1) A vocabulary of known words.
    2) A measure of the presence of known words.

It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document


A very common feature extraction procedures for sentences and documents is the bag-of-words approach (BOW). In this approach, we look at the histogram of the words within the text, i.e. considering each word count as a feature

In [22]:
rev1 = "Game of Thrones is an amazing tv series!"
rev2 = "Game of Thrones is the best tv series!"
rev3 = "Game of Thrones is so great"
rev4 = "game game of of thrones is is great"

In [23]:
rev1.split(" ")

['Game', 'of', 'Thrones', 'is', 'an', 'amazing', 'tv', 'series!']

In [24]:
bow_vocab = []
bow_vocab = remove_punctuation(rev1, bow_vocab)
bow_vocab = remove_punctuation(rev2, bow_vocab)
bow_vocab = remove_punctuation(rev3, bow_vocab)
bow_vocab

['amazing',
 'an',
 'best',
 'game',
 'great',
 'is',
 'of',
 'series',
 'so',
 'the',
 'thrones',
 'tv']

In [25]:
cnt1 = bow(rev1, bow_vocab)
print(rev1)
cnt1

Game of Thrones is an amazing tv series!


{'amazing': 1,
 'an': 1,
 'best': 0,
 'game': 1,
 'great': 0,
 'is': 1,
 'of': 1,
 'series': 1,
 'so': 0,
 'the': 0,
 'thrones': 1,
 'tv': 1}

In [26]:
cnt2 = bow(rev2, bow_vocab)
print(rev2)
cnt2

Game of Thrones is the best tv series!


{'amazing': 0,
 'an': 0,
 'best': 1,
 'game': 1,
 'great': 0,
 'is': 1,
 'of': 1,
 'series': 1,
 'so': 0,
 'the': 1,
 'thrones': 1,
 'tv': 1}

In [27]:
cnt3 = bow(rev3, bow_vocab)
print(rev3)
cnt3

Game of Thrones is so great


{'amazing': 0,
 'an': 0,
 'best': 0,
 'game': 1,
 'great': 1,
 'is': 1,
 'of': 1,
 'series': 0,
 'so': 1,
 'the': 0,
 'thrones': 1,
 'tv': 0}

In [28]:
cnt4 = bow(rev4, bow_vocab)
print(rev4)
cnt4

game game of of thrones is is great


{'amazing': 0,
 'an': 0,
 'best': 0,
 'game': 2,
 'great': 1,
 'is': 2,
 'of': 2,
 'series': 0,
 'so': 0,
 'the': 0,
 'thrones': 1,
 'tv': 0}

## d. To implement TF-IDF



TF-IDF stands for Term Frequency Inverse Document Frequency of records. It can be defined as the calculation of how relevant a word in a series or corpus is to a text. The meaning increases proportionally to the number of times in the text a word appears but is compensated by the word frequency in the corpus (data-set)
Idfgivesrelevace of  word in  the  corpus

In [29]:
# Sample documents
doc1 = "Inflation has increased unemployement."
doc2 = "The company has increased its sales"
doc3 = "Fear increased his pulse"

In [30]:
# Preprocess each document and create cleaned versions
doc1_clean = preprocess(doc1)
doc2_clean = preprocess(doc2)
doc3_clean = preprocess(doc3)

# Combine cleaned documents into a list
doc = [' '.join(doc1_clean), ' '.join(doc2_clean), ' '.join(doc3_clean)]
doc

['increased inflation unemployement',
 'company increased sales',
 'fear increased pulse']

In [31]:
# Initialize an empty vocabulary list for TF calculations
tf_vocab = []

# Preprocess each document and update vocabulary for TF calculations
tf_vocab = preprocess(doc1, tf_vocab)
tf_vocab = preprocess(doc2, tf_vocab)
tf_vocab = preprocess(doc3, tf_vocab)
tf_vocab

Inflation has increased unemployement. []
The company has increased its sales ['increased', 'inflation', 'unemployement']
Fear increased his pulse ['company', 'increased', 'inflation', 'sales', 'unemployement']


['company',
 'fear',
 'increased',
 'inflation',
 'pulse',
 'sales',
 'unemployement']

In [32]:
# Calculate TF scores for each word in the vocabulary
tf_score = {}
for i in tf_vocab:
    tf_score[i] = np.array([freq(i, doc1_clean), freq(i, doc2_clean), freq(i, doc3_clean)])/len(doc)
tf_score

{'company': array([0.        , 0.33333333, 0.        ]),
 'fear': array([0.        , 0.        , 0.33333333]),
 'increased': array([0.33333333, 0.33333333, 0.33333333]),
 'inflation': array([0.33333333, 0.        , 0.        ]),
 'pulse': array([0.        , 0.        , 0.33333333]),
 'sales': array([0.        , 0.33333333, 0.        ]),
 'unemployement': array([0.33333333, 0.        , 0.        ])}

In [33]:
# Create a DataFrame to display TF scores for each word in each document
pd.DataFrame(columns=doc, index=tf_score.keys(), data = tf_score.values())

Unnamed: 0,increased inflation unemployement,company increased sales,fear increased pulse
company,0.0,0.333333,0.0
fear,0.0,0.0,0.333333
increased,0.333333,0.333333,0.333333
inflation,0.333333,0.0,0.0
pulse,0.0,0.0,0.333333
sales,0.0,0.333333,0.0
unemployement,0.333333,0.0,0.0


In [34]:
# Calculate IDF scores for each word in the vocabulary
idf_score = {}
for i in tf_vocab:
    idf_score[i] = np.log10(len(doc)/freq_idf(i, doc))  
idf_score

{'company': 0.47712125471966244,
 'fear': 0.47712125471966244,
 'increased': 0.0,
 'inflation': 0.47712125471966244,
 'pulse': 0.47712125471966244,
 'sales': 0.47712125471966244,
 'unemployement': 0.47712125471966244}

In [35]:
pd.DataFrame(columns=["IDF Score"], index=idf_score.keys(), data=idf_score.values())

Unnamed: 0,IDF Score
company,0.477121
fear,0.477121
increased,0.0
inflation,0.477121
pulse,0.477121
sales,0.477121
unemployement,0.477121


In [36]:
tf_idf = {}
for i in idf_score:
    tf_idf[i] = tf_score[i]*idf_score[i]
tf_idf

{'company': array([0.        , 0.15904042, 0.        ]),
 'fear': array([0.        , 0.        , 0.15904042]),
 'increased': array([0., 0., 0.]),
 'inflation': array([0.15904042, 0.        , 0.        ]),
 'pulse': array([0.        , 0.        , 0.15904042]),
 'sales': array([0.        , 0.15904042, 0.        ]),
 'unemployement': array([0.15904042, 0.        , 0.        ])}

In [37]:
pd.DataFrame(index=tf_idf.keys(), columns=[doc1_clean, doc2_clean, doc3_clean], data=tf_idf.values())

Unnamed: 0_level_0,increased,inflation,unemployement
Unnamed: 0_level_1,company,increased,sales
Unnamed: 0_level_2,fear,increased,pulse
company,0.0,0.15904,0.0
fear,0.0,0.0,0.15904
increased,0.0,0.0,0.0
inflation,0.15904,0.0,0.0
pulse,0.0,0.0,0.15904
sales,0.0,0.15904,0.0
unemployement,0.15904,0.0,0.0


In [38]:
freq_idf("increased", doc)

3

In [39]:
doc

['increased inflation unemployement',
 'company increased sales',
 'fear increased pulse']

In [40]:
tf_vocab[0]

'company'

In [41]:
freq(preprocess(doc1)[0], tf_vocab)

1

##  e. Explore Scikit learn to implement TF-IDF


In [42]:
sentence="""Inflation has increased unemployement. The company has increased its sales
Fear increased his pulse"""



In [43]:
# create object
tfidf = TfidfVectorizer()
sent=[sentence]
# get tf-df values
result = tfidf.fit_transform(sent)
print(result)

  (0, 7)	0.21320071635561041
  (0, 3)	0.21320071635561041
  (0, 1)	0.21320071635561041
  (0, 8)	0.21320071635561041
  (0, 6)	0.21320071635561041
  (0, 0)	0.21320071635561041
  (0, 9)	0.21320071635561041
  (0, 10)	0.21320071635561041
  (0, 4)	0.6396021490668313
  (0, 2)	0.42640143271122083
  (0, 5)	0.21320071635561041
