<a href="https://colab.research.google.com/github/imukoki/NLP-FELLOWSHIP/blob/Week-3/Text_To_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text to features
This is the process of converting tokens to numbers. This is because the machine only works with numbers. Moreover, for manipulation of text, the tokens need to be in digit form to apply any transformations.

The input of the function will be the matrix of tokens and output will be matrix with digits.

## Simplest form of featurization
The simplest way is to assign each unique text a number starting from 0 and increase by one until all the text has been assigned numbers

In [1]:
import numpy as np

In [32]:
# fitting


list_sentences = ['this is a list of sentences example', 'second sentence in list of sentence', 'a word for complexity',]
all_word = set()
#word_to_index = {}
#index_to_word = {}

for sent in list_sentences:
    #split the sentence into words and adding words into a set
    all_word.update(sent.split())
  
# Creating dictionaries    
word_to_index = dict(zip(all_word, range(len(all_word))))
index_to_word = dict(zip(range(len(all_word)), all_word))
print(word_to_index)

{'word': 0, 'for': 1, 'list': 2, 'sentence': 3, 'sentences': 4, 'in': 5, 'this': 6, 'is': 7, 'second': 8, 'a': 9, 'example': 10, 'complexity': 11, 'of': 12}


## Bag Of Words (BoW)
* Split the sentences into words
* Create a dictionary with all unique words and their indices
* Create a vector, size same as the total number of unique words
* For every word in a sentence, get the index and add 1. 
* The result will be a vector for each sentence with length same as all the unique words in all sentences, with frequency of each word in one particular sentence. If a word is not in that sentence, the frequency is 0


In [3]:
transformed_list_words = np.zeros(len(all_word))
transformed_list_words

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [4]:
# Create a vector, size same as the total number of unique words

transformed_list_sentences = np.empty((len(list_sentences),len(all_word)))
transformed_list_sentences

array([[ 2.33582538e-316,  2.33419537e-312,  2.16443571e-312,
         2.41907520e-312,  2.14321575e-312,  2.01589600e-312,
         2.22809558e-312,  2.46151512e-312,  2.52517499e-312,
         2.41907520e-312,  2.44029516e-312,  1.29441743e-312,
         2.33419537e-312],
       [ 9.76118064e-313,  2.14321575e-312,  2.35541533e-312,
         8.48798317e-313,  2.14321575e-312,  8.48798317e-313,
         2.29175545e-312,  2.01589600e-312,  2.35541533e-312,
         2.12199579e-312,  8.70018274e-313,  2.46151512e-312,
         2.05833592e-312],
       [ 2.44029516e-312,  2.35541533e-312,  2.31297541e-312,
         2.12199579e-312,  2.29175545e-312,  2.44029516e-312,
         2.01589600e-312,  2.35541533e-312,  2.12199579e-312,
        -7.61117174e+190,  6.91661028e-310,  6.91661028e-310,
         1.53437343e+162]])

In [5]:
for row,sentence in enumerate(list_sentences):
    #replace row with sentence BoW
    
    
    transformed_list_words = np.zeros(len(all_word))
    for word in sentence.split():
        if word in all_word:
            #get the index of the word
            word_index = word_to_index[word]
            #increase the value by 1
            transformed_list_words[word_index]+=1
            #print(transformed_list_words)

    transformed_list_sentences[row] =  transformed_list_words

print(dict(sorted(word_to_index.items(), key=lambda item: item[1])))
transformed_list_sentences


{'word': 0, 'for': 1, 'list': 2, 'sentence': 3, 'sentences': 4, 'in': 5, 'this': 6, 'is': 7, 'second': 8, 'a': 9, 'example': 10, 'complexity': 11, 'of': 12}


array([[0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1.],
       [0., 0., 1., 2., 0., 1., 0., 0., 1., 0., 0., 0., 1.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.]])

In [6]:
test_sentence = 'this is an example of sentence with sentence that are complexity with many example'
transformed_list_words = np.zeros(len(all_word))
for word in test_sentence.split():
    if word in all_word:
        #get the index of the word
        word_index = word_to_index[word]
        #increase the value by 1
        transformed_list_words[word_index]+=1
        #print(transformed_list_words)
transformed_list_words

array([0., 0., 0., 2., 0., 0., 1., 1., 0., 0., 2., 1., 1.])

## In-class Practicals

Convert this into a class and functions:


In [42]:
class Bow:
    def __init__(self):
        # Enter code here
      self.all_word = set()

    def fit(self,document):
      #Enter code
      for sent in document:
        self.all_word.update(sent.strip())
        self.word_to_index = dict(zip(self.all_word, range(len(self.all_word))))
        self.index_to_word = dict(zip(range(len(self.all_word)), self.all_word))
        

    def transform(self,data):
      #Enter code
      for row,sentence in enumerate(data): 
        transformed_list_words = np.zeros(len(all_word))
        for word in sentence.split():
            if word in all_word:
                #get the index of the word
                word_index = word_to_index[word]
                #increase the value by 1
                transformed_list_words[word_index]+=1
                #print(transformed_list_words)
        transformed_list_sentences[row] =  transformed_list_words

      return transformed_list_sentences

    def fit_transform(self,data):
      self.fit(data)

      return self.transform(data)

    def _transform_single(self,list_words):
      # This code is for transforming a single sentence
      transformed_list_words = np.zeros(len(all_word))
      for word in test_sentence.split():
          if word in all_word:
              #get the index of the word
              word_index = word_to_index[word]
              #increase the value by 1
              transformed_list_words[word_index]+=1

      return transformed_list_words

## Assignment
BOW using your tokens and share some sentences to text

In [50]:
# The Bow object
bow = Bow()

# Fitting the words
bow.fit(list_sentences)

# Transforming the words
print(bow.transform(list_sentences))

print('-'*41)
# Fitting and transforming the words
print(bow.fit_transform(list_sentences))

print('-'*41)
# Transforming a sentence
print(bow._transform_single(test_sentence))


[[0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1.]
 [0. 0. 1. 2. 0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]]
-----------------------------------------
[[0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1.]
 [0. 0. 1. 2. 0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]]
-----------------------------------------
[0. 0. 0. 2. 0. 0. 1. 1. 0. 0. 2. 1. 1.]
