# Transforming Text into Data Structures

Nama  : Ramanda Ajisaka Asyraf

NPM   : 20312067

Kelas : IF Gab 1

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Matrix Representation

kita dapat membangun sebuah vektor dan matriks berdasarkan data text dengan menggunakan modul **CountVectorizer** dari **sklearn**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

X = ("Computers can analyze text",
     "They do it using vectors and matrices",
     "Computers can process massive amounts of text data")

vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_)
print(X_vec.todense()) 

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


## CountVectorizer for Bag of Words Model
kita dapat membangun vocabolary dengan metode BoW

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Building a corpus of sentences

In [None]:
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]

In [None]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

#### Data preprocessing pipeline

In [None]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [None]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [None]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [None]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [None]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [None]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  # This is added back by InteractiveShellApp.init_path()


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

### CountVectorizer
merupakan sebuah API yang mempermudah kita untuk membangun BoW model dengan cara mengkonversi dokumen teks kedalam matriks

In [None]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer.get_feature_names())
print(bow_matrix.toarray())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]




In [None]:
print(bow_matrix.toarray().shape)

(3, 11)


### Features offered by CountVectorizer

In [None]:
vectorizer_ngram_range = CountVectorizer(analyzer='word', ngram_range=(1,3))
bow_matrix_ngram = vectorizer_ngram_range.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer_ngram_range.get_feature_names())
print(bow_matrix_ngram.toarray())

['comprehend', 'comprehend language', 'comprehend language data', 'computers', 'computers comprehend', 'computers comprehend language', 'data', 'everyday', 'evolve', 'evolve everyday', 'field', 'field natural', 'field natural language', 'language', 'language data', 'language process', 'language process evolve', 'language process make', 'make', 'make computers', 'make computers comprehend', 'natural', 'natural language', 'natural language process', 'process', 'process evolve', 'process evolve everyday', 'process make', 'process make computers', 'read', 'read natural', 'read natural language']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]




#### Understanding Max Features
max_features yang akan membangun kosakata sedemikian rupa sehingga ukuran kosakata
akan kurang dari atau sama dengan max_features yang dipesan oleh frekuensi token yang terjadi
dalam korpus, seperti yang diilustrasikan dalam blok kode berikut

In [None]:
vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_features = 6)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer_max_features.get_feature_names())
print(bow_matrix_max_features.toarray())

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]




#### Thresholding using Max_df and Min_df
max_df dan min_df dapat dilakukan dan
akibatnya memberikan ambang batas minimum dan maksimum terhadap terjadinya
frasa dalam korpus

In [None]:
vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_df = 3, min_df = 2)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer_max_features.get_feature_names())
print(bow_matrix_max_features.toarray())

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]




## Term Frequency-Inverse Document Frequency based Vectorizer

### TfIdfVectorizer
Vectorizer TF-IDF dasar dapat instantiated, seperti yang ditunjukkan dalam dua langkah yang ditunjukkan dalam
cuplikan kode berikut. Langkah kedua memungkinkan data dipasang ke TF-IDF
vectorizer, diikuti oleh transformasi data menjadi bentuk vektor TF-IDF menggunakan
fit_transform

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer.get_feature_names())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)




### Changing the norm to l1, default option is l2 which was used above
l2: Sum of squares of vector elements is 1.

l1: Sum of absolute values of vector elements is 1.

In [None]:
vectorizer_l1_norm = TfidfVectorizer(norm="l1")
tf_idf_matrix_l1_norm = vectorizer_l1_norm.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer_l1_norm.get_feature_names())
print(tf_idf_matrix_l1_norm.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_l1_norm.shape)

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.21307663 0.         0.21307663 0.21307663 0.3607701 ]
 [0.1571718  0.1571718  0.1571718  0.         0.         0.
  0.1856564  0.1571718  0.0928282  0.0928282  0.        ]
 [0.         0.         0.         0.2095624  0.2095624  0.2095624
  0.12377093 0.         0.12377093 0.12377093 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)




### N-grams and Max features with TfidfVectorizer
Mirip dengan CountVectorizer, vectorizer TF-IDF menawarkan kemampuan menggunakan n-gram.
dan max_features untuk membatasi kosa kata kita

In [None]:
vectorizer_n_gram_max_features = TfidfVectorizer(norm="l2", analyzer='word', ngram_range=(1,3), max_features = 6)
tf_idf_matrix_n_gram_max_features = vectorizer_n_gram_max_features.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer_n_gram_max_features.get_feature_names())
print(tf_idf_matrix_n_gram_max_features.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_n_gram_max_features.shape)

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]
 [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]
 [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]

The shape of the TF-IDF matrix is:  (3, 6)




## Cosine Similarity
Kesamaan cosine membantu dalam mengukur cosine dari sudut antara dua vektor. Si
nilai kesamaan cosine akan terletak pada kisaran -1 hingga +1.

### Measuring Cosine Similarity between Document Vectors

#### Cosine Similarity Calculation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def cosine_similarity(vector1, vector2):
    vector1 = np.array(vector1)
    vector2 = np.array(vector2)
    return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))

#### CountVectorizer

In [None]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer.get_feature_names())
print(bow_matrix.toarray())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]




#### Cosine similarity between the document vectors built using CountVectorizer

In [None]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, bow_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.6324555320336759
The cosine similarity between the documents  0 and 2 is:  0.6123724356957946
The cosine similarity between the documents  1 and 2 is:  0.5163977794943223


#### TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [None]:
print(vectorizer.get_feature_names())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)




#### Cosine similarity between the document vectors built using TfidfVectorizer

In [None]:
for i in range(tf_idf_matrix.shape[0]):
    for j in range(i + 1, tf_idf_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(tf_idf_matrix.toarray()[i], tf_idf_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.39514115766749125
The cosine similarity between the documents  0 and 2 is:  0.36365455673761865
The cosine similarity between the documents  1 and 2 is:  0.2810071916500233


## One Hot Vectors

In [None]:
sentence = ["We are reading about Natural Language Processing Here"]
corpus = pd.Series(sentence)
corpus
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  # This is added back by InteractiveShellApp.init_path()


['read natural language process']

### Building the vocabulary

In [None]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['process', 'natural', 'read', 'language']


### Fetching the position of each word in the vocabulary

In [None]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'process': 0, 'natural': 1, 'read': 2, 'language': 3}


### Instantiating the one hot matrix

In [None]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()), len(vocab)))
one_hot_matrix.shape

(4, 4)

### Building One Hot Vectors

In [None]:
for i, token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1

### Visualizing the One Hot Vectors

In [None]:
one_hot_matrix

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

## Building a basic chatbot

In [2]:
#!pip install scikit-learn
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


#loading questions and answers in separate lists
import ast 
questions = []
answers = [] 
with open('/content/drive/MyDrive/Colab Notebooks/PBA/qa_Electronics.json','r') as f:
    for line in f:
        data = ast.literal_eval(line)
        questions.append(data['question'].lower())
        answers.append(data['answer'].lower())

In [3]:
# tokenize the text and convert data in matrix format
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(questions)

In [4]:
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)

In [5]:
def conversation(im):
    global tfidf, answers, X_tfidf
    Y_vec = vectorizer.transform(im)
    Y_tfidf = tfidf.fit_transform(Y_vec)
    cos_sim = np.rad2deg(np.arccos(max(cosine_similarity(Y_tfidf, X_tfidf)[0])))
    if cos_sim > 60 :
        return "sorry, I did not quite understand that"
    else:
        return answers[np.argmax(cosine_similarity(Y_tfidf, X_tfidf)[0])]

def main():
    usr = input("Please enter your username: ")
    print("support: Hi, welcome to Q&A support. How can I help you?")
    while True:
        im = input("{}: ".format(usr))
        if im.lower() == 'bye':
            print("Q&A support: bye!")
            break
        else:
            print("Q&A support: "+conversation([im]))

In [7]:
main()

Please enter your username: ramanda
support: Hi, welcome to Q&A support. How can I help you?
ramanda: bluescreen
Q&A support: goes completely black
ramanda: how to repair
Q&A support: i have never had any problem that needed any type of repair. pioneer is a long time trusted name in sound electronics. there are hundreds of pioneer repair centers around the country. i had no worries about that. it can be difficult to operate sometimes mostly because it uses wifi. sometimes i've had to turn it off and then back on to get it working again. performance has been great.
ramanda: bye
Q&A support: bye!


## Bag of Words in Action

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

# Take in a list of sentences
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Create a Pandas Series of the object


In [11]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

### Data preprocessing

In [12]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [13]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [14]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [15]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [16]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [17]:
common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']

In [18]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  # This is added back by InteractiveShellApp.init_path()


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

### Building the vocabulary

In [19]:
set_of_words = set()
for sentence in preprocessed_corpus:
    for word in sentence.split():
        set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['everyday', 'evolve', 'field', 'language', 'read', 'computers', 'data', 'make', 'process', 'comprehend', 'natural']


### Fetching the position of each word in the vocabulary

In [20]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'everyday': 0, 'evolve': 1, 'field': 2, 'language': 3, 'read': 4, 'computers': 5, 'data': 6, 'make': 7, 'process': 8, 'comprehend': 9, 'natural': 10}


### Creating a matrix to hold the Bag of Words representation

In [21]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))

In [22]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():   
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1

### Bag of Words representation

In [23]:
bow_matrix

array([[0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 2., 0., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1.]])