## `texts_to_matrix` vs `texts_to_sequences`

In [7]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
def text_to_mat(doc, num_words=10, mode='count'):
    """Fit a Keras ``Tokenizer`` on ``doc`` and print its lookup tables and text matrix.

    Printed, in order: ``index_docs``, ``index_word`` (under the label
    ``index_words``), ``word_index``, ``word_docs`` and finally the result of
    ``Tokenizer.texts_to_matrix``.

    Parameters
    ----------
    doc : str or iterable of str
        The documents to tokenize. A bare string is treated as one document.
    num_words : int or None, default 10
        Forwarded to ``Tokenizer(num_words=...)``. In the recorded run with the
        default of 10 the matrix has 10 columns and column 0 is never used.
    mode : str, default 'count'
        Forwarded to ``Tokenizer.texts_to_matrix(..., mode=...)``.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # Materialise the corpus once so that fitting and vectorising always see the
    # same documents (a bare string or a one-shot iterator would otherwise be
    # consumed differently by the two calls).
    texts = [doc] if isinstance(doc, str) else list(doc)

    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(texts)

    print ( "index_docs",tok.index_docs); print ()
    print ( "index_words",tok.index_word); print ()
    print ( "word_index",tok.word_index); print ()

    print ( "word_docs",tok.word_docs); print ()

    mat_texts = tok.texts_to_matrix(texts, mode=mode)
    print (mat_texts)

In [42]:
# Three short sentences with a heavily shared vocabulary; "great" occurs twice
# in the first one, which is what produces the 2 in the count matrix below.
doc = [
    "Python is great great and useful",
    "Python is easy to learn",
    "Python is easy to implement",
]
text_to_mat(doc)

index_docs defaultdict(<class 'int'>, {7: 1, 3: 1, 2: 3, 1: 3, 6: 1, 5: 2, 8: 1, 4: 2, 9: 1})

index_words {1: 'python', 2: 'is', 3: 'great', 4: 'easy', 5: 'to', 6: 'and', 7: 'useful', 8: 'learn', 9: 'implement'}

word_index {'python': 1, 'is': 2, 'great': 3, 'easy': 4, 'to': 5, 'and': 6, 'useful': 7, 'learn': 8, 'implement': 9}

word_docs defaultdict(<class 'int'>, {'useful': 1, 'great': 1, 'is': 3, 'python': 3, 'and': 1, 'to': 2, 'learn': 1, 'easy': 2, 'implement': 1})

[[0. 1. 1. 2. 0. 0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 1. 1. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 1. 0. 0. 0. 1.]]


In [64]:
def text_to_seq(doc, Print=True, num_words=10):
    """Fit a Keras ``Tokenizer`` on ``doc`` and return the documents as integer sequences.

    Parameters
    ----------
    doc : str or iterable of str
        The documents to tokenize. A bare string is treated as one document.
    Print : bool, default True
        When true, print the tokenizer's ``index_docs``, ``index_word``,
        ``word_index`` and ``word_docs`` tables, the count matrix from
        ``texts_to_matrix`` and the sequences themselves.
    num_words : int or None, default 10
        Forwarded to ``Tokenizer(num_words=...)``.

    Returns
    -------
    list
        Whatever ``Tokenizer.texts_to_sequences`` returns for ``doc`` (one list
        of word ids per document in the recorded runs).
    """
    # Materialise the corpus once so that fitting and both conversions see the
    # same documents (a bare string or a one-shot iterator would otherwise be
    # consumed differently by each call).
    texts = [doc] if isinstance(doc, str) else list(doc)

    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(texts)

    texts_seq = tok.texts_to_sequences(texts)

    if Print:
        # The count matrix is only ever displayed, so build it only when asked to print.
        texts_mat = tok.texts_to_matrix(texts, mode='count')

        print ( "index_docs",tok.index_docs); print ()
        print ( "index_words",tok.index_word); print ()
        print ( "word_index",tok.word_index); print ()
        print ( "word_docs",tok.word_docs); print ()
        print ("texts_to_Matrix:\n",texts_mat); print()
        print ("texts_to_Seq:\n",texts_seq)

    return texts_seq


In [65]:
# Same three sentences as in the matrix demo, this time also shown as id sequences.
doc = [
    "Python is great great and useful",
    "Python is easy to learn",
    "Python is easy to implement",
]
# Left as the cell's last expression so the returned sequences are displayed too.
text_to_seq(doc, Print=True)

index_docs defaultdict(<class 'int'>, {7: 1, 3: 1, 2: 3, 1: 3, 6: 1, 5: 2, 8: 1, 4: 2, 9: 1})

index_words {1: 'python', 2: 'is', 3: 'great', 4: 'easy', 5: 'to', 6: 'and', 7: 'useful', 8: 'learn', 9: 'implement'}

word_index {'python': 1, 'is': 2, 'great': 3, 'easy': 4, 'to': 5, 'and': 6, 'useful': 7, 'learn': 8, 'implement': 9}

word_docs defaultdict(<class 'int'>, {'useful': 1, 'great': 1, 'is': 3, 'python': 3, 'and': 1, 'to': 2, 'learn': 1, 'easy': 2, 'implement': 1})

texts_to_Matrix:
 [[0. 1. 1. 2. 0. 0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 1. 1. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 1. 0. 0. 0. 1.]]

texts_to_Seq:
 [[1, 2, 3, 3, 6, 7], [1, 2, 4, 5, 8], [1, 2, 4, 5, 9]]


[[1, 2, 3, 3, 6, 7], [1, 2, 4, 5, 8], [1, 2, 4, 5, 9]]

## `pad_sequences`

In [71]:
# Sentences of different lengths, so the sequences below need padding/truncation.
doc = [
    "Python is nice",
    "Python is easy to learn",
    "Python is easy to implement",
    "c++ is very very hard to learn",
]

# Print=False skips the vocabulary dump; only the id sequences are wanted here.
# NOTE(review): in the recorded output the 7-word last sentence yields only 6 ids --
# presumably "hard" falls outside the tokenizer's num_words=10 cap; confirm.
seq = text_to_seq(doc, Print=False)
print(seq)

[[2, 1, 7], [2, 1, 4, 3, 5], [2, 1, 4, 3, 8], [9, 1, 6, 6, 3, 5]]


In [72]:
# `doc` and `seq` are the sentences and id sequences built in the previous cell;
# `pad_sequences` is imported in the notebook's import cell.
print("actual doc:\n", doc)
print()
print("document to sequence:", seq)
print()

# One length shorter than most sequences and one longer than all of them, to show
# truncation and zero-padding respectively. With 'post' for both options the
# recorded output keeps the leading ids and appends the zeros at the end.
for maxlen in (3, 8):
    seq_padded = pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post')
    print("max length of the sequence:", maxlen)
    print(f"Padded Sequence with maxlen={maxlen}: \n{seq_padded}")
    print()

actual doc:
 ['Python is nice', 'Python is easy to learn', 'Python is easy to implement', 'c++ is very very hard to learn']

document to sequence: [[2, 1, 7], [2, 1, 4, 3, 5], [2, 1, 4, 3, 8], [9, 1, 6, 6, 3, 5]]

max length of the sequence: 3
Padded Sequence with maxlen=3: 
[[2 1 7]
 [2 1 4]
 [2 1 4]
 [9 1 6]]

max length of the sequence: 8
Padded Sequence with maxlen=8: 
[[2 1 7 0 0 0 0 0]
 [2 1 4 3 5 0 0 0]
 [2 1 4 3 8 0 0 0]
 [9 1 6 6 3 5 0 0]]

