# setup

## import statments

In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## variables and initializations

In [32]:
availible_corpus_ = 100
tokenizer = Tokenizer(num_words = availible_corpus_)

# obtain data

In [33]:
sentences = [
             'Thank you, Tony., And good afternoon, everyone.', 
             'Q3 was a solid quarter where we navigated a challenging supply environment to deliver year-over-year growth on the top line while beating expectations on gross margin and EPS.',
             'We had record third quarter revenue in DCG and Mobileye, while IOTG had an all-time record as it continued its recovery from COVID slowdowns.',
             'Our focus on execution continued as we delivered on our initial IDM 2.0 commitments.',
             'We broke ground on new fabs, shared our accelerated path to regain process performance leadership, and made our most dramatic architecture announcements in the decade.',
             'We also announced major customer wins across every part of our business, including in the data center with AWS and Google, new EVO designs and client, and exciting Mobileye partnerships with ZEEKR and Sixt SE.',
             'The demand for semiconductors remains strong, and our factories performed exceptionally well in a highly dynamic environment, where matched sets post huge challenges for our customers, and overall, industry supply remained very constrained.'
            ]

# sentences = [
#     'I love my dog',
#     'I love my cat',
#     'You love my dog!',
#     'Do you think my dog is amazing?'
# ]


## processing : tokenizing words

In [34]:
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
space = 3*' '
print(f'\n count of words{space}:{space}word')
for key, value in word_index.items():
    if len(str(value)) == 1:
        value = 13*' ' + str(value)
    elif len(str(value)) == 2:
        value = 12*' ' + str(value)
    elif len(str(value)) == 3:
        value = 11*' ' + str(value) 
    print(f' {value}{space}:{space}{key} ')


 count of words   :   word
              1   :   and 
              2   :   our 
              3   :   we 
              4   :   on 
              5   :   the 
              6   :   in 
              7   :   a 
              8   :   quarter 
              9   :   where 
             10   :   supply 
             11   :   environment 
             12   :   to 
             13   :   year 
             14   :   while 
             15   :   had 
             16   :   record 
             17   :   mobileye 
             18   :   as 
             19   :   continued 
             20   :   new 
             21   :   with 
             22   :   for 
             23   :   thank 
             24   :   you 
             25   :   tony 
             26   :   good 
             27   :   afternoon 
             28   :   everyone 
             29   :   q3 
             30   :   was 
             31   :   solid 
             32   :   navigated 
             33   :   challenging 
             34   :   d

## processing the word count in the sentences

In [44]:
sequences = tokenizer.texts_to_sequences(sentences)

for i,j in zip(sequences,sentences):
    print(f'\n{j}\n{i}')


Thank you, Tony., And good afternoon, everyone.
[24, 25, 26, 2, 27, 28, 29]

Q3 was a solid quarter where we navigated a challenging supply environment to deliver year-over-year growth on the top line while beating expectations on gross margin and EPS.
[30, 31, 8, 32, 9, 10, 4, 33, 8, 34, 11, 12, 13, 35, 14, 36, 14, 37, 5, 6, 38, 39, 15, 40, 41, 5, 42, 43, 2, 44]

We had record third quarter revenue in DCG and Mobileye, while IOTG had an all-time record as it continued its recovery from COVID slowdowns.
[4, 16, 17, 45, 9, 46, 7, 47, 2, 18, 15, 48, 16, 49, 50, 51, 17, 19, 52, 20, 53, 54, 55, 56, 57]

Our focus on execution continued as we delivered on our initial IDM 2.0 commitments.
[3, 58, 5, 59, 20, 19, 4, 60, 5, 3, 61, 62, 63, 64, 65]

We broke ground on new fabs, shared our accelerated path to regain process performance leadership, and made our most dramatic architecture announcements in the decade.
[4, 66, 67, 5, 21, 68, 69, 3, 70, 71, 13, 72, 73, 74, 75, 2, 76, 3, 77, 78, 79, 80

## processing the word count in the sentences WITH out of corpus vocabulary (oov)

In [42]:
tokenizer = Tokenizer(num_words = availible_corpus_, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
test_seq = tokenizer.texts_to_sequences(sentences)

for i,j in zip(test_seq,sentences):
    #replace 1's with 0's
    i = ['oov' if x==1 else x for x in i]
    print(f'\n{j}\n{i}')


Thank you, Tony., And good afternoon, everyone.
[24, 25, 26, 2, 27, 28, 29]

Q3 was a solid quarter where we navigated a challenging supply environment to deliver year-over-year growth on the top line while beating expectations on gross margin and EPS.
[30, 31, 8, 32, 9, 10, 4, 33, 8, 34, 11, 12, 13, 35, 14, 36, 14, 37, 5, 6, 38, 39, 15, 40, 41, 5, 42, 43, 2, 44]

We had record third quarter revenue in DCG and Mobileye, while IOTG had an all-time record as it continued its recovery from COVID slowdowns.
[4, 16, 17, 45, 9, 46, 7, 47, 2, 18, 15, 48, 16, 49, 50, 51, 17, 19, 52, 20, 53, 54, 55, 56, 57]

Our focus on execution continued as we delivered on our initial IDM 2.0 commitments.
[3, 58, 5, 59, 20, 19, 4, 60, 5, 3, 61, 62, 63, 64, 65]

We broke ground on new fabs, shared our accelerated path to regain process performance leadership, and made our most dramatic architecture announcements in the decade.
[4, 66, 67, 5, 21, 68, 69, 3, 70, 71, 13, 72, 73, 74, 75, 2, 76, 3, 77, 78, 79, 80

In [57]:
tokenizer = Tokenizer(num_words = availible_corpus_, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
test_seq = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences)

print(f'input sentences\ntokened sentences')
for i,j in zip(test_seq,sentences):
    #replace 1's with 0's
    i = ['oov' if x==1 else x for x in i]
    print(f'\n{j}\n{i}')
    
print(f'\n\npadded sentences')
for k in padded:
    print(f'\n{k}\n')

input sentences
tokened sentences

Thank you, Tony., And good afternoon, everyone.
[24, 25, 26, 2, 27, 28, 29]

Q3 was a solid quarter where we navigated a challenging supply environment to deliver year-over-year growth on the top line while beating expectations on gross margin and EPS.
[30, 31, 8, 32, 9, 10, 4, 33, 8, 34, 11, 12, 13, 35, 14, 36, 14, 37, 5, 6, 38, 39, 15, 40, 41, 5, 42, 43, 2, 44]

We had record third quarter revenue in DCG and Mobileye, while IOTG had an all-time record as it continued its recovery from COVID slowdowns.
[4, 16, 17, 45, 9, 46, 7, 47, 2, 18, 15, 48, 16, 49, 50, 51, 17, 19, 52, 20, 53, 54, 55, 56, 57]

Our focus on execution continued as we delivered on our initial IDM 2.0 commitments.
[3, 58, 5, 59, 20, 19, 4, 60, 5, 3, 61, 62, 63, 64, 65]

We broke ground on new fabs, shared our accelerated path to regain process performance leadership, and made our most dramatic architecture announcements in the decade.
[4, 66, 67, 5, 21, 68, 69, 3, 70, 71, 13, 72, 73