In [1]:
text ="""Data analysis is not just a mere process; it's a tool that empowers organizations to make informed decisions, predict trends, and improve operational efficiency.
It's the backbone of strategic planning in businesses, governments, and other organizations.Consider some examples. Take, for instance, a leading e-commerce company.
Through data analysis, the company can understand their customers' buying behavior, preferences, and patterns.
They can then use this information to personalize customer experiences, forecast sales, and optimize marketing strategies, ultimately driving business growth and customer satisfaction.
Another good example is the healthcare industry.
Through data analysis, healthcare providers can predict disease outbreaks, improve patient care, and make informed decisions about treatment strategies. """

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Initiate the Tokenizer
tokenizer = Tokenizer()

In [4]:
tokenizer.fit_on_texts([text])

In [5]:
len(tokenizer.word_index)

81

In [6]:
for sentence in text.split('\n'):
    print(sentence)

Data analysis is not just a mere process; it's a tool that empowers organizations to make informed decisions, predict trends, and improve operational efficiency.
It's the backbone of strategic planning in businesses, governments, and other organizations.Consider some examples. Take, for instance, a leading e-commerce company. 
Through data analysis, the company can understand their customers' buying behavior, preferences, and patterns. 
They can then use this information to personalize customer experiences, forecast sales, and optimize marketing strategies, ultimately driving business growth and customer satisfaction.
Another good example is the healthcare industry. 
Through data analysis, healthcare providers can predict disease outbreaks, improve patient care, and make informed decisions about treatment strategies. 


In [7]:
for sentence in text.split('\n'):
  print(tokenizer.texts_to_sequences([sentence]))

[[2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28, 1, 15, 29, 30]]
[[8, 5, 31, 32, 33, 34, 35, 36, 37, 1, 38, 9, 39, 40, 41, 42, 43, 44, 4, 45, 46, 47, 16]]
[[17, 2, 3, 5, 16, 6, 48, 49, 50, 51, 52, 53, 1, 54]]
[[55, 6, 56, 57, 58, 59, 10, 60, 18, 61, 62, 63, 1, 64, 65, 19, 66, 67, 68, 69, 1, 18, 70]]
[[71, 72, 73, 7, 5, 20, 74]]
[[17, 2, 3, 20, 75, 6, 14, 76, 77, 15, 78, 79, 1, 11, 12, 13, 80, 81, 19]]


In [8]:
input_squences=[]

for sentence in text.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1, len(tokenized_sentence)):
      input_squences.append(tokenized_sentence[:i+1])

In [9]:
input_squences

[[2, 3],
 [2, 3, 7],
 [2, 3, 7, 21],
 [2, 3, 7, 21, 22],
 [2, 3, 7, 21, 22, 4],
 [2, 3, 7, 21, 22, 4, 23],
 [2, 3, 7, 21, 22, 4, 23, 24],
 [2, 3, 7, 21, 22, 4, 23, 24, 8],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28, 1],
 [2,
  3,
  7,
  21,
  22,
  4,
  23,
  24,
  8,
  4,
  25,
  26,
  27,
  9,
  10,
  11,
  12,
  13,
  14,
  28,
  1,
  15],
 [2,

In [10]:
max_len = max([len(x) for x in input_squences])

In [11]:
max_len

24

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_squences, maxlen=max_len, padding='pre')

In [13]:
padded_input_sequences

array([[ 0,  0,  0, ...,  0,  2,  3],
       [ 0,  0,  0, ...,  2,  3,  7],
       [ 0,  0,  0, ...,  3,  7, 21],
       ...,
       [ 0,  0,  0, ..., 12, 13, 80],
       [ 0,  0,  0, ..., 13, 80, 81],
       [ 0,  0,  0, ..., 80, 81, 19]], dtype=int32)

In [14]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [15]:
tokenizer.word_index

{'and': 1,
 'data': 2,
 'analysis': 3,
 'a': 4,
 'the': 5,
 'can': 6,
 'is': 7,
 "it's": 8,
 'organizations': 9,
 'to': 10,
 'make': 11,
 'informed': 12,
 'decisions': 13,
 'predict': 14,
 'improve': 15,
 'company': 16,
 'through': 17,
 'customer': 18,
 'strategies': 19,
 'healthcare': 20,
 'not': 21,
 'just': 22,
 'mere': 23,
 'process': 24,
 'tool': 25,
 'that': 26,
 'empowers': 27,
 'trends': 28,
 'operational': 29,
 'efficiency': 30,
 'backbone': 31,
 'of': 32,
 'strategic': 33,
 'planning': 34,
 'in': 35,
 'businesses': 36,
 'governments': 37,
 'other': 38,
 'consider': 39,
 'some': 40,
 'examples': 41,
 'take': 42,
 'for': 43,
 'instance': 44,
 'leading': 45,
 'e': 46,
 'commerce': 47,
 'understand': 48,
 'their': 49,
 "customers'": 50,
 'buying': 51,
 'behavior': 52,
 'preferences': 53,
 'patterns': 54,
 'they': 55,
 'then': 56,
 'use': 57,
 'this': 58,
 'information': 59,
 'personalize': 60,
 'experiences': 61,
 'forecast': 62,
 'sales': 63,
 'optimize': 64,
 'marketing': 65,
 

In [16]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)

In [17]:
y.shape

(104, 82)

In [18]:
X.shape

(104, 23)

## Model Building

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [22]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, 100, input_length=max_len-1))
model.add(GRU(150))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

history = model.fit(X_train, y_train,
                   epochs=100,
                   validation_data=(X_test, y_test),
                   verbose=1)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 314ms/step - accuracy: 0.0099 - loss: 4.4077 - val_accuracy: 0.0476 - val_loss: 4.4107
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - accuracy: 0.1507 - loss: 4.3879 - val_accuracy: 0.0476 - val_loss: 4.4090
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.4040 - loss: 4.3691 - val_accuracy: 0.0476 - val_loss: 4.4064
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.4612 - loss: 4.3494 - val_accuracy: 0.0476 - val_loss: 4.4018
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.4519 - loss: 4.3279 - val_accuracy: 0.0476 - val_loss: 4.3938
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.4197 - loss: 4.2896 - val_accuracy: 0.0476 - val_loss: 4.3762
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━

In [24]:
model.summary()

In [25]:
text2 = "Data"

## tokenization
token_text = tokenizer.texts_to_sequences([text2])[0]

## padding
padded_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')

## model prediction
prediction = model.predict(padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step


In [27]:
import numpy as np
pos = np.argmax(prediction)

In [30]:
for word, index in tokenizer.word_index.items():
  if index==pos:
    print(word)

analysis


In [31]:
text

"Data analysis is not just a mere process; it's a tool that empowers organizations to make informed decisions, predict trends, and improve operational efficiency.\nIt's the backbone of strategic planning in businesses, governments, and other organizations.Consider some examples. Take, for instance, a leading e-commerce company. \nThrough data analysis, the company can understand their customers' buying behavior, preferences, and patterns. \nThey can then use this information to personalize customer experiences, forecast sales, and optimize marketing strategies, ultimately driving business growth and customer satisfaction.\nAnother good example is the healthcare industry. \nThrough data analysis, healthcare providers can predict disease outbreaks, improve patient care, and make informed decisions about treatment strategies. "

In [32]:
text3 = "Data analysis is not just a mere process; it's a tool that empowers organizations to make informed decisions,"

token_text = tokenizer.texts_to_sequences([text3])[0]
padded_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')
prediction = model.predict(padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


In [33]:
pos1 = np.argmax(prediction)
for word, index in tokenizer.word_index.items():
  if index==pos1:
    print(word)

predict
