In [None]:
# pip install --upgrade tensorflow

In [None]:
# Toy training corpus: a short passage about data analysis. The tokenizer is
# fitted on this string, and it is split on '.' to form per-sentence sequences.
text ="""Data analysis is not just a mere process; it's a tool that empowers
organizations to make informed decisions, predict trends, and improve operational efficiency.
It's the backbone of strategic planning in businesses, governments, and other
organizations.Consider some examples. Take, for instance, a leading e-commerce
company. Through data analysis, the company can understand their customers'
buying behavior, preferences, and patterns. They can then use this information
to personalize customer experiences, forecast sales, and optimize marketing
strategies, ultimately driving business growth and customer satisfaction.
Another good example is the healthcare industry. Through data analysis,
healthcare providers can predict disease outbreaks, improve patient care, and
make informed decisions about treatment strategies. """

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Create a tokenizer with default settings, then learn the word -> integer
# index vocabulary from the corpus (passed as a one-element list of texts).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[text])

In [None]:
# Number of distinct words the tokenizer assigned an index to.
len(tokenizer.word_index)

81

In [None]:
# Show every '.'-delimited piece of the corpus as its list of word indices.
# All pieces are encoded in one call; the result has one list per piece.
for token_ids in tokenizer.texts_to_sequences(text.split('.')):
    print(token_ids)

[2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28, 1, 15, 29, 30]
[8, 5, 31, 32, 33, 34, 35, 36, 37, 1, 38, 9]
[39, 40, 41]
[42, 43, 44, 4, 45, 46, 47, 16]
[17, 2, 3, 5, 16, 6, 48, 49, 50, 51, 52, 53, 1, 54]
[55, 6, 56, 57, 58, 59, 10, 60, 18, 61, 62, 63, 1, 64, 65, 19, 66, 67, 68, 69, 1, 18, 70]
[71, 72, 73, 7, 5, 20, 74]
[17, 2, 3, 20, 75, 6, 14, 76, 77, 15, 78, 79, 1, 11, 12, 13, 80, 81, 19]
[]


In [None]:
# Build the training examples: for every '.'-delimited sentence, collect each
# leading prefix of its token list that has at least two tokens. The last
# token of a prefix later serves as the label, the preceding ones as input.
input_squences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(text.split('.'))
    for prefix_len in range(2, len(token_ids) + 1)
]

In [None]:
# Inspect the generated prefixes (each one extends the previous by one token).
input_squences

[[2, 3],
 [2, 3, 7],
 [2, 3, 7, 21],
 [2, 3, 7, 21, 22],
 [2, 3, 7, 21, 22, 4],
 [2, 3, 7, 21, 22, 4, 23],
 [2, 3, 7, 21, 22, 4, 23, 24],
 [2, 3, 7, 21, 22, 4, 23, 24, 8],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28],
 [2, 3, 7, 21, 22, 4, 23, 24, 8, 4, 25, 26, 27, 9, 10, 11, 12, 13, 14, 28, 1],
 [2,
  3,
  7,
  21,
  22,
  4,
  23,
  24,
  8,
  4,
  25,
  26,
  27,
  9,
  10,
  11,
  12,
  13,
  14,
  28,
  1,
  15],
 [2,

In [None]:
# Length of the longest prefix; every sequence is padded to this width.
max_len = max(len(seq) for seq in input_squences)

In [None]:
# Longest prefix length found above.
max_len

24

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad ('pre') every prefix up to max_len so that all rows share one
# width and the real tokens sit at the end of each row.
paded_input_squences = pad_sequences(
    sequences=input_squences,
    padding='pre',
    maxlen=max_len,
)

In [None]:
# Inspect the padded array: one row per prefix.
paded_input_squences

array([[ 0,  0,  0, ...,  0,  2,  3],
       [ 0,  0,  0, ...,  2,  3,  7],
       [ 0,  0,  0, ...,  3,  7, 21],
       ...,
       [ 0,  0,  0, ..., 12, 13, 80],
       [ 0,  0,  0, ..., 13, 80, 81],
       [ 0,  0,  0, ..., 80, 81, 19]], dtype=int32)

In [None]:
# Features are all columns but the last; the label is the last column,
# i.e. the word that follows the context held in X.
X, y = paded_input_squences[:, :-1], paded_input_squences[:, -1]

In [None]:
# Inspect the feature matrix (padded rows without their final token).
X

array([[ 0,  0,  0, ...,  0,  0,  2],
       [ 0,  0,  0, ...,  0,  2,  3],
       [ 0,  0,  0, ...,  2,  3,  7],
       ...,
       [ 0,  0,  0, ..., 11, 12, 13],
       [ 0,  0,  0, ..., 12, 13, 80],
       [ 0,  0,  0, ..., 13, 80, 81]], dtype=int32)

In [None]:
# Inspect the labels (the final token of every padded row).
y

array([ 3,  7, 21, 22,  4, 23, 24,  8,  4, 25, 26, 27,  9, 10, 11, 12, 13,
       14, 28,  1, 15, 29, 30,  5, 31, 32, 33, 34, 35, 36, 37,  1, 38,  9,
       40, 41, 43, 44,  4, 45, 46, 47, 16,  2,  3,  5, 16,  6, 48, 49, 50,
       51, 52, 53,  1, 54,  6, 56, 57, 58, 59, 10, 60, 18, 61, 62, 63,  1,
       64, 65, 19, 66, 67, 68, 69,  1, 18, 70, 72, 73,  7,  5, 20, 74,  2,
        3, 20, 75,  6, 14, 76, 77, 15, 78, 79,  1, 11, 12, 13, 80, 81, 19],
      dtype=int32)

In [None]:
# Word -> index mapping learned by the tokenizer.
tokenizer.word_index

{'and': 1,
 'data': 2,
 'analysis': 3,
 'a': 4,
 'the': 5,
 'can': 6,
 'is': 7,
 "it's": 8,
 'organizations': 9,
 'to': 10,
 'make': 11,
 'informed': 12,
 'decisions': 13,
 'predict': 14,
 'improve': 15,
 'company': 16,
 'through': 17,
 'customer': 18,
 'strategies': 19,
 'healthcare': 20,
 'not': 21,
 'just': 22,
 'mere': 23,
 'process': 24,
 'tool': 25,
 'that': 26,
 'empowers': 27,
 'trends': 28,
 'operational': 29,
 'efficiency': 30,
 'backbone': 31,
 'of': 32,
 'strategic': 33,
 'planning': 34,
 'in': 35,
 'businesses': 36,
 'governments': 37,
 'other': 38,
 'consider': 39,
 'some': 40,
 'examples': 41,
 'take': 42,
 'for': 43,
 'instance': 44,
 'leading': 45,
 'e': 46,
 'commerce': 47,
 'understand': 48,
 'their': 49,
 "customers'": 50,
 'buying': 51,
 'behavior': 52,
 'preferences': 53,
 'patterns': 54,
 'they': 55,
 'then': 56,
 'use': 57,
 'this': 58,
 'information': 59,
 'personalize': 60,
 'experiences': 61,
 'forecast': 62,
 'sales': 63,
 'optimize': 64,
 'marketing': 65,
 

In [None]:
from tensorflow.keras.utils import to_categorical

# Number of output classes: one per word index plus one for index 0, which no
# word uses (word_index starts at 1).
vocab_size = len(tokenizer.word_index) + 1

# One-hot encode the next-word labels. The integer labels are taken from the
# last column of the padded array rather than from `y` itself, so re-running
# this cell does not one-hot encode an already encoded `y`.
y = to_categorical(paded_input_squences[:, -1], num_classes=vocab_size)

In [None]:
# (number of samples, number of classes) after one-hot encoding.
y.shape

(102, 82)

In [None]:
# (number of samples, context length), where context length is max_len - 1.
X.shape

(102, 23)

### Model Building

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable from one run of the notebook to the next.
tf.keras.utils.set_random_seed(42)

# Size of the index space: highest word index + 1 (index 0 is the padding
# value). Derived from the tokenizer so the layers stay in sync with the
# corpus instead of relying on hand-typed constants.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM -> softmax over the vocabulary (next-word classifier).
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(LSTM(150))
# The output width must equal the number of one-hot label classes.
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train for 100 epochs, reporting validation metrics after each epoch.
# `history` keeps the returned training log.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 280ms/step - accuracy: 0.0179 - loss: 4.4074 - val_accuracy: 0.0476 - val_loss: 4.4067
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.0924 - loss: 4.3910 - val_accuracy: 0.0476 - val_loss: 4.4048
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.1002 - loss: 4.3734 - val_accuracy: 0.0000e+00 - val_loss: 4.4014
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.0699 - loss: 4.3338 - val_accuracy: 0.0000e+00 - val_loss: 4.4167
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.0403 - loss: 4.2595 - val_accuracy: 0.0000e+00 - val_loss: 4.5663
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 0.0559 - loss: 4.1980 - val_accuracy: 0.0476 - val_loss: 4.6469
Epoch 7/100
[1m3/3[0m [3

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

#### *Test the model*

In [None]:
# Seed text whose next word we want the model to predict.
text2 = "Data"

# Encode the seed text; texts_to_sequences returns one list per input text,
# so unpack the single result.
(token_text,) = tokenizer.texts_to_sequences([text2])

# Left-pad to the width the model was trained on (max_len - 1 context tokens).
padded_text = pad_sequences(sequences=[token_text], padding='pre', maxlen=max_len - 1)

# Per-index scores for the next word.
predicted = model.predict(x=padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


In [None]:
# Raw model output for the seed text: one score per word index.
predicted

array([[4.10959983e-05, 1.28434986e-04, 1.93160519e-01, 1.34249628e-01,
        2.48359353e-03, 1.39928624e-01, 5.50409779e-02, 1.70170218e-01,
        1.66222904e-04, 4.89087506e-05, 1.00370077e-03, 5.14242623e-04,
        9.54321222e-05, 3.17834754e-04, 5.43840928e-04, 5.33401675e-04,
        1.21122890e-03, 2.77114541e-05, 2.69305776e-04, 2.77052313e-04,
        1.48135843e-02, 1.66900866e-02, 3.83977429e-03, 3.03897308e-04,
        2.13171006e-04, 2.90307769e-04, 1.29129439e-05, 1.86316771e-04,
        4.24639766e-05, 1.07719379e-05, 6.20197625e-06, 2.20925771e-02,
        4.61700605e-03, 4.12560548e-05, 6.68598339e-04, 9.80787227e-05,
        2.83038768e-04, 2.48467604e-05, 2.00805836e-04, 4.66855709e-05,
        3.00779473e-02, 2.10966114e-02, 3.08988565e-05, 4.18333858e-02,
        1.61782373e-02, 4.38339950e-04, 1.43750331e-05, 9.63398779e-05,
        2.28187142e-04, 7.78836766e-05, 9.56284057e-05, 2.73681089e-05,
        4.91993087e-05, 1.15188232e-04, 1.03923558e-04, 3.238688

In [None]:
import numpy as np

# Index of the highest-scoring entry in the prediction.
print(predicted.argmax())

2


In [None]:
# Word -> index mapping, shown again to look up the predicted index by eye.
tokenizer.word_index

{'and': 1,
 'data': 2,
 'analysis': 3,
 'a': 4,
 'the': 5,
 'can': 6,
 'is': 7,
 "it's": 8,
 'organizations': 9,
 'to': 10,
 'make': 11,
 'informed': 12,
 'decisions': 13,
 'predict': 14,
 'improve': 15,
 'company': 16,
 'through': 17,
 'customer': 18,
 'strategies': 19,
 'healthcare': 20,
 'not': 21,
 'just': 22,
 'mere': 23,
 'process': 24,
 'tool': 25,
 'that': 26,
 'empowers': 27,
 'trends': 28,
 'operational': 29,
 'efficiency': 30,
 'backbone': 31,
 'of': 32,
 'strategic': 33,
 'planning': 34,
 'in': 35,
 'businesses': 36,
 'governments': 37,
 'other': 38,
 'consider': 39,
 'some': 40,
 'examples': 41,
 'take': 42,
 'for': 43,
 'instance': 44,
 'leading': 45,
 'e': 46,
 'commerce': 47,
 'understand': 48,
 'their': 49,
 "customers'": 50,
 'buying': 51,
 'behavior': 52,
 'preferences': 53,
 'patterns': 54,
 'they': 55,
 'then': 56,
 'use': 57,
 'this': 58,
 'information': 59,
 'personalize': 60,
 'experiences': 61,
 'forecast': 62,
 'sales': 63,
 'optimize': 64,
 'marketing': 65,
 

In [None]:
# Decode the predicted index back to its word. The argmax is computed once,
# and the tokenizer's index -> word table replaces a scan over every
# vocabulary entry.
predicted_index = int(np.argmax(predicted))
predicted_word = tokenizer.index_word.get(predicted_index)

# Index 0 has no word attached; in that case nothing is printed, as before.
if predicted_word is not None:
  print(predicted_word)

data


In [None]:
# Longer seed text: predict the word that follows it.
text3= "Data analysis is not just a mere process it's a tool"

# Encode the seed text as word indices (one list per input text).
token_text3 = tokenizer.texts_to_sequences([text3])[0]

# Left-pad to the width the model was trained on (max_len - 1 context tokens).
padded_text = pad_sequences([token_text3], maxlen=max_len-1, padding='pre')

# Per-index scores for the next word.
predicted = model.predict(padded_text)

# Highest-scoring index, computed once and reused for decoding.
pos = int(np.argmax(predicted))

# Decode through the tokenizer's index -> word table. Index 0 has no word
# attached; in that case nothing is printed, as before.
predicted_word = tokenizer.index_word.get(pos)
if predicted_word is not None:
  print(predicted_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
tool


#### Improvements:
1. Increasing the LSTM units
2. Increasing the Dense layers
3. Increasing the Data
4. Using a pre-trained model trained on a larger corpus of data
5. Hyperparameter optimization