In [1]:
text = """ABOUT PUNE
Pune is the second largest city in the state of Maharashtra. Pune city stands bordered by the ranges of the Sahyadri Mountain. The moderate weather as well as the rich flora and fauna of the city make it an ideal travel destination. It has been known by a plethora of sobriquets. Popular among them: ‘Queen of the Deccan’, ‘Cultural capital of Maharashtra’, and ‘Oxford of the East’. It is also among the greenest urban areas in the country with more than 40 per cent of its area under green cover.

Pune is the ninth populous city of the India and one of the top 10 “Most Developed Cities” in India by GDP 2016 and one of the fastest developing cities in the Asia Pacific region. It has a very strong presence in the automobile sector and is on its way to consolidate its position as the 'Detroit of India' too. Once referred as 'Pensioner's Paradise' it is now home to many software and IT companies.

In Pune, the past meets the present. It is one of those rare cities with a twin image: that of a tradition-bound place - generally considered the quintessence of Maharashtrian culture - and that of a modern industrial metropolis.
Pune Climate
The climate of Pune is quite pleasant. Summer here begins from early March to July. Daytimes are very sunny with dry heat. Early mornings are pleasant and evenings after six, cool and breezy. Pune stands on the leeward side of the Western Ghats on an altitude of 559m. (1863 ft.).

Summers are typically hot with maximum temperatures ranging from 35℃ to 42℃. The warmest month in Pune is generally April as compared to the rest of the Deccan Plateau where it is May. Sunlight can be intense leading to skin tan and sun-burn, be sure to carry and use sunscreens. The nights in Pune are significantly cooler than most other parts in this region owing to its high altitude.

Guideline regarding clothing and conference attire:
Formal dress for Men and Women"""

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [4]:
len(tokenizer.word_index)

199

In [5]:
input_sequences = []
for sentence in text.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [6]:
input_sequences

[[35, 4],
 [4, 5],
 [4, 5, 1],
 [4, 5, 1, 36],
 [4, 5, 1, 36, 37],
 [4, 5, 1, 36, 37, 11],
 [4, 5, 1, 36, 37, 11, 6],
 [4, 5, 1, 36, 37, 11, 6, 1],
 [4, 5, 1, 36, 37, 11, 6, 1, 38],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15, 1],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15, 1, 41],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15, 1, 41, 2],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15, 1, 41, 2, 1],
 [4, 5, 1, 36, 37, 11, 6, 1, 38, 2, 39, 4, 11, 18, 40, 15, 1, 41, 2, 1, 42],
 [4,
  5,
  1,
  36,
  37,
  11,
  6,
  1,
  38,
  2,
  39,
  4,
  11,
  18,
  40,
  15,
  1,
  41,
  2,
  1,
  42,
  43],
 [4,
  

In [7]:
max_len = max([len(x) for x in input_sequences])
max_len

90

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [9]:
padded_input_sequences

array([[  0,   0,   0, ...,   0,  35,   4],
       [  0,   0,   0, ...,   0,   4,   5],
       [  0,   0,   0, ...,   4,   5,   1],
       ...,
       [  0,   0,   0, ..., 196, 197, 198],
       [  0,   0,   0, ..., 197, 198,   3],
       [  0,   0,   0, ..., 198,   3, 199]])

In [10]:
X = padded_input_sequences[:,:-1]

In [11]:
X

array([[  0,   0,   0, ...,   0,   0,  35],
       [  0,   0,   0, ...,   0,   0,   4],
       [  0,   0,   0, ...,   0,   4,   5],
       ...,
       [  0,   0,   0, ..., 195, 196, 197],
       [  0,   0,   0, ..., 196, 197, 198],
       [  0,   0,   0, ..., 197, 198,   3]])

In [12]:
X.shape

(330, 89)

In [13]:
y = padded_input_sequences[:,-1]

In [14]:
y

array([  4,   5,   1,  36,  37,  11,   6,   1,  38,   2,  39,   4,  11,
        18,  40,  15,   1,  41,   2,   1,  42,  43,   1,  44,  45,   9,
        46,   9,   1,  47,  48,   3,  49,   2,   1,  11,  50,   7,  19,
        51,  52,  53,   7,  20,  54,  55,  15,  10,  56,   2,  57,  58,
        21,  59,  60,   2,   1,  61,  62,  63,   2,  64,   3,  65,   2,
         1,  66,   7,   5,  67,  21,   1,  68,  69,  70,   6,   1,  71,
        12,  72,  22,  73,  74,  75,   2,  13,  76,  77,  78,  79,   5,
         1,  80,  81,  11,   2,   1,  23,   3,  16,   2,   1,  82,  83,
        84,  85,  86,   6,  23,  15,  87,  88,   3,  16,   2,   1,  89,
        90,  24,   6,   1,  91,  92,  25,   7,  20,  10,  26,  93,  94,
         6,   1,  95,  96,   3,   5,  17,  13,  97,   8,  98,  13,  99,
         9,   1, 100,   2, 101, 102, 103, 104,   9, 105, 106,   7,   5,
       107, 108,   8, 109, 110,   3,   7, 111,   4,   1, 112, 113,   1,
       114,   7,   5,  16,   2, 115, 116,  24,  12,  10, 117, 11

In [15]:
y.shape

(330,)

In [16]:
len(tokenizer.word_index)

199

In [17]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [18]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [19]:
y.shape

(330, 200)

In [20]:
max_len

90

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [22]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1, 100, input_shape=((max_len)-1,)))
model.add(LSTM(150))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))

  super().__init__(**kwargs)


In [23]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# model.summary()


In [25]:
model.fit(X,y,epochs=100)

Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 55ms/step - accuracy: 0.0341 - loss: 5.2914
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.0945 - loss: 5.0187
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.0660 - loss: 4.8521
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.0942 - loss: 4.8416
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.0697 - loss: 4.9034
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.0793 - loss: 4.7252
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - accuracy: 0.0662 - loss: 4.7896
Epoch 8/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - accuracy: 0.0996 - loss: 4.6670
Epoch 9/100
[1m11/11[0m [32m━━━━━━━━━

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - accuracy: 0.9969 - loss: 0.2765
Epoch 69/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - accuracy: 0.9842 - loss: 0.2773
Epoch 70/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.9898 - loss: 0.2557
Epoch 71/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - accuracy: 0.9972 - loss: 0.2479
Epoch 72/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.9941 - loss: 0.2400
Epoch 73/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.9955 - loss: 0.2337
Epoch 74/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 79ms/step - accuracy: 0.9945 - loss: 0.2098
Epoch 75/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.9958 - loss: 0.2150
Epoch 76/100
[1m11/11[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2511c56e050>

In [26]:
import time
text = "Pune is the "

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
Pune is the  second
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest city
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Pune is the  second largest city in
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest city in the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest city in the state
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Pune is the  second largest city in the state of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest city in the state of maharashtra
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pune is the  second largest city in th