In [1]:
!pip install tensorflow==2.16.2


Collecting tensorflow==2.16.2
  Downloading tensorflow-2.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (590.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow==2.16.2)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow==2.16.2)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.17,>=2.16 (from tensorflow==2.16.2)
  Downloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m85.3 MB/

In [2]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [3]:
# Load the raw commentary table. The path is hard-coded (it looks like a Colab
# working directory); adjust it when running the notebook elsewhere.
data_text  = pd.read_csv('/content/df_commentary_new.csv')

In [4]:
def data_processing(data, number_of_rows, list_of_columns):
    """Remove unwanted columns and keep only the leading rows of a frame.

    Args:
        data: DataFrame to trim; it is not modified in place.
        number_of_rows: Number of rows to keep, counted from the top.
        list_of_columns: Column labels to remove.

    Returns:
        A new DataFrame without ``list_of_columns`` and limited to the first
        ``number_of_rows`` rows.
    """
    without_columns = data.drop(columns=list_of_columns)
    return without_columns.head(number_of_rows)

In [5]:
def tokenize(data):
    """Fit a fresh Keras ``Tokenizer`` on the commentary text.

    Args:
        data: Frame (or mapping) exposing a 'Modified_Commentary' collection
            of strings.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    commentary = data['Modified_Commentary']
    fitted.fit_on_texts(commentary)
    return fitted

In [6]:
def input_sequences(data, fitted_tokenizer=None):
    """Build every n-gram prefix (length >= 2) of each tokenized commentary.

    Args:
        data: DataFrame with a 'Modified_Commentary' text column. The rows of
            this argument are used (the previous version ignored it and read
            the notebook-level ``data_text`` instead).
        fitted_tokenizer: Object exposing ``texts_to_sequences``. When omitted,
            the notebook-level ``tokenizer`` variable is used, which keeps
            existing ``input_sequences(data)`` calls working.

    Returns:
        Tuple ``(sequences, max_len)``: ``sequences`` is a list of token-id
        lists and ``max_len`` is the length of the longest one, or 0 when no
        sequence could be built.
    """
    active_tokenizer = tokenizer if fitted_tokenizer is None else fitted_tokenizer

    sequences = []
    # Iterate over the column values rather than indexing by label, so the
    # frame does not need a 0..n-1 index.
    for text in data['Modified_Commentary']:
        token_list = active_tokenizer.texts_to_sequences([text])[0]
        # Prefixes of length 2..len(token_list): the last token of each prefix
        # later becomes the prediction target.
        for end in range(2, len(token_list) + 1):
            sequences.append(token_list[:end])

    # default=0 avoids a ValueError when no text yields at least two tokens.
    max_len = max((len(seq) for seq in sequences), default=0)

    return sequences, max_len


In [7]:
### Calling all the functions

dropped_columns = ['Unnamed: 0']
# Drop only the columns that are still present, so re-running this cell after
# data_text has already been trimmed does not raise a KeyError.
columns_to_drop = [col for col in dropped_columns if col in data_text.columns]
data_text = data_processing(data_text, 1000, columns_to_drop)
tokenizer = tokenize(data_text)

# Bind the result to its own name: assigning it to `input_sequences` would
# replace the function and break this cell on a second run.
ngram_sequences, max_len = input_sequences(data_text)

# Left-pad to max_len + 1 columns; the last column is the target word and the
# remaining max_len columns are the model input.
padded_input_sequences = pad_sequences(ngram_sequences, maxlen=max_len + 1, padding='pre')
x = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]
# One class per tokenizer id plus one extra for id 0 (the padding value).
y = to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

# f-strings print the bare shape tuple; the old `{x.shape}` argument was a set
# literal and printed as `{(rows, cols)}`.
print(f'x shape : {x.shape}')
print(f'y shape : {y.shape}')


x shape :  {(45086, 153)}
y shape :  {(45086, 3583)}


In [8]:

# Largest integer id the tokenizer assigned to any word.
max_word_index = max(tokenizer.word_index.values())
print(f"Maximum word index: {max_word_index}")

# Length of the longest commentary, measured in characters (str.len),
# and the row label at which that length first occurs.
commentary_lengths = data_text['Modified_Commentary'].str.len()
max_length = commentary_lengths.max()
print(f"Maximum length of a string: {max_length}")

max_text_index = data_text.index[commentary_lengths == max_length][0]
print(f"Index of maximum length of a string: {max_text_index}")


Maximum word index: 3582
Maximum length of a string: 798
Index of maximum length of a string: 325


In [9]:
def lstm_model(vocab_size=None, sequence_length=None, embedding_dim=100, lstm_units=100):
    """Build and compile the two-layer LSTM next-word classifier.

    Args:
        vocab_size: Number of output classes / embedding rows. Defaults to
            ``len(tokenizer.word_index) + 1`` from the notebook-level
            tokenizer, so it always matches the one-hot targets instead of
            relying on a hard-coded 3583.
        sequence_length: Number of token ids per input row. Defaults to the
            notebook-level ``max_len``.
        embedding_dim: Size of each word embedding vector.
        lstm_units: Hidden units in each of the two LSTM layers.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy, Adam,
        accuracy metric).
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    if sequence_length is None:
        sequence_length = max_len

    model = Sequential()
    # Keras 3 (used by the pinned TensorFlow 2.16) deprecates Embedding's
    # `input_length` argument; an explicit Input layer declares the shape
    # instead and lets the model be built before fitting.
    model.add(tf.keras.Input(shape=(sequence_length,), dtype='int32'))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    # The first LSTM returns the full sequence so the second LSTM receives
    # one vector per time step.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [10]:
# Instantiate the compiled LSTM model with its default configuration.
model = lstm_model()



In [11]:
# Print the layer-by-layer overview of the model.
model.summary()

In [12]:
# Keep the returned History object so per-epoch loss/accuracy can be inspected
# later; assigning it also stops the cell from echoing the History repr.
history = model.fit(x, y, epochs=50)

Epoch 1/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 16ms/step - accuracy: 0.0674 - loss: 6.3617
Epoch 2/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 16ms/step - accuracy: 0.0919 - loss: 5.6823
Epoch 3/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 17ms/step - accuracy: 0.1337 - loss: 5.2318
Epoch 4/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - accuracy: 0.1669 - loss: 4.8916
Epoch 5/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 16ms/step - accuracy: 0.1943 - loss: 4.6226
Epoch 6/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - accuracy: 0.2144 - loss: 4.4041
Epoch 7/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - accuracy: 0.2257 - loss: 4.2211
Epoch 8/50
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 16ms/step - accuracy: 0.2347 - loss: 4.0685
Epoch 9/

<keras.src.callbacks.history.History at 0x7d6a01affa90>

In [13]:
# Last five rows of the processed frame. These rows were also used to build the
# training sequences, so metrics computed on them are training-set metrics.
eval_data = data_text.tail(5)

In [14]:
# Tokenize the evaluation rows and left-pad them to the width used in training.
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
eval_padded = pad_sequences(eval_sequences, maxlen=max_len + 1, padding='pre')

# The final token of every row is the label; everything before it is the input.
x_eval, y_eval = eval_padded[:, :-1], eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Loss and metric values on these rows.
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# results is [loss, accuracy]; pick out the accuracy entry.
accuracy = results[1]
print(f"Accuracy on the last 5 sentences: {accuracy}")

# Per-class probabilities for the last word of each row.
predictions = model.predict(x_eval)
print(f"Predictions on the last 5 sentences: {predictions}")

Evaluation results on the last 5 sentences: [0.9745728373527527, 0.800000011920929]
Accuracy on the last 5 sentences: 0.800000011920929
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
Predictions on the last 5 sentences: [[6.7445174e-11 2.8243122e-07 1.2627577e-05 ... 7.9858522e-26
  3.3178926e-26 2.4318808e-18]
 [2.2047382e-10 9.9783674e-02 6.8807247e-04 ... 1.4604362e-19
  2.0284194e-19 4.4226571e-19]
 [2.4552094e-12 6.0368648e-06 4.1524290e-06 ... 9.1795321e-30
  2.6861045e-27 4.7719846e-32]
 [3.9265768e-11 3.0926611e-07 8.2775739e-07 ... 5.7052353e-24
  1.4696759e-24 6.7838443e-18]
 [9.0894064e-10 4.5546920e-05 8.2390343e-06 ... 1.7422334e-22
  5.8705002e-28 4.7136469e-27]]


### With 65 percent accuracy

In [15]:
# Inputs are all columns but the last; the last column holds the target word id.
x_eval, y_eval = eval_padded[:, :-1], eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Loss and accuracy on the evaluation rows.
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# Most probable word id for each evaluation row.
predictions = model.predict(x_eval)
predicted_indices = predictions.argmax(axis=1)

# Function to convert sequences back to words
def sequences_to_texts(tokenizer, sequences):
    """Turn sequences of word ids back into space-joined strings.

    Args:
        tokenizer: Object with a ``word_index`` mapping of word -> integer id.
        sequences: Iterable of iterables of word ids.

    Returns:
        One string per input sequence; ids missing from the vocabulary are
        rendered as '?'.
    """
    index_to_word = {idx: token for token, idx in tokenizer.word_index.items()}
    return [
        ' '.join(index_to_word.get(idx, '?') for idx in sequence)
        for sequence in sequences
    ]

# Decode the full evaluation sentences and the single predicted id per sentence.
# x_eval excludes each sentence's final token, so the prediction is for the last
# word of the sentence printed as "Original".
original_texts = sequences_to_texts(tokenizer, eval_sequences)
predicted_texts = sequences_to_texts(tokenizer, predicted_indices.reshape(-1, 1))

# Show every sentence next to the model's prediction.
for sentence_no, (original, predicted) in enumerate(zip(original_texts, predicted_texts), start=1):
    print(f"Original sentence {sentence_no}: {original}")
    print(f"Predicted next word for sentence {sentence_no}: {predicted}")
    print()

Evaluation results on the last 5 sentences: [0.9745728373527527, 0.800000011920929]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Original sentence 1: out caught by uthappa umesh yadav strikes and removes the big fish he bowled much better compared to his first spell and earns himself a wicket it was a pacy test match line and length short of a good length delivery on fourth fifth stump line batsman gets a faint outside tickle looking to run it down towards third man settles into the safe mitts of uthappa batsman c uthappa b bowler 39 25 4s 7
Predicted next word for sentence 1: 7

Original sentence 2: four that's harsh on the bowler that didn't deserve to go to the boundary it was banged in short and bowler bent his back there iyer is surprised by the pace and bounce on that delivery he jumps and just sticks his bat out hoping that it doesn't hit him the ball flies past the keeper off his shoulder and dribbles away into the third man fence the umpire thinks it

In [16]:
# Re-tokenize the evaluation commentaries into lists of word ids.
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
# Number of words to generate for each prompt in the comparison below.
max_predict_len = 10

# Function to predict the next n words
def predict_next_n_words(model, tokenizer, seed_text, n_words, sequence_length=None):
    """Greedily generate up to ``n_words`` words that continue ``seed_text``.

    Args:
        model: Trained model whose ``predict`` returns one probability row per
            input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Prompt text; generated words are appended to a local copy.
        n_words: Maximum number of words to generate.
        sequence_length: Width the prompt is left-padded (or truncated) to.
            Defaults to the notebook-level ``max_len``, as before.

    Returns:
        List of generated words. It is shorter than ``n_words`` if the model's
        most probable class has no word attached, at which point generation
        stops.
    """
    if sequence_length is None:
        sequence_length = max_len

    result = []
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_list], maxlen=sequence_length, padding='pre')
        predicted_probs = model.predict(padded, verbose=0)
        predicted_index = int(np.argmax(predicted_probs, axis=1)[0])
        # Some class ids have no word (e.g. 0, the value pad_sequences pads
        # with); look the id up defensively instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            break
        result.append(predicted_word)
        seed_text += ' ' + predicted_word
    return result

# Compare the generated continuation with the real one for every sentence.
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}
for i, seq in enumerate(eval_sequences):
    # Slices clamp to the sequence length, so short sentences need no special case.
    first_10_words = seq[:10]
    actual_next_10_words = seq[10:20]

    first_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in first_10_words)
    actual_next_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in actual_next_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: out caught by uthappa umesh yadav strikes and removes the
Actual next 10 words: big fish he bowled much better compared to his first
Predicted next 10 words: big fish he is in some form alright comes up

Sentence 2:
First 10 words: four that's harsh on the bowler that didn't deserve to
Actual next 10 words: go to the boundary it was banged in short and
Predicted next 10 words: fetch it from batsman who waits with the ball flying

Sentence 3:
First 10 words: four welcome to delhi says iyer shortish and wide outside
Actual next 10 words: off iyer treks back and fiercely cuts it behind square
Predicted next 10 words: off batsman gets inside the line and helps it on

Sentence 4:
First 10 words: out caught by billings zak attack strikes trap set and
Actual next 10 words: trap sprung so batsman's promotion fails lasted for just two
Predicted next 10 words: trap sprung the finger bowler gets a pie out of

Sentence 5:
First 10 words: four that's a gift and batsman a

## With an accuracy of 90 percent

In [23]:
# Continue training the already-fitted model for 15 more epochs. The History
# object is kept so the extra epochs can be inspected, and assigning it stops
# the cell from echoing the History repr.
history_continued = model.fit(x, y, epochs=15)

Epoch 1/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 16ms/step - accuracy: 0.8590 - loss: 0.6160
Epoch 2/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - accuracy: 0.8628 - loss: 0.6055
Epoch 3/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 16ms/step - accuracy: 0.8657 - loss: 0.5946
Epoch 4/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - accuracy: 0.8654 - loss: 0.5887
Epoch 5/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 16ms/step - accuracy: 0.8686 - loss: 0.5806
Epoch 6/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 16ms/step - accuracy: 0.8734 - loss: 0.5673
Epoch 7/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 16ms/step - accuracy: 0.8728 - loss: 0.5694
Epoch 8/15
[1m1409/1409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - accuracy: 0.8746 - loss: 0.5541
Epoch 9/

<keras.src.callbacks.history.History at 0x7d6a025eacb0>

In [18]:
from tensorflow.keras.models import load_model, save_model

In [24]:
# Persist the trained model to a .keras file in the working directory.
model.save('model_lstm_5.keras')

In [25]:
# Tokenize the evaluation rows and left-pad them to the width used in training.
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
eval_padded = pad_sequences(eval_sequences, maxlen=max_len + 1, padding='pre')

# The final token of every row is the label; everything before it is the input.
x_eval, y_eval = eval_padded[:, :-1], eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Loss and metric values on these rows after the additional training.
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# results is [loss, accuracy]; pick out the accuracy entry.
accuracy = results[1]
print(f"Accuracy on the last 5 sentences: {accuracy}")

# Per-class probabilities for the last word of each row.
predictions = model.predict(x_eval)
print(f"Predictions on the last 5 sentences: {predictions}")

Evaluation results on the last 5 sentences: [0.178151935338974, 1.0]
Accuracy on the last 5 sentences: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Predictions on the last 5 sentences: [[2.6682059e-11 9.3183551e-11 8.3306261e-08 ... 3.5693134e-28
  1.8218195e-33 9.3182855e-22]
 [1.5750453e-11 9.6197281e-04 4.3774176e-07 ... 5.8988461e-26
  4.5774917e-27 3.0567537e-21]
 [1.7673569e-13 4.0504697e-09 8.6667762e-07 ... 1.9472560e-34
  8.1974051e-35 2.3082990e-34]
 [4.1753745e-12 1.5166934e-10 3.2718833e-10 ... 9.8802642e-28
  1.1610399e-34 2.7810996e-21]
 [1.9452910e-10 3.5226595e-08 8.7374674e-10 ... 1.6121795e-26
  0.0000000e+00 3.8950349e-31]]


In [26]:
# Re-tokenize the evaluation commentaries into lists of word ids.
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
# Number of words to generate for each prompt.
max_predict_len = 10

# predict_next_n_words is defined once, earlier in the notebook (cell In [16]).
# The byte-identical copy that was pasted into this cell has been removed so
# there is a single definition to maintain.

# Compare the generated continuation with the real one for every sentence.
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}
for i, seq in enumerate(eval_sequences):
    # Slices clamp to the sequence length, so short sentences need no special case.
    first_10_words = seq[:10]
    actual_next_10_words = seq[10:20]

    first_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in first_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)
    actual_next_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in actual_next_10_words)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: out caught by uthappa umesh yadav strikes and removes the
Actual next 10 words: big fish he bowled much better compared to his first
Predicted next 10 words: big fish he bowled much better compared to his first

Sentence 2:
First 10 words: four that's harsh on the bowler that didn't deserve to
Actual next 10 words: go to the boundary it was banged in short and
Predicted next 10 words: go to the boundary it was banged in short and

Sentence 3:
First 10 words: four welcome to delhi says iyer shortish and wide outside
Actual next 10 words: off iyer treks back and fiercely cuts it behind square
Predicted next 10 words: off iyer treks back and fiercely cuts it behind square

Sentence 4:
First 10 words: out caught by billings zak attack strikes trap set and
Actual next 10 words: trap sprung so batsman's promotion fails lasted for just two
Predicted next 10 words: trap sprung so batsman's promotion fails lasted for the bowling

Sentence 5:
First 10 words: four that

In [27]:
# Last ten rows of the processed frame. These rows were also used to build the
# training sequences, so results on them reflect training-set behaviour.
eval_data_1 = data_text.tail(10)

In [28]:
# Tokenize the ten evaluation commentaries into lists of word ids.
eval_sequences = tokenizer.texts_to_sequences(eval_data_1['Modified_Commentary'])
# Number of words to generate for each prompt.
max_predict_len = 10

# predict_next_n_words is defined once, earlier in the notebook (cell In [16]).
# The byte-identical copy that was pasted into this cell has been removed so
# there is a single definition to maintain.

# Compare the generated continuation with the real one for every sentence.
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}
for i, seq in enumerate(eval_sequences):
    # Slices clamp to the sequence length, so short sentences need no special case.
    first_10_words = seq[:10]
    actual_next_10_words = seq[10:20]

    first_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in first_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)
    actual_next_10_words_text = ' '.join(reverse_word_index.get(token, '?') for token in actual_next_10_words)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: four exquisite he's seeing the cricket ball like a football
Actual next 10 words: all timing he is not trying to hit the leather
Predicted next 10 words: all timing he was scarced for runs wanted to throw

Sentence 2:
First 10 words: four starts with a full toss on leg easy peasy
Actual next 10 words: for batsman he clips it in front of square on
Predicted next 10 words: for batsman he clips it in front of square on

Sentence 3:
First 10 words: 2 runs to long on 50 run stand in the
Actual next 10 words: fifth over a gentle nudge into the leg side followed
Predicted next 10 words: fifth full one in this man didn't deserve the pace

Sentence 4:
First 10 words: out caught by uthappa the umpire took ages to raise
Actual next 10 words: his finger perhaps it looked like he had a look
Predicted next 10 words: his finger perhaps it looked like he had to turn

Sentence 5:
First 10 words: four cheeky delicate and effective uses the slightest of width
Actual next 10 wo