In [5]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [6]:
data_text  = pd.read_csv('/content/df_commentary_new.csv')

In [7]:
data_text.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
data_text = data_text.head(1000)

In [9]:
data_text.shape

(1000, 1)

In [10]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(data_text['Modified_Commentary'])


In [11]:
len(tokenizer.word_index)

3582

In [12]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'it': 3,
 'a': 4,
 'to': 5,
 'batsman': 6,
 'on': 7,
 'of': 8,
 'off': 9,
 'in': 10,
 'four': 11,
 'his': 12,
 'bowler': 13,
 'ball': 14,
 'for': 15,
 'he': 16,
 'over': 17,
 'was': 18,
 'out': 19,
 'is': 20,
 'that': 21,
 'from': 22,
 'this': 23,
 'with': 24,
 'up': 25,
 'back': 26,
 'at': 27,
 'short': 28,
 'leg': 29,
 'length': 30,
 'six': 31,
 'down': 32,
 'away': 33,
 'but': 34,
 'by': 35,
 'long': 36,
 'b': 37,
 'mid': 38,
 'has': 39,
 'into': 40,
 'wicket': 41,
 'delivery': 42,
 'gets': 43,
 'outside': 44,
 'full': 45,
 'as': 46,
 'just': 47,
 'one': 48,
 '1': 49,
 'run': 50,
 'shot': 51,
 'caught': 52,
 'bat': 53,
 '4s': 54,
 'him': 55,
 'through': 56,
 'team': 57,
 'no': 58,
 'wide': 59,
 'man': 60,
 'c': 61,
 'cover': 62,
 'square': 63,
 'edge': 64,
 'all': 65,
 'deep': 66,
 'right': 67,
 'fence': 68,
 'not': 69,
 'point': 70,
 'get': 71,
 'side': 72,
 'way': 73,
 'have': 74,
 'good': 75,
 'goes': 76,
 'fine': 77,
 'well': 78,
 'had': 79,
 "that's": 80,

In [13]:
data_text['Modified_Commentary']

0      FOUR, first boundary for batsman and team. Ful...
1      FOUR, back-to-back boundaries to end the first...
2      FOUR, hit straight back at bowler and he was l...
3      FOUR, another full toss, it's Jadhav this time...
4      FOUR, four more. Jadhav starting to really fin...
                             ...                        
995    out Caught by Uthappa!! Umesh Yadav strikes an...
996    FOUR, that's harsh on the bowler. That didn't ...
997    FOUR, welcome to Delhi says Iyer. Shortish and...
998    out Caught by Billings!! Zak attack strikes. T...
999    FOUR, that's a gift and batsman accepts it gle...
Name: Modified_Commentary, Length: 1000, dtype: object

In [14]:
max_length = data_text['Modified_Commentary'].str.len().max()
max_length

798

In [15]:
max_text_index = data_text[data_text['Modified_Commentary'].str.len() == max_length].index[0]
max_text_index


325

In [16]:
for i in range(len(data_text['Modified_Commentary'])):
    text = data_text['Modified_Commentary'][i]



In [17]:
input_sequences = []
for i in range(len(data_text['Modified_Commentary'])):
    text = data_text['Modified_Commentary'][i]
    token_list = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [18]:
input_sequences

[[11, 99],
 [11, 99, 94],
 [11, 99, 94, 15],
 [11, 99, 94, 15, 6],
 [11, 99, 94, 15, 6, 2],
 [11, 99, 94, 15, 6, 2, 57],
 [11, 99, 94, 15, 6, 2, 57, 45],
 [11, 99, 94, 15, 6, 2, 57, 45, 2],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119, 270],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119, 270, 33],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119, 270, 33, 2],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119, 270, 33, 2, 6],
 [11, 99, 94, 15, 6, 2, 57, 45, 2, 7, 1, 141, 322, 5, 119, 270, 33, 2, 6, 194],
 [11,
  99,
  94,
  15,
  6,
  2,
  57,
  45,
  2,
  7,
  1,
  141,
  322,
  5,
  119,
  270,
  33,
  2,
  6,
  194,
  47],
 [11,
  99,
  94,
  15,
  6,
 

In [19]:
max_len = max(len(x) for x in input_sequences)


In [20]:
max_len

153

In [21]:
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len + 1, padding='pre')

In [22]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,   11,   99],
       [   0,    0,    0, ...,   11,   99,   94],
       [   0,    0,    0, ...,   99,   94,   15],
       ...,
       [   0,    0,    0, ..., 1472,   33,    5],
       [   0,    0,    0, ...,   33,    5,    1],
       [   0,    0,    0, ...,    5,    1,   68]], dtype=int32)

In [23]:
x = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]

In [24]:
x

array([[   0,    0,    0, ...,    0,    0,   11],
       [   0,    0,    0, ...,    0,   11,   99],
       [   0,    0,    0, ...,   11,   99,   94],
       ...,
       [   0,    0,    0, ...,    7, 1472,   33],
       [   0,    0,    0, ..., 1472,   33,    5],
       [   0,    0,    0, ...,   33,    5,    1]], dtype=int32)

In [25]:
x.shape

(45086, 153)

In [26]:
y.shape

(45086,)

In [27]:
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)
y.shape

(45086, 3583)

In [28]:
max_len

153

In [29]:
len(tokenizer.word_index)+1

3583

In [30]:
model = Sequential()
model.add(Embedding(input_dim=3583, output_dim= 100, input_length=max_len))
model.add(LSTM(100, return_sequences=True)) # Pass return_sequences to LSTM
model.add(LSTM(100))
model.add(Dense(3583, activation='softmax'))

In [31]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 153, 100)          358300    
                                                                 
 lstm (LSTM)                 (None, 153, 100)          80400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3583)              361883    
                                                                 
Total params: 880983 (3.36 MB)
Trainable params: 880983 (3.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [47]:
model.fit(x,y,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x799f67f9fdf0>

In [39]:
import numpy as np


In [35]:
eval_data = data_text.iloc[-5:]

In [37]:
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
eval_padded = pad_sequences(eval_sequences, maxlen=max_len + 1, padding='pre')

x_eval = eval_padded[:, :-1]
y_eval = eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Evaluate the model
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# Calculate accuracy on the last 5 sentences
accuracy = results[1]  # The second element in results corresponds to accuracy
print(f"Accuracy on the last 5 sentences: {accuracy}")

# Make predictions on the last 5 sentences
predictions = model.predict(x_eval)
print(f"Predictions on the last 5 sentences: {predictions}")

Evaluation results on the last 5 sentences: [1.7894920110702515, 0.800000011920929]
Accuracy on the last 5 sentences: 0.800000011920929
Predictions on the last 5 sentences: [[3.44079820e-10 1.62860047e-07 1.62230339e-04 ... 1.84178484e-19
  1.89934687e-23 1.51066620e-18]
 [1.09123731e-10 9.86378118e-02 1.18839294e-02 ... 9.03979142e-15
  1.08133854e-16 7.54801126e-24]
 [1.61696299e-11 3.31455492e-03 7.95396772e-06 ... 4.58756362e-29
  5.84464416e-28 1.44562878e-30]
 [9.79486825e-10 7.08068819e-06 6.23552070e-04 ... 1.25802220e-16
  1.04563302e-18 5.93197908e-13]
 [3.29034924e-11 2.60880665e-06 2.28099510e-04 ... 6.29555376e-25
  2.60511155e-29 2.71005054e-25]]


### with 60 percent accuracy

In [42]:
# Separate into x_eval and y_eval
x_eval = eval_padded[:, :-1]
y_eval = eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Evaluate the model
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# Make predictions on the last 5 sentences
predictions = model.predict(x_eval)
predicted_indices = np.argmax(predictions, axis=1)

# Function to convert sequences back to words
def sequences_to_texts(tokenizer, sequences):
    reverse_word_index = dict((i, word) for word, i in tokenizer.word_index.items())
    texts = []
    for sequence in sequences:
        words = [reverse_word_index.get(i, '?') for i in sequence]
        texts.append(' '.join(words))
    return texts

# Convert original and predicted sequences to texts
original_texts = sequences_to_texts(tokenizer, eval_sequences)
predicted_texts = sequences_to_texts(tokenizer, predicted_indices.reshape(-1, 1))

# Print each of the words from the original and predicted sequences
for i in range(len(original_texts)):
    print(f"Original sentence {i+1}: {original_texts[i]}")
    print(f"Predicted next word for sentence {i+1}: {predicted_texts[i]}")
    print()

Evaluation results on the last 5 sentences: [1.7894920110702515, 0.800000011920929]
Original sentence 1: out caught by uthappa umesh yadav strikes and removes the big fish he bowled much better compared to his first spell and earns himself a wicket it was a pacy test match line and length short of a good length delivery on fourth fifth stump line batsman gets a faint outside tickle looking to run it down towards third man settles into the safe mitts of uthappa batsman c uthappa b bowler 39 25 4s 7
Predicted next word for sentence 1: 7

Original sentence 2: four that's harsh on the bowler that didn't deserve to go to the boundary it was banged in short and bowler bent his back there iyer is surprised by the pace and bounce on that delivery he jumps and just sticks his bat out hoping that it doesn't hit him the ball flies past the keeper off his shoulder and dribbles away into the third man fence the umpire thinks it came off the bat and gives four runs
Predicted next word for sentence 2

In [43]:
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
max_predict_len = 10

# Function to predict the next n words
def predict_next_n_words(model, tokenizer, seed_text, n_words):
    result = []
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=1)[0]
        predicted_word = tokenizer.index_word[predicted_index]
        result.append(predicted_word)
        seed_text += ' ' + predicted_word
    return result

# Evaluate the model
reverse_word_index = dict((i, word) for word, i in tokenizer.word_index.items())
for i, seq in enumerate(eval_sequences):
    if len(seq) > 10:
        first_10_words = seq[:10]
    else:
        first_10_words = seq

    first_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in first_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)
    actual_next_10_words = seq[10:20] if len(seq) > 20 else seq[10:]
    actual_next_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in actual_next_10_words)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: out caught by uthappa umesh yadav strikes and removes the
Actual next 10 words: big fish he bowled much better compared to his first
Predicted next 10 words: big fish he walks down intercepts the length and bowls

Sentence 2:
First 10 words: four that's harsh on the bowler that didn't deserve to
Actual next 10 words: go to the boundary it was banged in short and
Predicted next 10 words: wait for batsman he enticingly tosses this short ball on

Sentence 3:
First 10 words: four welcome to delhi says iyer shortish and wide outside
Actual next 10 words: off iyer treks back and fiercely cuts it behind square
Predicted next 10 words: off batsman throws his arms to clobber it fine the

Sentence 4:
First 10 words: out caught by billings zak attack strikes trap set and
Actual next 10 words: trap sprung so batsman's promotion fails lasted for just two
Predicted next 10 words: trap sprung so batsman's promotion fails lasted for any a

Sentence 5:
First 10 words: four t

## with the accuracy of 90 percent

In [48]:
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
eval_padded = pad_sequences(eval_sequences, maxlen=max_len + 1, padding='pre')

x_eval = eval_padded[:, :-1]
y_eval = eval_padded[:, -1]
y_eval = to_categorical(y_eval, num_classes=len(tokenizer.word_index)+1)

# Evaluate the model
results = model.evaluate(x_eval, y_eval, verbose=0)
print(f"Evaluation results on the last 5 sentences: {results}")

# Calculate accuracy on the last 5 sentences
accuracy = results[1]  # The second element in results corresponds to accuracy
print(f"Accuracy on the last 5 sentences: {accuracy}")

# Make predictions on the last 5 sentences
predictions = model.predict(x_eval)
print(f"Predictions on the last 5 sentences: {predictions}")

Evaluation results on the last 5 sentences: [0.5841593742370605, 0.6000000238418579]
Accuracy on the last 5 sentences: 0.6000000238418579
Predictions on the last 5 sentences: [[1.27901978e-09 1.63285485e-09 4.04725615e-05 ... 5.80933427e-21
  7.31618691e-32 1.83667369e-24]
 [1.07257024e-11 3.64310920e-01 3.51343118e-03 ... 1.75843360e-19
  1.73291322e-22 2.42096062e-25]
 [4.95682141e-12 3.58189136e-04 4.96491202e-06 ... 7.31024279e-34
  1.07494313e-29 2.70576480e-25]
 [2.65667922e-11 3.96799882e-09 9.38424182e-06 ... 3.38177567e-21
  1.87199217e-24 9.27186955e-19]
 [1.20213587e-11 5.20853048e-11 6.47364368e-06 ... 1.23431556e-26
  5.44430417e-36 5.33603013e-28]]


In [49]:
eval_sequences = tokenizer.texts_to_sequences(eval_data['Modified_Commentary'])
max_predict_len = 10

# Function to predict the next n words
def predict_next_n_words(model, tokenizer, seed_text, n_words):
    result = []
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=1)[0]
        predicted_word = tokenizer.index_word[predicted_index]
        result.append(predicted_word)
        seed_text += ' ' + predicted_word
    return result

# Evaluate the model
reverse_word_index = dict((i, word) for word, i in tokenizer.word_index.items())
for i, seq in enumerate(eval_sequences):
    if len(seq) > 10:
        first_10_words = seq[:10]
    else:
        first_10_words = seq

    first_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in first_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)
    actual_next_10_words = seq[10:20] if len(seq) > 20 else seq[10:]
    actual_next_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in actual_next_10_words)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: out caught by uthappa umesh yadav strikes and removes the
Actual next 10 words: big fish he bowled much better compared to his first
Predicted next 10 words: big fish he bowled much better compared to his first

Sentence 2:
First 10 words: four that's harsh on the bowler that didn't deserve to
Actual next 10 words: go to the boundary it was banged in short and
Predicted next 10 words: go to the boundary it was banged in short and

Sentence 3:
First 10 words: four welcome to delhi says iyer shortish and wide outside
Actual next 10 words: off iyer treks back and fiercely cuts it behind square
Predicted next 10 words: off iyer treks back and fiercely cuts it behind square

Sentence 4:
First 10 words: out caught by billings zak attack strikes trap set and
Actual next 10 words: trap sprung so batsman's promotion fails lasted for just two
Predicted next 10 words: trap sprung so batsman's promotion fails lasted for just two

Sentence 5:
First 10 words: four that's 

In [50]:
eval_data_1 = data_text.iloc[-10:]

In [51]:
eval_sequences = tokenizer.texts_to_sequences(eval_data_1['Modified_Commentary'])
max_predict_len = 10

# Function to predict the next n words
def predict_next_n_words(model, tokenizer, seed_text, n_words):
    result = []
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs, axis=1)[0]
        predicted_word = tokenizer.index_word[predicted_index]
        result.append(predicted_word)
        seed_text += ' ' + predicted_word
    return result

# Evaluate the model
reverse_word_index = dict((i, word) for word, i in tokenizer.word_index.items())
for i, seq in enumerate(eval_sequences):
    if len(seq) > 10:
        first_10_words = seq[:10]
    else:
        first_10_words = seq

    first_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in first_10_words)
    predicted_words = predict_next_n_words(model, tokenizer, first_10_words_text, max_predict_len)
    actual_next_10_words = seq[10:20] if len(seq) > 20 else seq[10:]
    actual_next_10_words_text = ' '.join(reverse_word_index.get(word, '?') for word in actual_next_10_words)

    print(f"Sentence {i+1}:")
    print(f"First 10 words: {first_10_words_text}")
    print(f"Actual next 10 words: {actual_next_10_words_text}")
    print(f"Predicted next 10 words: {' '.join(predicted_words)}")
    print()








Sentence 1:
First 10 words: four exquisite he's seeing the cricket ball like a football
Actual next 10 words: all timing he is not trying to hit the leather
Predicted next 10 words: all timing with a open length delivery it was a

Sentence 2:
First 10 words: four starts with a full toss on leg easy peasy
Actual next 10 words: for batsman he clips it in front of square on
Predicted next 10 words: for batsman he clips it in front of square on

Sentence 3:
First 10 words: 2 runs to long on 50 run stand in the
Actual next 10 words: fifth over a gentle nudge into the leg side followed
Predicted next 10 words: fifth over a gentle nudge into the leg side followed

Sentence 4:
First 10 words: out caught by uthappa the umpire took ages to raise
Actual next 10 words: his finger perhaps it looked like he had a look
Predicted next 10 words: his finger perhaps it looked like he had no one

Sentence 5:
First 10 words: four cheeky delicate and effective uses the slightest of width
Actual next 10 word

In [None]:
import time
text = "batsman"

for i in range(20):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=130, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)