In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('filtered_videos.csv')

In [3]:
df.head()

Unnamed: 0,video_id,video_path,description
0,21179416,videos_1000\21179416.mp4,Aerial shot winter forest
1,5629184,videos_1000\5629184.mp4,Senior couple looking through binoculars on sa...
2,1063125190,videos_1000\1063125190.mp4,A beautiful cookie with oranges lies on a gree...
3,1039695998,videos_1000\1039695998.mp4,Japanese highrise office skyscrapers tokyo square
4,9607838,videos_1000\9607838.mp4,"Zrenjanin,serbia march 21 2015: fans watching ..."


In [4]:
df.count()

video_id       999
video_path     999
description    999
dtype: int64

In [5]:
# Load video features
video_features = np.load('video_features_pytorch_new.npy', allow_pickle=True).item()

In [6]:
# Extract descriptions
descriptions = df['description'].values

In [7]:
descriptions[1]

'Senior couple looking through binoculars on sailboat together. shot on red epic for high quality 4k, uhd, ultra hd resolution.'

In [8]:
# Tokenize the descriptions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)
vocab_size = len(tokenizer.word_index) + 1

In [9]:
# Convert descriptions to sequences
sequences = tokenizer.texts_to_sequences(descriptions)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [10]:
# Prepare the video features
video_ids = df['video_id'].values
X_video = np.array([video_features[str(video_id)] for video_id in video_ids])

In [11]:
X_video = X_video.squeeze() 

In [12]:
# Prepare the target sequences
y = np.expand_dims(padded_sequences, -1)

Define the Model Architecture

In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, RepeatVector, TimeDistributed, Attention

In [14]:
# Define the encoder
video_input = Input(shape=(2048,))
encoder = Dense(256, activation='relu')(video_input)
encoder = RepeatVector(max_length)(encoder)

In [15]:
# Define the decoder
decoder_input = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size, 256, mask_zero=True)(decoder_input)
decoder_lstm = LSTM(256, return_sequences=True)(decoder_embedding)

In [16]:
# Attention mechanism
attention = Attention()([encoder, decoder_lstm])
decoder_combined_context = tf.concat([decoder_lstm, attention], axis=-1)

In [17]:
# TimeDistributed layer for word prediction
decoder_output = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_combined_context)


In [18]:
# Define the model
model = Model(inputs=[video_input, decoder_input], outputs=decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [19]:
# Print the model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 37)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 37, 256)      1001216     input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          524544      input_1[0][0]                    
______________________________________________________________________________________________

Train the Model

In [20]:
# Train the model
model.fit([X_video, padded_sequences], y, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x281bdc3ae80>

In [21]:
def generate_description(model, video_feature, tokenizer, max_length):
    input_seq = np.zeros((1, max_length))
    generated_desc = []

    for i in range(max_length):
        output = model.predict([video_feature, input_seq])
        predicted_word_index = np.argmax(output[0, i, :])
        predicted_word = tokenizer.index_word.get(predicted_word_index, '')

        if predicted_word == '':
            break

        generated_desc.append(predicted_word)
        input_seq[0, i] = predicted_word_index

    return ' '.join(generated_desc)

# Example: Generate description for an actual video feature
example_video_id = '5629184'  # Replace with any video ID from the dataset
example_video_feature = np.expand_dims(video_features[example_video_id].squeeze(), axis=0)
description1 = generate_description(model, example_video_feature, tokenizer, max_length)
if description1:
    print(description1)
else:
    print("No description generated.")


No description generated.
