In [None]:
import pandas as pd
import json
import tensorflow as tf
import torch
import numpy as np

json_file = "/kaggle/input/merged-data-with-hinglish/merged_data_with_hinglish.json"

# The file is read as JSON Lines: every non-empty line is parsed as one JSON
# document and becomes one row of train_df.
# - encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
#   does not depend on the platform's default locale.
# - whitespace-only lines are skipped; json.loads would raise on them.
with open(json_file, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

train_df = pd.DataFrame(data)

In [None]:
import numpy as np
import os

def load_video_embeddings(folder_path, df):
    """Load one ``.npy`` embedding per entry of ``df['video_id']``.

    A leading ``'vv'`` is stripped from each id before the file
    ``<folder_path>/<id>.npy`` is looked up.

    Args:
        folder_path: Directory that contains the ``.npy`` files.
        df: DataFrame with a ``video_id`` column of strings.

    Returns:
        A 3-tuple ``(embeddings_array, problematic_ids, problematic_indices)``:
        * ``embeddings_array`` - ``np.array`` built from the embeddings that
          loaded, in row order. Rows whose file is missing are skipped, so
          this array is shorter than ``df`` whenever anything is missing.
        * ``problematic_ids`` - the (prefix-stripped) ids whose file was not
          found.
        * ``problematic_indices`` - the positional row numbers of those ids.
    """
    loaded = []
    missing_ids = []
    missing_rows = []

    for row_pos, raw_id in enumerate(df['video_id']):
        # File names never carry the 'vv' prefix.
        stem = raw_id[2:] if raw_id.startswith('vv') else raw_id
        npy_path = os.path.join(folder_path, f"{stem}.npy")

        try:
            loaded.append(np.load(npy_path))
        except FileNotFoundError:
            # Record the failure and move on; only missing files are tolerated.
            missing_ids.append(stem)
            missing_rows.append(row_pos)

    return np.array(loaded), missing_ids, missing_rows

# Usage: load the video embedding of every train_df row; rows whose feature
# file is missing are reported in the two problematic_* lists instead.
folder_path = '/kaggle/input/video-features-new/video_features_new'
(
    video_array,
    problematic_ids_video,
    problematic_indices_video,
) = load_video_embeddings(folder_path=folder_path, df=train_df)

In [None]:
def load_audio_embeddings(folder_path, df):
    """Load one ``.npy`` audio embedding per entry of ``df['video_id']``.

    The lookup rule is the same as for the video features: strip a leading
    ``'vv'`` from the id and read ``<folder_path>/<id>.npy``. That logic lives
    in ``load_video_embeddings`` (which never inspects the file contents, only
    the id and the folder), so this function delegates to it instead of
    carrying a second copy of the loop.

    Args:
        folder_path: Directory that contains the audio ``.npy`` files.
        df: DataFrame with a ``video_id`` column of strings.

    Returns:
        ``(embeddings_array, problematic_ids, problematic_indices)`` exactly as
        returned by ``load_video_embeddings``: the array of embeddings that
        loaded (missing rows are skipped), the prefix-stripped ids whose file
        was not found, and the positional row numbers of those ids.
    """
    return load_video_embeddings(folder_path, df)

# Usage: load the audio embedding of every train_df row; rows whose feature
# file is missing are reported in the two problematic_* lists instead.
folder_path = '/kaggle/input/audio-features-new/Audio_features'
(
    audio_array,
    problematic_ids_audio,
    problematic_indices_audio,
) = load_audio_embeddings(folder_path=folder_path, df=train_df)

In [None]:
# A row is problematic when either modality failed to load, so take the set
# union of the video and audio failures (duplicates collapse automatically).
total_prob = list(set(problematic_ids_video) | set(problematic_ids_audio))
prob_indices = list(set(problematic_indices_video) | set(problematic_indices_audio))

In [None]:
# The loaders report failing ids with the 'vv' prefix already stripped, while
# train_df['video_id'] may hold either spelling, so both forms are listed.
# The list is derived from the loader results rather than pasted in by hand,
# so it stays correct when the feature folders or the dataset change.
total_prob = sorted({
    candidate
    for video_id in set(problematic_ids_video) | set(problematic_ids_audio)
    for candidate in (video_id, f"vv{video_id}")
})

# Function to drop rows
def drop_rows_by_video_id(df, total_prob):
    """Return ``df`` without the rows whose ``video_id`` is in ``total_prob``.

    Args:
        df: DataFrame with a ``video_id`` column.
        total_prob: Collection of ids to remove (compared by exact equality).

    Returns:
        The remaining rows, with their original index labels. ``df`` itself is
        not modified. The number of removed rows is printed as a diagnostic.
    """
    is_problematic = df['video_id'].isin(total_prob)
    n_dropped = is_problematic.sum()
    print(f"Number of rows to drop: {n_dropped}")  # Diagnostic print

    # Keep everything that was NOT flagged.
    return df.loc[~is_problematic]

# Example usage: remove every train_df row whose video_id is listed in total_prob
filtered_df = drop_rows_by_video_id(df=train_df, total_prob=total_prob)

In [None]:
# Pre-computed question embeddings, one .npy file per language.
# The shapes are printed so they can be compared by eye.
hinglish_array = np.load("/kaggle/input/hinglish-question-embeddings/hinglish_question_embeddings.npy")
print(np.shape(hinglish_array))

english_array = np.load("/kaggle/input/english-question-embeddings/english_question_embeddings.npy")
print(np.shape(english_array))

In [None]:
# Boolean keep-mask over the question rows: start with "keep everything",
# then switch off the positions listed in prob_indices.
mask = np.full(hinglish_array.shape[0], True)
mask[prob_indices] = False

# The same mask is applied to both languages
# (assumes both arrays share the same row order — TODO confirm).
filtered_hinglish_array = hinglish_array[mask]
filtered_english_array = english_array[mask]

# Report the shapes that remain after the problematic rows are removed
print("Filtered Hinglish array shape:", filtered_hinglish_array.shape)
print("Filtered English array shape:", filtered_english_array.shape)

In [None]:
# Reload both modalities from train_df. Each array contains only the rows whose
# file exists, so the matching problematic_indices_* list is needed to map its
# rows back to train_df positions.
folder_path = '/kaggle/input/video-features-new/video_features_new'
(
    video_array,
    problematic_ids_video,
    problematic_indices_video,
) = load_video_embeddings(folder_path=folder_path, df=train_df)

folder_path = '/kaggle/input/audio-features-new/Audio_features'
(
    audio_array,
    problematic_ids_audio,
    problematic_indices_audio,
) = load_audio_embeddings(folder_path=folder_path, df=train_df)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the answer column as integer class ids and store them in a new
# 'labels' column. `le` is kept so the ids can be mapped back later.
# NOTE(review): the column name 'anser' looks like a typo for 'answer' —
# confirm against the keys in the JSON file before changing it.
le = LabelEncoder()
train_df['labels'] = le.fit_transform(train_df.loc[:, 'anser'])

In [None]:
# Keep only the identifier columns and the encoded target
train_df = train_df.loc[:, ['video_id', 'question_id', 'type', 'labels']]

In [None]:
from sklearn.model_selection import train_test_split

# --- Align all modalities to the same rows -----------------------------------
# The embedding loaders skip rows whose .npy file is missing, so video_array and
# audio_array hold one row per *loaded* file, while hinglish_array and
# train_df['labels'] hold one row per train_df row. Splitting them as-is either
# fails (inconsistent sample counts) or pairs features from different rows.
# Rebuild "file present" masks from the reported missing positions and keep
# only the rows for which both video and audio loaded.
n_rows = len(train_df)

video_present = np.ones(n_rows, dtype=bool)
video_present[problematic_indices_video] = False
audio_present = np.ones(n_rows, dtype=bool)
audio_present[problematic_indices_audio] = False
keep_rows = video_present & audio_present

assert len(video_array) == video_present.sum(), "video_array is out of sync with problematic_indices_video"
assert len(audio_array) == audio_present.sum(), "audio_array is out of sync with problematic_indices_audio"
assert len(hinglish_array) == n_rows, "hinglish_array must have one row per train_df row"

# video_array/audio_array are already restricted to their own present rows, so
# index them with keep_rows restricted to those same rows.
video_aligned = video_array[keep_rows[video_present]]
audio_aligned = audio_array[keep_rows[audio_present]]
text_aligned = hinglish_array[keep_rows]
labels_aligned = train_df.loc[keep_rows, 'labels']

# One call splits every array with the same shuffled indices, so sample i of
# each X_train* and of y_train always comes from the same train_df row.
(
    X_trainv, X_testv,
    X_traina, X_testa,
    X_traint, X_testt,
    y_train, y_test,
) = train_test_split(
    video_aligned, audio_aligned, text_aligned, labels_aligned,
    test_size=0.2, random_state=42,
)

print("For Video:")
print("X_train shape:", X_trainv.shape)
print("X_test shape:", X_testv.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("For Audio:")
print("X_train shape:", X_traina.shape)
print("X_test shape:", X_testa.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# This report covers the Hinglish text split (it was previously labelled "For Audio:").
print("For Text:")
print("X_train shape:", X_traint.shape)
print("X_test shape:", X_testt.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import time

def compile_and_train_model(model, X_train, y_train, X_test, y_test, epochs=20, batch_size=16, learning_rate=1e-3):
    """Compile ``model``, train it, plot the learning curves and return the history.

    The model is compiled with Adam, sparse categorical cross-entropy and the
    ``accuracy`` metric, then fitted with ``(X_test, y_test)`` as validation
    data. Two callbacks watch ``val_accuracy``: ``ReduceLROnPlateau`` (factor
    0.2, patience 1) and ``EarlyStopping`` (patience 70, restoring the best
    weights).

    Args:
        model: Keras model to compile and fit (modified in place).
        X_train: Training inputs passed to ``model.fit``.
        y_train: Integer class labels for ``X_train``.
        X_test: Validation inputs.
        y_test: Integer class labels for ``X_test``.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.
        learning_rate: Initial Adam learning rate (default keeps the previous
            hard-coded value of 1e-3).

    Returns:
        The object returned by ``model.fit`` so callers can inspect or re-plot
        the metrics. Previously nothing was returned and the history was lost.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Sparse loss: labels are integer ids, not one-hot vectors.
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])

    # Shrink the learning rate as soon as validation accuracy stops improving.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                                     factor=0.2,
                                                     patience=1,
                                                     verbose=1,
                                                     min_delta=1e-4,
                                                     min_lr=1e-15,
                                                     mode='max')
    # NOTE: with the default epochs=20 a patience of 70 cannot be exhausted, so
    # early stopping only takes effect when the caller asks for more epochs.
    earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                     min_delta=1e-4,
                                                     patience=70,
                                                     mode='max',
                                                     restore_best_weights=True,
                                                     verbose=1)

    callbacks = [earlystopping, reduce_lr]

    # Train the model and report the wall-clock duration.
    start_time = time.time()
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=callbacks)

    print("--- %s seconds ---" % (time.time() - start_time))

    # Plot training history
    plot_history(history)

    return history

def plot_history(history):
    """Show the accuracy curves, then the loss curves, of a training run.

    Args:
        history: Object whose ``history`` attribute is a dict containing the
            per-epoch series ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss``.

    Each chart draws the training series first and the validation series
    second, and is displayed with ``plt.show()`` before the next one starts.
    """
    # (training key, validation key, chart title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    for train_key, val_key, chart_title, y_label in panels:
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(chart_title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()



In [None]:
def _embedding_cnn_branch(embedding_dim):
    """Build one Input -> Conv1D(32, 3) -> MaxPooling1D -> Flatten branch.

    Returns:
        ``(branch_input, features)``: the Keras input tensor of shape
        ``[embedding_dim, 1]`` and the flattened feature tensor.
    """
    branch_input = tf.keras.Input(shape=[embedding_dim, 1])
    features = tf.keras.layers.Conv1D(32, 3, activation='relu', padding='same')(branch_input)
    features = tf.keras.layers.MaxPooling1D()(features)
    features = tf.keras.layers.Flatten()(features)
    return branch_input, features


def create_model_audio_text_then_video(num_classes=2, embedding_dim=768):
    """Build the three-input fusion classifier (audio + text first, then video).

    Every modality goes through an identical small 1-D CNN branch. The audio
    and text features are concatenated first, the video features are appended
    to that, and the result goes through a dense head ending in a softmax.

    Args:
        num_classes: Number of units of the softmax output layer. The default
            of 2 reproduces the original hard-coded head; pass
            ``len(le.classes_)`` when the encoded labels have more classes.
        embedding_dim: Length of each input embedding; every input has shape
            ``[embedding_dim, 1]``. The default of 768 reproduces the original
            model (assumes each modality's vectors are 768 long — TODO confirm).

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are, in order,
        ``[audio, text, video]``.
    """
    # Branches are created in the original order (audio, text, fuse, video).
    input_audio, audio_cnn = _embedding_cnn_branch(embedding_dim)
    input_text, text_cnn = _embedding_cnn_branch(embedding_dim)

    concatenated_audio_text = tf.keras.layers.Concatenate(axis=1)([audio_cnn, text_cnn])

    input_video, video_cnn = _embedding_cnn_branch(embedding_dim)

    concatenated_all = tf.keras.layers.Concatenate(axis=1)([concatenated_audio_text, video_cnn])

    # Dense classification head
    x = tf.keras.layers.Dense(200, activation='relu')(concatenated_all)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(90, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(56, activation='relu')(x)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_audio, input_text, input_video], outputs=output)
    return model


In [None]:
# Build the fusion model and train it. The input lists follow the model's input
# order (audio, text, video); the test split doubles as the validation set.
model_audio_text_then_video = create_model_audio_text_then_video()
compile_and_train_model(
    model=model_audio_text_then_video,
    X_train=[X_traina, X_traint, X_trainv],
    y_train=y_train,
    X_test=[X_testa, X_testt, X_testv],
    y_test=y_test,
)