In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition files through kagglehub and keep the value it
# returns in `pes_ml_hack_link1_path`.
# NOTE(review): the cells below read from hard-coded '/kaggle/input/...' paths
# and never use this variable — confirm the two locations agree when running
# outside Kaggle.
pes_ml_hack_link1_path = kagglehub.competition_download('PES-ml-hack-link1')

print('Data source import complete.')


In [None]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import random

import os

In [None]:
# Directory holding the training clips; read by get_video_clip_path() below.
video_dir = '/kaggle/input/PES-ml-hack-link1/train_videos'

# Training metadata (the CSV is read as cp1252).
train_df = pd.read_csv('/kaggle/input/PES-ml-hack-link1/train.csv', encoding='cp1252')


def get_video_clip_path(row):
    """Return the clip path for one metadata row.

    The file name is built from the row's ``Dialogue_ID`` and ``Utterance_ID``
    and joined onto the module-level ``video_dir``.
    """
    dialogue_id, utterance_id = row['Dialogue_ID'], row['Utterance_ID']
    return os.path.join(video_dir, f"dia{dialogue_id}_utt{utterance_id}.mp4")


# Attach the clip path to every row of the metadata.
train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(
    train_df[
        ['Dialogue_ID', 'Utterance_ID', 'video_clip_path']
    ].head()
)

In [None]:
# Display the (rows, columns) shape of the training metadata as the cell output.
train_df.shape

# **PREPROCESSING STAGE**

In [None]:
# Report whether any column of the training frame holds a missing value.
print(
    "Dataset contains missing values!"
    if train_df.isnull().sum().any()
    else "No missing values found."
)

# Count the number of records per emotion class
class_counts = train_df['Emotion'].value_counts()
print("Class distribution:", class_counts, sep="\n")

Checked for null values: Ensured that there were **no missing values** in the dataset.



Since there are no null values, proceeded to check if the dataset is **biased**.



The distribution of records across different emotion classes in the dataset is calculated and we can infer that there is **class imbalance** because out of the 5 classes, around **half** the dataset values belong to **neutral class**.


In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

#  EMOTION ANALYSIS FROM TEXT

In [None]:
import pandas as pd

import numpy as np

import re

import nltk

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import layers, models






# Per-class record counts (pandas orders value_counts() by descending frequency).
class_counts = train_df['Emotion'].value_counts()
print("Original class distribution:", class_counts, sep="\n")

In [None]:
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalise one utterance for the text model.

    Lower-cases the text, strips every character that is not a letter or
    whitespace, then keeps the spaCy lemma of each remaining token that is
    not a stop word, punctuation or pure whitespace.

    Args:
        text: Raw utterance. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as an
            empty utterance instead of raising.

    Returns:
        str: The kept lemmas joined by single spaces (may be empty).
    """
    if not isinstance(text, str):
        return ''

    # Lowercase the text
    text = text.lower()

    # Remove non-alphabet characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, remove stopwords, and lemmatize using spaCy. Stripping
    # characters can leave runs of spaces, which spaCy returns as whitespace
    # tokens; skip them so they do not end up in the cleaned text.
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.text not in STOP_WORDS and not token.is_punct and not token.is_space
    ]

    return ' '.join(tokens)

# Clean every utterance and store the result next to the raw text.
train_df['Processed_Utterance'] = train_df['Utterance'].map(preprocess_text)

# Model input (cleaned text) and target (emotion name).
X, y = train_df['Processed_Utterance'], train_df['Emotion']


The `preprocess_text` function converts text to lowercase, removes non-alphabetic characters, eliminates stopwords, and applies lemmatization (converting each word to its base or root form), returning cleaned text for further analysis. The new column, `Processed_Utterance`, will be used as the model’s input feature instead of the raw text.



X contains preprocessed utterances, and y contains the target emotion class for each instance.

In [None]:
# Integer-encode the emotion names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize the processed text (Tokenizer built with num_words=10000) and
# pad the sequences with maxlen=100.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_tokenized, maxlen=100)

# Stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)


* The code tokenizes the processed text into sequences and pads them to a fixed length of 100. This prepares X for input into a neural network by converting it to a consistent numerical format.



*   Labels (y) are encoded into numerical values for model compatibility, allowing the model to learn and classify emotions based on the input features.

* The data is split into training and validation sets with an 80-20 ratio. Stratified splitting ensures an even class distribution across both sets, helping the model generalize well on unseen data.

In [None]:
def create_model():
    """Build and compile the LSTM emotion classifier for padded token ids.

    The softmax output has one unit per class known to the module-level
    ``label_encoder``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    n_classes = len(label_encoder.classes_)
    network = models.Sequential([
        layers.Embedding(input_dim=10000, output_dim=128, input_length=100),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network



model = create_model()

# Keras looks class weights up by the integer label found in y_train, i.e. by
# LabelEncoder index. `class_counts` comes from value_counts() and is ordered
# by frequency, so enumerating it would attach each weight to the wrong class.
# Index it by class name instead.
class_weights = {
    class_index: 1.0 / class_counts[class_name]
    for class_index, class_name in enumerate(label_encoder.classes_)
}
print("Class Weights:", class_weights)

# Train for 10 epochs with the inverse-frequency weights, validating on the
# held-out split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    class_weight=class_weights,
)

val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

We have built an lstm model due to the following reasons:

1. **Sequential Data**: LSTMs capture word order and context, which is crucial for text, whereas SVMs treat features independently.



2. **Feature Learning**: LSTMs learn relevant features automatically, while SVMs require manual feature engineering.



3. **Dimensionality**: LSTMs handle high-dimensional data (large vocabularies) better, while SVMs may struggle without special tuning.



4. **Probabilities**: LSTMs provide class probabilities, simplifying class balancing, while SVMs don’t naturally offer probabilities.





Model specifications:

- Embedding Layer: Converts each word in the sequence to a 128-dimensional vector, capturing word relationships and semantic meaning.



- LSTM Layers: Two LSTM layers (64 and 32 units) capture temporal dependencies and sequential patterns in the text.



- Dense Layers: A hidden dense layer (64 units, ReLU activation) helps learn non-linear relationships, and the output layer (softmax activation) predicts probabilities for each emotion class.



- Compilation: The model uses the Adam optimizer, sparse_categorical_crossentropy loss for multi-class classification, and accuracy as the performance metric.



**Class Weights** adjusts the model to give more importance to minority classes by inversely scaling the weights based on class frequencies. This helps address **class imbalance**, improving model fairness across all emotions.



The model trains on X_train and y_train for 10 epochs with a batch size of 32, using X_val and y_val for validation. Class weights are applied to improve model balance across classes.



After training, the model is evaluated on validation data, providing val_loss and val_accuracy as metrics to assess model performance and generalization.

# **EMOTION ANALYSIS FROM VIDEO**

In [None]:
import os

import cv2

import numpy as np

import pandas as pd

from tensorflow.keras import layers, models

from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split



# Step 1: Load and preprocess the train.csv

def preprocess_data(csv_path='/kaggle/input/PES-ml-hack-link1/train.csv'):
    """Load the training metadata and attach an integer emotion label.

    Args:
        csv_path: Path of the cp1252-encoded metadata CSV. It must contain
            the columns ``Utterance_ID``, ``Dialogue_ID`` and ``Emotion``.

    Returns:
        pandas.DataFrame: Those three columns plus ``Emotion_Label``
        (anger=0, joy=1, neutral=2, sadness=3, surprise=4).

    Raises:
        ValueError: If ``Emotion`` holds a value outside the five known
            classes (it would otherwise become a NaN label).
    """
    # Map emotions to labels
    emotion_map = {'anger': 0, 'joy': 1, 'neutral': 2, 'sadness': 3, 'surprise': 4}

    # Load the train.csv file
    train_data = pd.read_csv(csv_path, encoding='cp1252')

    # Keep only the relevant columns. The explicit copy means the column
    # added below is written to an independent frame, not to a slice.
    train_data = train_data[['Utterance_ID', 'Dialogue_ID', 'Emotion']].copy()
    train_data['Emotion_Label'] = train_data['Emotion'].map(emotion_map)

    unmapped = train_data.loc[train_data['Emotion_Label'].isna(), 'Emotion'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unrecognised emotion value(s) in {csv_path}: {sorted(map(str, unmapped))}"
        )
    train_data['Emotion_Label'] = train_data['Emotion_Label'].astype(int)

    return train_data


In [None]:


# Step 2: Create CNN Model for feature extraction

def create_cnn_model(input_shape=(64, 64, 3)):
    """Build the uncompiled CNN that maps one frame to a 128-value vector.

    Three Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages with 32, 64 and 128
    filters are followed by Flatten, Dense(128, ReLU) and Dropout(0.5).
    There is no classification head.

    Args:
        input_shape: Shape of a single input frame.

    Returns:
        The ``Sequential`` feature extractor.
    """
    # First convolutional block carries the input shape.
    stack = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
    ]

    # Second and third convolutional blocks.
    for n_filters in (64, 128):
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
        stack.append(layers.MaxPooling2D((2, 2)))

    # Flatten the feature maps, then the fully connected feature layer.
    stack.extend([
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
    ])

    # The model ends here for feature extraction purpose
    return models.Sequential(stack)


### **CNN Model for Feature Extraction**



1. **Convolutional Blocks**:

   - Three convolutional blocks with increasing filters (32, 64, 128), each followed by max-pooling to extract spatial features.



2. **Flattening**:

   - Flatten the feature maps into a 1D vector.



3. **Fully Connected Layer**:

   - A dense layer (128 units) with ReLU activation and a dropout layer to prevent overfitting.



4. **Output**:

   - The model is designed for feature extraction and ends before the classification layer.



---


In [None]:


# Step 3: Extract frames from video

def extract_frames_from_video(video_path, frame_size=(64, 64)):
    """Decode a video into an array of resized frames scaled to [0, 1].

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_size: Target size passed to ``cv2.resize`` for every frame.

    Returns:
        np.ndarray: The frames stacked along the first axis. The array is
        empty when the capture cannot be opened or yields no frame.
    """
    video_capture = cv2.VideoCapture(video_path)
    frames = []

    try:
        while video_capture.isOpened():
            ret, frame = video_capture.read()
            if not ret:
                break
            # Resize and normalize the frames
            frame = cv2.resize(frame, frame_size)
            frames.append(frame / 255.0)  # Normalize to [0, 1]
    finally:
        # Always free the capture, even if a frame fails to resize.
        video_capture.release()

    return np.array(frames)



# Step 4: Get video features using the CNN model

def get_video_features(dialogue_id, utterance_id, cnn_model, video_dir='videos'):
    """Return one feature vector for a clip, or ``None`` if it is unusable.

    Args:
        dialogue_id: Value substituted into the ``dia…`` part of the file name.
        utterance_id: Value substituted into the ``utt…`` part of the file name.
        cnn_model: Model whose ``predict`` turns the frame batch into
            per-frame features.
        video_dir: Directory that contains the ``.mp4`` clips.

    Returns:
        The mean of the per-frame features, or ``None`` when the file does
        not exist or no frame could be decoded from it.
    """
    video_filename = f'dia{dialogue_id}_utt{utterance_id}.mp4'
    video_path = os.path.join(video_dir, video_filename)

    if not os.path.exists(video_path):
        return None

    frames = extract_frames_from_video(video_path)
    if len(frames) == 0:
        # Unreadable or empty clip: predicting on / averaging an empty batch
        # would fail or produce NaNs, so report "no features" instead.
        return None

    # verbose=0 avoids printing one progress bar per clip.
    features = cnn_model.predict(frames, verbose=0)
    return np.mean(features, axis=0)  # Use the average of the frame features


### **Video Frame Extraction and Feature Extraction Using CNN**



1. **Extract Frames from Video**:

   - This function reads video files frame by frame.

   - Each frame is resized to a specified size (default is 64x64 pixels).

   - The pixel values of the frames are normalized to the range `[0, 1]` for consistency across videos.



2. **Get Video Features Using CNN**:

   - For each video, the function constructs the video file path using `Dialogue_ID` and `Utterance_ID`.

   - It extracts frames from the video using the previously defined frame extraction function.

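   - The frames are then passed through the CNN defined above to extract features. Note that this CNN is never trained in this notebook, so the features come from randomly initialised weights.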

   - The extracted features are averaged over all frames to create a single feature vector representing the video.



---


In [None]:


# Step 5: Prepare data for training the emotion recognition model

def prepare_data(train_data, cnn_model, video_dir='videos'):
    """Build the feature matrix and one-hot labels for the video classifier.

    Rows for which ``get_video_features`` returns ``None`` are left out.

    Args:
        train_data: Frame with ``Dialogue_ID``, ``Utterance_ID`` and
            ``Emotion_Label`` columns.
        cnn_model: Feature extractor forwarded to ``get_video_features``.
        video_dir: Directory forwarded to ``get_video_features``.

    Returns:
        tuple: ``(X, y)`` where ``X`` stacks the per-clip features and ``y``
        is the 5-class one-hot encoding of the kept labels.
    """
    extracted = (
        (
            get_video_features(row['Dialogue_ID'], row['Utterance_ID'], cnn_model, video_dir),
            row['Emotion_Label'],
        )
        for _, row in train_data.iterrows()
    )
    kept = [(clip_features, label) for clip_features, label in extracted if clip_features is not None]

    X = np.array([clip_features for clip_features, _ in kept])
    labels = np.array([label for _, label in kept])

    # One-hot encode the labels
    y = to_categorical(labels, num_classes=5)
    return X, y



# Step 6: Create the Emotion Recognition Model

def create_emotion_recognition_model(input_shape=(64,)):
    """Build the uncompiled dense classifier that sits on top of clip features.

    Layers: Dense(128, ReLU) -> Dropout(0.5) -> Dense(64, ReLU) ->
    Dense(5, softmax).

    Args:
        input_shape: Shape of one feature vector.

    Returns:
        The ``Sequential`` classifier (not yet compiled).
    """
    return models.Sequential([
        # Fully connected layers for emotion classification
        layers.Dense(128, activation='relu', input_shape=input_shape),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        # Output layer for emotion classification (5 classes)
        layers.Dense(5, activation='softmax'),
    ])


### **Emotion Recognition Model Setup**



1. **Prepare Data for Training**:

   - This function processes the training data by:

     - Extracting video features for each video using the CNN feature extractor (which is not trained in this notebook).

     - Collecting the features and corresponding emotion labels.

     - One-hot encoding the emotion labels to prepare them for classification.



2. **Create the Emotion Recognition Model**:

   - A fully connected neural network model is defined for emotion classification:

     - The first layer is a Dense layer with 128 units and ReLU activation.

     - A Dropout layer is added to prevent overfitting.

     - A second Dense layer with 64 units and ReLU activation.

     - The output layer consists of 5 units (one for each emotion) with a softmax activation function for multi-class classification.



---


In [None]:


# Step 7: Main script to train the model

def train_emotion_recognition_model(csv_path='/kaggle/input/PES-ml-hack-link1/train.csv', video_dir='videos'):
    """Run the video-only pipeline end to end and print validation metrics.

    Loads the metadata, extracts one CNN feature vector per clip, holds out
    20% of the clips, trains the dense classifier for 10 epochs and prints
    its validation loss and accuracy.

    Args:
        csv_path: Metadata CSV forwarded to ``preprocess_data``.
        video_dir: Clip directory forwarded to ``prepare_data``.
    """
    labelled_rows = preprocess_data(csv_path)
    frame_encoder = create_cnn_model(input_shape=(64, 64, 3))

    # Features and one-hot labels for every clip that could be read.
    X, y = prepare_data(labelled_rows, frame_encoder, video_dir)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # The classifier's input width follows the extracted feature size.
    emotion_model = create_emotion_recognition_model(input_shape=(X.shape[1],))
    emotion_model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    emotion_model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_val, y_val),
    )

    val_loss, val_accuracy = emotion_model.evaluate(X_val, y_val)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')


# Run the training process
train_emotion_recognition_model(
    csv_path='/kaggle/input/PES-ml-hack-link1/train.csv',
    video_dir='/kaggle/input/PES-ml-hack-link1/train_videos',
)

### **Training the Emotion Recognition Model**



1. **Preprocess Data**:

   - `train.csv` is loaded and each emotion name is mapped to an integer label.



2. **Feature Extraction**:

   - A CNN model is created to extract features from video frames.



3. **Prepare Features and Labels**:

   - Features and labels are prepared for training, then split into training and validation sets.



4. **Create and Compile Model**:

   - An emotion recognition model with fully connected layers is created and compiled.



5. **Train Model**:

   - The model is trained for 10 epochs, using the training data and validating on the validation set.



6. **Evaluate Model**:

   - The model is evaluated on the validation set, with the loss and accuracy printed.



---


# **COMBINING TEXT AND VIDEO MODELS**




In [None]:
import numpy as np

import pandas as pd

import re

import nltk

import os

import cv2

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import layers, models

from tensorflow.keras.utils import to_categorical



# Download necessary NLTK data files
# NOTE(review): the NLTK-based preprocessing in the next cell is commented out
# and the active code uses spaCy, so these corpora look unused — confirm
# before removing.

nltk.download('stopwords')

nltk.download('wordnet')


In [None]:


# Preprocessing function for text

# def preprocess_text(text):

#     text = text.lower()

#     text = re.sub(r'[^a-zA-Z\s]', '', text)

#     tokens = text.split()

#     stop_words = set(nltk.corpus.stopwords.words('english'))

#     tokens = [word for word in tokens if word not in stop_words]

#     lemmatizer = nltk.stem.WordNetLemmatizer()

#     tokens = [lemmatizer.lemmatize(word) for word in tokens]

#     return ' '.join(tokens)



# # Apply preprocessing to the utterances

# train_df['Processed_Utterance'] = train_df['Utterance'].apply(preprocess_text)



# # Separate features and labels

# X = train_df['Processed_Utterance']

# y = train_df['Emotion']

import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalise one utterance for the text model.

    Lower-cases the text, strips every character that is not a letter or
    whitespace, then keeps the spaCy lemma of each remaining token that is
    not a stop word, punctuation or pure whitespace.

    Args:
        text: Raw utterance. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as an
            empty utterance instead of raising.

    Returns:
        str: The kept lemmas joined by single spaces (may be empty).
    """
    if not isinstance(text, str):
        return ''

    # Lowercase the text
    text = text.lower()

    # Remove non-alphabet characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, remove stopwords, and lemmatize using spaCy. Stripping
    # characters can leave runs of spaces, which spaCy returns as whitespace
    # tokens; skip them so they do not end up in the cleaned text.
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.text not in STOP_WORDS and not token.is_punct and not token.is_space
    ]

    return ' '.join(tokens)

# Clean every utterance and keep the result next to the raw text.
train_df['Processed_Utterance'] = train_df['Utterance'].map(preprocess_text)

# Model input (cleaned text) and target (emotion name).
X, y = train_df['Processed_Utterance'], train_df['Emotion']

# Encode emotion labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize the processed text and pad the sequences with maxlen=100.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_tokenized, maxlen=100)

# Split the data into training and validation sets (stratified 80/20).
X_train, X_val, y_train, y_val = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)



# Define the text model

def create_text_model():
    """Build and compile the LSTM text classifier for padded token ids.

    The softmax output has one unit per class known to the module-level
    ``label_encoder``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    n_classes = len(label_encoder.classes_)
    network = models.Sequential([
        layers.Embedding(input_dim=10000, output_dim=128, input_length=100),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(64, activation='relu'),
        layers.Dense(n_classes, activation='softmax'),
    ])
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network



# Create the text model

text_model = create_text_model()

# Calculate class weights.
# Keras looks class weights up by the integer label found in y_train, i.e. by
# LabelEncoder index. value_counts() is ordered by frequency, so enumerating
# it would attach each weight to the wrong class. Index it by class name
# instead.
class_counts = train_df['Emotion'].value_counts()
class_weights = {
    class_index: 1.0 / class_counts[class_name]
    for class_index, class_name in enumerate(label_encoder.classes_)
}
print("Class Weights:", class_weights)

# Train the text model with class weights
history_text = text_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    class_weight=class_weights,
)

# Evaluate the text model on the validation set
val_loss_text, val_accuracy_text = text_model.evaluate(X_val, y_val)
print(f'Text Model - Validation Loss: {val_loss_text}, Validation Accuracy: {val_accuracy_text}')






### **Text Preprocessing & Model Training**



1. **Preprocessing**:

   - Text is cleaned (lowercased, special chars removed), tokenized, stop words removed, and lemmatized.



2. **Data Preparation**:

   - Features (`X`) are tokenized and padded; labels (`y`) are encoded using `LabelEncoder`.



3. **Model**:

   - A deep learning model is defined with an embedding layer, LSTM layers, and a softmax output for multi-class classification.



4. **Class Weights**:

   - Class weights are calculated to handle imbalanced data.



5. **Training**:

   - The model is trained for 10 epochs with class weights to handle imbalanced classes.



6. **Evaluation**:

   - Model performance (loss, accuracy) is evaluated on the validation set.



---



This workflow cleans the data, builds a model, and trains it while handling class imbalance.

In [None]:


# Load and preprocess the train.csv

# Directory holding the training clips; read by get_video_clip_path() below.
video_dir = '/kaggle/input/PES-ml-hack-link1/train_videos'

# Re-read the training metadata (the CSV is read as cp1252).
train_df = pd.read_csv('/kaggle/input/PES-ml-hack-link1/train.csv', encoding='cp1252')


def get_video_clip_path(row):
    """Return the clip path for one metadata row.

    The file name is built from the row's ``Dialogue_ID`` and ``Utterance_ID``
    and joined onto the module-level ``video_dir``.
    """
    dialogue_id, utterance_id = row['Dialogue_ID'], row['Utterance_ID']
    return os.path.join(video_dir, f"dia{dialogue_id}_utt{utterance_id}.mp4")


# Attach the clip path to every row of the metadata.
train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(
    train_df[
        ['Dialogue_ID', 'Utterance_ID', 'video_clip_path']
    ].head()
)



# Step 1: Load and preprocess the train.csv for video

def preprocess_video_data(csv_path='/kaggle/input/PES-ml-hack-link1/train.csv'):
    """Load the training metadata and attach an integer emotion label.

    Args:
        csv_path: Path of the cp1252-encoded metadata CSV. It must contain
            the columns ``Utterance_ID``, ``Dialogue_ID`` and ``Emotion``.

    Returns:
        pandas.DataFrame: Those three columns plus ``Emotion_Label``
        (anger=0, joy=1, neutral=2, sadness=3, surprise=4).

    Raises:
        ValueError: If ``Emotion`` holds a value outside the five known
            classes (it would otherwise become a NaN label).
    """
    emotion_map = {'anger': 0, 'joy': 1, 'neutral': 2, 'sadness': 3, 'surprise': 4}

    train_data = pd.read_csv(csv_path, encoding='cp1252')
    # The explicit copy means the column added below is written to an
    # independent frame, not to a slice of the full table.
    train_data = train_data[['Utterance_ID', 'Dialogue_ID', 'Emotion']].copy()
    train_data['Emotion_Label'] = train_data['Emotion'].map(emotion_map)

    unmapped = train_data.loc[train_data['Emotion_Label'].isna(), 'Emotion'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unrecognised emotion value(s) in {csv_path}: {sorted(map(str, unmapped))}"
        )
    train_data['Emotion_Label'] = train_data['Emotion_Label'].astype(int)

    return train_data



# Step 2: Create CNN Model for feature extraction

def create_cnn_model(input_shape=(64, 64, 3)):
    """Build the uncompiled CNN that maps one frame to a 128-value vector.

    Three Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages with 32, 64 and 128
    filters are followed by Flatten, Dense(128, ReLU) and Dropout(0.5).
    There is no classification head.

    Args:
        input_shape: Shape of a single input frame.

    Returns:
        The ``Sequential`` feature extractor.
    """
    # The first convolutional block carries the input shape.
    stack = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
    ]
    for n_filters in (64, 128):
        stack.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
        stack.append(layers.MaxPooling2D((2, 2)))
    stack.extend([
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
    ])
    return models.Sequential(stack)



# Step 3: Extract frames from video

def extract_frames_from_video(video_path, frame_size=(64, 64)):
    """Decode a video into an array of resized frames scaled to [0, 1].

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_size: Target size passed to ``cv2.resize`` for every frame.

    Returns:
        np.ndarray: The frames stacked along the first axis. The array is
        empty when the capture cannot be opened or yields no frame.
    """
    video_capture = cv2.VideoCapture(video_path)
    frames = []

    try:
        while video_capture.isOpened():
            ret, frame = video_capture.read()
            if not ret:
                break
            frame = cv2.resize(frame, frame_size)
            frames.append(frame / 255.0)  # Normalize to [0, 1]
    finally:
        # Always free the capture, even if a frame fails to resize.
        video_capture.release()

    return np.array(frames)



# Step 4: Get video features using the CNN model

def get_video_features(dialogue_id, utterance_id, cnn_model, video_dir='videos'):
    """Return one feature vector for a clip, or ``None`` if it is unusable.

    Args:
        dialogue_id: Value substituted into the ``dia…`` part of the file name.
        utterance_id: Value substituted into the ``utt…`` part of the file name.
        cnn_model: Model whose ``predict`` turns the frame batch into
            per-frame features.
        video_dir: Directory that contains the ``.mp4`` clips.

    Returns:
        The mean of the per-frame features, or ``None`` when the file does
        not exist or no frame could be decoded from it.
    """
    video_filename = f'dia{dialogue_id}_utt{utterance_id}.mp4'
    video_path = os.path.join(video_dir, video_filename)

    if not os.path.exists(video_path):
        return None

    frames = extract_frames_from_video(video_path)
    if len(frames) == 0:
        # Unreadable or empty clip: predicting on / averaging an empty batch
        # would fail or produce NaNs, so report "no features" instead.
        return None

    # verbose=0 avoids printing one progress bar per clip.
    features = cnn_model.predict(frames, verbose=0)
    return np.mean(features, axis=0)  # Use the average of the frame features



# Step 5: Prepare data for training the emotion recognition model

def prepare_video_data(train_data, cnn_model, video_dir='videos'):
    """Build the feature matrix and one-hot labels for the video classifier.

    Rows for which ``get_video_features`` returns ``None`` are left out.

    Args:
        train_data: Frame with ``Dialogue_ID``, ``Utterance_ID`` and
            ``Emotion_Label`` columns.
        cnn_model: Feature extractor forwarded to ``get_video_features``.
        video_dir: Directory forwarded to ``get_video_features``.

    Returns:
        tuple: ``(X_video, y)`` where ``X_video`` stacks the per-clip
        features and ``y`` is the 5-class one-hot encoding of the kept labels.
    """
    extracted = (
        (
            get_video_features(row['Dialogue_ID'], row['Utterance_ID'], cnn_model, video_dir),
            row['Emotion_Label'],
        )
        for _, row in train_data.iterrows()
    )
    kept = [(clip_features, label) for clip_features, label in extracted if clip_features is not None]

    X_video = np.array([clip_features for clip_features, _ in kept])
    labels = np.array([label for _, label in kept])
    y = to_categorical(labels, num_classes=5)

    return X_video, y








### **Video Data Preprocessing and Model Setup**



1. **Load and Preprocess Data**:

   - Loads `train.csv` containing video metadata.

   - Generates video file paths based on `Dialogue_ID` and `Utterance_ID`.



2. **Emotion Label Mapping**:

   - Maps emotion labels (e.g., 'anger', 'joy') to numeric values using a dictionary.



3. **CNN Model for Feature Extraction**:

   - Defines a CNN model with 3 convolutional layers and max-pooling for feature extraction from video frames.



4. **Extract Video Frames**:

   - Extracts frames from video files, resizes to `(64, 64)`, and normalizes pixel values.



5. **Video Feature Extraction**:

   - Uses the CNN model to extract features from the video frames, averaging the features over all frames.



6. **Prepare Data for Training**:

   - Collects extracted video features and emotion labels, then prepares the dataset for training by converting labels to one-hot encoding.



---

In [None]:


# Create CNN model for feature extraction
cnn_model = create_cnn_model(input_shape=(64, 64, 3))

# Prepare video features and labels
video_data = preprocess_video_data(csv_path='/kaggle/input/PES-ml-hack-link1/train.csv')
X_video, y_video = prepare_video_data(video_data, cnn_model, video_dir='/kaggle/input/PES-ml-hack-link1/train_videos')

# The fusion step further down averages text and video probabilities row by
# row and class by class, so the video rows and labels must line up exactly
# with the text ones (same utterances, same class indices).
if len(X_video) != len(y_encoded) or not np.array_equal(np.argmax(y_video, axis=1), y_encoded):
    raise ValueError(
        "Video features are not aligned with the text data "
        f"({len(X_video)} video rows vs {len(y_encoded)} text rows); "
        "some clips may be missing or the label mappings differ."
    )

# Split the data into training and validation sets for video.
# Reuse the text split's settings (test_size, random_state, stratify) on row
# positions so both modalities hold out the same utterances.
row_positions = np.arange(len(y_encoded))
train_rows, val_rows = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_encoded
)
assert np.array_equal(X_padded[val_rows], X_val), "video split does not match the text split"

X_train_video, X_val_video = X_video[train_rows], X_video[val_rows]
y_train_video, y_val_video = y_video[train_rows], y_video[val_rows]



# Define the video model

def create_video_model():
    """Build and compile the dense classifier over per-clip CNN features.

    The input width is taken from the module-level ``X_video``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric) with a 5-way softmax output.
    """
    feature_width = X_video.shape[1]
    classifier = models.Sequential([
        layers.Dense(128, activation='relu', input_shape=(feature_width,)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dense(5, activation='softmax'),
    ])
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier



# Create the video model

video_model = create_video_model()

# Train the video model for 10 epochs, validating on the held-out clips.
history_video = video_model.fit(
    X_train_video,
    y_train_video,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_video, y_val_video),
)

# Evaluate the video model on the validation set
val_loss_video, val_accuracy_video = video_model.evaluate(X_val_video, y_val_video)
print(f'Video Model - Validation Loss: {val_loss_video}, Validation Accuracy: {val_accuracy_video}')



# Combine predictions from both models

def combine_predictions(text_model, video_model, X_text, X_video):
    """Late-fuse two models by averaging their predicted class probabilities.

    Row ``i`` of ``X_text`` and row ``i`` of ``X_video`` must describe the
    same sample, and both models must use the same class order.

    Args:
        text_model: Object whose ``predict(X_text)`` returns class probabilities.
        video_model: Object whose ``predict(X_video)`` returns class probabilities.
        X_text: Input batch for ``text_model``.
        X_video: Input batch for ``video_model``.

    Returns:
        np.ndarray: Element-wise mean of the two prediction arrays.

    Raises:
        ValueError: If the two prediction arrays differ in shape. Without
            this check a one-row batch would broadcast silently.
    """
    text_predictions = np.asarray(text_model.predict(X_text))
    video_predictions = np.asarray(video_model.predict(X_video))

    if text_predictions.shape != video_predictions.shape:
        raise ValueError(
            "Cannot average predictions of different shapes: "
            f"text {text_predictions.shape} vs video {video_predictions.shape}"
        )

    # Average the predictions
    return (text_predictions + video_predictions) / 2



# Prepare validation data for combining predictions

# Validation inputs for the fusion: the text model's validation batch and the
# video validation features prepared above.
X_val_text = X_val

# Get combined predictions
combined_predictions = combine_predictions(
    text_model,
    video_model,
    X_val_text,
    X_val_video,
)

# Convert combined predictions to class labels
final_predictions = np.argmax(combined_predictions, axis=1)

### **Video and Text Model Training & Prediction Combination**



1. **CNN Model for Feature Extraction**:

   - A CNN model is created to extract features from video frames (`input_shape=(64, 64, 3)`).



2. **Prepare Data**:

   - Video features and emotion labels are prepared using `preprocess_video_data` and `prepare_video_data`.



3. **Model Creation**:

   - A simple fully connected video model is created with two Dense layers for classification. It is compiled using the Adam optimizer and categorical cross-entropy loss.



4. **Training**:

   - The video model is trained for 10 epochs with the training data, and its performance is evaluated on the validation set.



5. **Combining Text and Video Predictions**:

   - Predictions from both the text and video models are averaged to create a combined prediction for each sample.

   - The final class label is determined by taking the argmax of the combined predictions.



This approach merges the strengths of both text and video models by averaging their predictions to improve overall performance.

# **RUNNING THE COMBINED MODEL ON TEST DATA**

In [None]:


import pandas as pd

import numpy as np

import os





# Directory holding the test clips. This rebinds the module-level `video_dir`
# that get_video_clip_path() and predict_on_test_data() read.
video_dir = '/kaggle/input/PES-ml-hack-link1/test_videos'

# Test metadata (the CSV is read as cp1252).
df = pd.read_csv('/kaggle/input/PES-ml-hack-link1/test.csv', encoding='cp1252')


def get_video_clip_path(row):
    """Return the clip path for one metadata row.

    The file name is built from the row's ``Dialogue_ID`` and ``Utterance_ID``
    and joined onto the module-level ``video_dir``.
    """
    dialogue_id, utterance_id = row['Dialogue_ID'], row['Utterance_ID']
    return os.path.join(video_dir, f"dia{dialogue_id}_utt{utterance_id}.mp4")


# Attach the clip path to every test row.
df['video_clip_path'] = df.apply(get_video_clip_path, axis=1)

print(
    df[
        ['Dialogue_ID', 'Utterance_ID', 'video_clip_path']
    ].head()
)



def predict_on_test_data(df, text_model, video_model):
    """Predict one emotion name per row of ``df`` by fusing text and video.

    Text probabilities are computed for every row. Where a clip yields
    features, the text and video probabilities are averaged. Where it does
    not (``get_video_features`` returns ``None``), the text probabilities
    are used alone, so the output always has one prediction per row, in
    row order.

    Relies on module-level ``preprocess_text``, ``tokenizer``,
    ``cnn_model``, ``video_dir`` and ``label_encoder``.

    Args:
        df: Test metadata with ``Utterance``, ``Dialogue_ID`` and
            ``Utterance_ID`` columns.
        text_model: Trained text classifier.
        video_model: Trained classifier over per-clip CNN features.

    Returns:
        Array of emotion names, one per row of ``df``.
    """
    # Preprocess text
    X_text_test = df['Utterance'].apply(preprocess_text)  # Use the same preprocessing as training
    X_text_test_tokenized = tokenizer.texts_to_sequences(X_text_test)
    X_text_test_padded = pad_sequences(X_text_test_tokenized, maxlen=100)

    # Extract video features, remembering which row positions produced them
    # so the video predictions can be matched back to the right rows.
    X_video_test = []
    rows_with_video = []
    for position, (_, row) in enumerate(df.iterrows()):
        features = get_video_features(row['Dialogue_ID'], row['Utterance_ID'], cnn_model, video_dir)
        if features is not None:
            X_video_test.append(features)
            rows_with_video.append(position)

    # Make predictions
    text_predictions = np.asarray(text_model.predict(X_text_test_padded))

    # Combine predictions: start from text-only and average in the video
    # probabilities for the rows that have them.
    combined_predictions = text_predictions.copy()
    if rows_with_video:
        video_predictions = video_model.predict(np.array(X_video_test))
        combined_predictions[rows_with_video] = (
            text_predictions[rows_with_video] + video_predictions
        ) / 2

    final_predictions = np.argmax(combined_predictions, axis=1)

    # Map numerical predictions back to emotion labels
    emotion_labels = label_encoder.inverse_transform(final_predictions)

    return emotion_labels



# Get predictions for the test data

# Predicted emotion name for every test row.
all_preds = predict_on_test_data(df, text_model, video_model)

# Pair each prediction with its 'Sr No.' identifier.
all_ids = df["Sr No."]
submission_df = pd.DataFrame({'Sr No.': all_ids}).assign(Emotion=all_preds)

# Save the DataFrame to CSV
submission_df.to_csv("submission.csv", index=False)

### **Test Data Prediction and Submission Pipeline**



1. **Load Test Data**:

   - The test dataset (`test.csv`) is loaded, and paths to the corresponding video files are generated based on the `Dialogue_ID` and `Utterance_ID` using the `get_video_clip_path` function.



2. **Text Data Preprocessing**:

   - The `Utterance` column is preprocessed using the same steps as during training (lowercasing, tokenization, padding) to prepare text input for the model.



3. **Extract Video Features**:

   - For each test sample, video features are extracted with the same CNN feature extractor used for the training clips (it is not trained in this notebook) by calling the `get_video_features` function. These features are collected into `X_video_test`.



4. **Make Predictions**:

   - Both text and video models make predictions on the processed test data:

     - `text_model.predict()` is used for text data predictions.

     - `video_model.predict()` is used for video feature predictions.

   - The predictions from both models are combined by averaging them.



5. **Convert Predictions to Labels**:

   - The combined predictions are converted to emotion labels by taking the `argmax` of the averaged predictions.

   - The numerical predictions are then mapped back to emotion labels using `label_encoder.inverse_transform()`.



6. **Prepare and Save Submission**:

   - A DataFrame is created with the `Sr No.` from the test data and the predicted emotions.

   - The submission DataFrame is saved to a CSV file for submission.



---



This pipeline preprocesses the test data, makes predictions using both text and video models, combines the results, and generates a final CSV file for submission.

**THANK YOU**