In [3]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder



# Function to clean text columns (Lowercase, whitespace, special characters)

def clean_text_columns(df, columns):

    """

    Convert text columns to lowercase, remove unnecessary whitespace, and remove special characters.



    Parameters:

    df (pd.DataFrame): The DataFrame containing the columns to be cleaned.

    columns (list): List of column names to clean.



    Returns:

    pd.DataFrame: DataFrame with cleaned text columns.

    """

    for col in columns:

        # Convert to lowercase and strip whitespace

        df[col] = df[col].str.lower().str.strip()



        # Remove extra spaces

        df[col] = df[col].str.replace(r'\s+', ' ', regex=True)



        # Remove special characters

        df[col] = df[col].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)



    return df



# Basic preprocessing for both datasets

def preprocess_data(train, test):

    # Clean 'Utterance' and other basic text preprocessing

    for df in [train, test]:

        # Clean 'Utterance' column

        df['Utterance'] = df['Utterance'].str.strip()

        df['Utterance'] = df['Utterance'].str.replace('\n', ' ')



        # Convert timestamps to seconds

        df['start_seconds'] = pd.to_timedelta(df['StartTime']).dt.total_seconds()

        df['end_seconds'] = pd.to_timedelta(df['EndTime']).dt.total_seconds()

        df['utterance_duration'] = df['end_seconds'] - df['start_seconds']



        # Add text-based features

        df['utterance_length'] = df['Utterance'].str.len()

        df['word_count'] = df['Utterance'].str.split().str.len()



        # Add dialogue context features

        df['position_in_dialogue'] = df.groupby('Dialogue_ID')['Utterance_ID'].rank()

        df['dialogue_length'] = df.groupby('Dialogue_ID')['Utterance_ID'].transform('count')

        df['relative_position'] = df['position_in_dialogue'] / df['dialogue_length']



    # Clean the 'Utterance' column by removing unnecessary characters

    text_columns = ['Utterance']

    train = clean_text_columns(train, text_columns)

    test = clean_text_columns(test, text_columns)



    # Encode sentiment labels (for training)

    label_encoder = LabelEncoder()

    train['Sentiment_Label'] = label_encoder.fit_transform(train['Emotion'])



    # # One-hot encode 'Speaker' column

    # train = pd.get_dummies(train, columns=['Speaker'])

    # test = pd.get_dummies(test, columns=['Speaker'])



    # Encode 'Speaker' column with LabelEncoder

    # le = LabelEncoder()

    # all_speakers = pd.concat([train['Speaker'], test['Speaker']]).unique()

    # le.fit(all_speakers)

    train=train.drop('Speaker',axis=1)

    test=test.drop('Speaker',axis=1)



    return train, test

In [4]:
train = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_emotion.csv', encoding='ISO-8859-1')

test=pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_emotion.csv', encoding='ISO-8859-1')

In [5]:
train, test = preprocess_data(train, test)



# Check the preprocessed data

print(train.head())

print(test.head())

   Sr No.                                          Utterance  Emotion  \
0       8  but therell be perhaps 30 people under you so ...  neutral   
1      12  all right then well have a definite answer for...  neutral   
2      32                                   can i get a beer  neutral   
3      40            he was with her when he wrote this poem  neutral   
4      42    now that ive touched you you seem emptier still  neutral   

   Dialogue_ID  Utterance_ID  Season  Episode     StartTime       EndTime  \
0            0             7       8       21  00:16:48,800  00:16:54,514   
1            0            11       8       21  00:17:05,025  00:17:13,324   
2            2             8       3        6   0:06:07,367   0:06:08,035   
3            3             3       3       12  00:10:21,078  00:10:23,496   
4            3             5       3       12  00:10:26,667  00:10:29,586   

   start_seconds  end_seconds  utterance_duration  utterance_length  \
0        49760.0      55474

In [9]:
import numpy as np
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import cv2

import torch

from transformers import AutoTokenizer, AutoModel

import os

from sklearn.preprocessing import StandardScaler

import joblib

import librosa

import av  # Library for video/audio processing



# Check GPU availability

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")



def extract_features_from_dataset(df, video_dir, is_training=True):

    # Initialize BERT model

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    model = AutoModel.from_pretrained('bert-base-uncased').to(device)

    

    # Initialize face detector

    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    

    # Load or initialize scalers

    if is_training:

        visual_scaler = StandardScaler()

        text_scaler = StandardScaler()

        audio_scaler = StandardScaler()

    else:

        visual_scaler = joblib.load('visual_scaler.pkl')

        text_scaler = joblib.load('text_scaler.pkl')

        audio_scaler = joblib.load('audio_scaler.pkl')

    

    features = {

        'text': [],

        'visual': [],

        'audio': [],

        'metadata': []

    }

    valid_indices = []

    

    print(f"Starting feature extraction for {len(df)} samples...")

    

    for i, (index, row) in enumerate(df.iterrows()):

        if i % 10 == 0:

            print(f"Processing sample {i}/{len(df)}")

        

        try:

            # Text features with BERT

            inputs = tokenizer(row['Utterance'], return_tensors="pt", padding=True, truncation=True, max_length=128)

            inputs = {k: v.to(device) for k, v in inputs.items()}

            

            with torch.no_grad():

                outputs = model(**inputs)

                text_features = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

            

            # Visual features

            video_path = os.path.join(video_dir, f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.mp4")

            cap = cv2.VideoCapture(video_path)

            frame_features = []

            

            fps = cap.get(cv2.CAP_PROP_FPS)

            start_frame = int(row['start_seconds'] * fps)

            end_frame = int(row['end_seconds'] * fps)

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            

            while cap.isOpened():

                frame_pos = cap.get(cv2.CAP_PROP_POS_FRAMES)

                if frame_pos > end_frame:

                    break

                

                ret, frame = cap.read()

                if not ret:

                    break



                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                faces = face_cascade.detectMultiScale(gray, 1.1, 4)

                

                if len(faces) > 0:

                    x, y, w, h = max(faces, key=lambda x: x[2] * x[3])

                    face_roi = gray[y:y+h, x:x+w]

                    face_roi = cv2.resize(face_roi, (64, 64))

                    

                    frame_feat = [

                        np.mean(face_roi),

                        np.std(face_roi),

                        np.max(face_roi),

                        np.min(face_roi),

                    ]

                    edges = cv2.Canny(face_roi, 100, 200)

                    frame_feat.append(np.mean(edges))

                    

                    frame_features.append(frame_feat)

            

            cap.release()

            

            if frame_features:

                frame_features = np.array(frame_features)

                visual_features = np.concatenate([

                    np.mean(frame_features, axis=0),

                    np.std(frame_features, axis=0),

                    np.max(frame_features, axis=0),

                    np.min(frame_features, axis=0)

                ])

            else:

                visual_features = np.zeros(5 * 4)

            

            # Audio features

            container = av.open(video_path)

            audio_stream = next(stream for stream in container.streams if stream.type == 'audio')

            audio_frames = []

            sample_rate = audio_stream.sample_rate

            start_sample = int(row['start_seconds'] * sample_rate)

            end_sample = int(row['end_seconds'] * sample_rate)

            

            for frame in container.decode(audio_stream):

                audio_array = frame.to_ndarray()

                audio_frames.append(audio_array)

            audio = np.concatenate(audio_frames)



            # Slice audio between start_sample and end_sample

            if len(audio) > start_sample:

                audio_segment = audio[start_sample:end_sample] if len(audio) > end_sample else audio[start_sample:]

            else:

                audio_segment = np.array([])  # Empty segment if audio is shorter than expected

            

            # Calculate audio features if audio_segment is not empty

            if audio_segment.size > 0:

                audio_features = [

                    np.mean(audio_segment),

                    np.std(audio_segment),

                    np.max(audio_segment),

                    np.min(audio_segment),

                    librosa.feature.zero_crossing_rate(audio_segment)[0].mean(),

                    librosa.feature.spectral_centroid(audio_segment, sr=sample_rate)[0].mean(),

                    librosa.feature.mfcc(audio_segment, sr=sample_rate, n_mfcc=20).mean(axis=1).mean()

                ]

            else:

                audio_features = [0] * 7  # Provide default values for missing audio features

            

            # Metadata features

            metadata_features = np.array([

                row['utterance_duration'],

                row['position_in_dialogue'],

                row['relative_position'],

                

            ])

            

            # Append to features

            features['text'].append(text_features)

            features['visual'].append(visual_features)

            features['audio'].append(audio_features)

            features['metadata'].append(metadata_features)

            valid_indices.append(index)

            

        except Exception as e:

            print(f"Error processing sample {i}: {str(e)}")

            continue

    

    # Standardize features if training

    for key in features:

        features[key] = np.array(features[key])

        

    if is_training:

        features['visual'] = visual_scaler.fit_transform(features['visual'])

        features['text'] = text_scaler.fit_transform(features['text'])

        features['audio'] = audio_scaler.fit_transform(features['audio'])

        

        # Save scalers

        joblib.dump(visual_scaler, 'visual_scaler.pkl')

        joblib.dump(text_scaler, 'text_scaler.pkl')

        joblib.dump(audio_scaler, 'audio_scaler.pkl')

    else:

        features['visual'] = visual_scaler.transform(features['visual'])

        features['text'] = text_scaler.transform(features['text'])

        features['audio'] = audio_scaler.transform(features['audio'])

    

    return features, valid_indices



# Run feature extraction

print("Processing training data...")

train_features, train_indices = extract_features_from_dataset(

    train, 

    '/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_data', 

    is_training=True

)



print("\nProcessing test data...")

test_features, test_indices = extract_features_from_dataset(

    test, 

    '/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_data',

    is_training=False

)



# Save features

np.save('train_features.npy', train_features, allow_pickle=True)

np.save('test_features.npy', test_features, allow_pickle=True)

np.save('train_indices.npy', train_indices, allow_pickle=True)

np.save('test_indices.npy', test_indices, allow_pickle=True)





print("Feature extraction completed!")


Using device: cpu
Processing training data...




Starting feature extraction for 1000 samples...
Processing sample 0/1000
Processing sample 10/1000
Processing sample 20/1000
Processing sample 30/1000
Processing sample 40/1000
Processing sample 50/1000
Processing sample 60/1000
Processing sample 70/1000
Processing sample 80/1000
Processing sample 90/1000
Processing sample 100/1000
Processing sample 110/1000
Processing sample 120/1000
Processing sample 130/1000
Processing sample 140/1000
Processing sample 150/1000
Processing sample 160/1000
Processing sample 170/1000
Processing sample 180/1000
Processing sample 190/1000
Processing sample 200/1000
Processing sample 210/1000
Processing sample 220/1000
Processing sample 230/1000
Processing sample 240/1000
Processing sample 250/1000
Processing sample 260/1000
Processing sample 270/1000
Processing sample 280/1000
Processing sample 290/1000
Processing sample 300/1000
Processing sample 310/1000
Processing sample 320/1000
Processing sample 330/1000
Processing sample 340/1000
Processing sample 



Starting feature extraction for 100 samples...
Processing sample 0/100
Processing sample 10/100
Processing sample 20/100
Processing sample 30/100
Processing sample 40/100
Processing sample 50/100
Processing sample 60/100
Processing sample 70/100
Processing sample 80/100
Processing sample 90/100
Feature extraction completed!


In [10]:
y_train = train.iloc[train_indices]['Sentiment_Label'].values
np.save('y_train.npy', y_train, allow_pickle=True)

In [13]:
# Assuming train_indices and test_indices are correctly defined and loaded
y_train = np.load('y_train.npy', allow_pickle=True)
y_test = y_train[test_indices]  # Creating y_test based on indices


In [14]:
print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))


Unique values in y_train: [0 1 2 3 4]
Unique values in y_test: [0 1 2 3 4]


In [15]:
# Map values greater than 2 to the class 2
y_train = np.where(y_train > 2, 2, y_train)
y_test = np.where(y_test > 2, 2, y_test)


In [16]:
print("Mapped unique values in y_train:", np.unique(y_train))
print("Mapped unique values in y_test:", np.unique(y_test))


Mapped unique values in y_train: [0 1 2]
Mapped unique values in y_test: [0 1 2]


In [17]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR

# Load the extracted features
print("Loading data...")
train_features = np.load('train_features.npy', allow_pickle=True).item()
test_features = np.load('test_features.npy', allow_pickle=True).item()
train_indices = np.load('train_indices.npy', allow_pickle=True)
test_indices = np.load('test_indices.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
y_test = y_train[test_indices]

# Ensure labels are within the range [0, 4]
y_train = np.clip(y_train, 0, 4)
y_test = np.clip(y_test, 0, 4)

# Split the training data
print("Splitting data...")
X_text_train, X_text_val, X_visual_train, X_visual_val, X_metadata_train, X_metadata_val, y_train, y_val = train_test_split(
    train_features['text'].astype(np.float32), 
    train_features['visual'].astype(np.float32), 
    train_features['metadata'].astype(np.float32), 
    y_train, 
    test_size=0.2, 
    random_state=42
)

# Create datasets and dataloaders
train_dataset = TensorDataset(
    torch.from_numpy(X_text_train),
    torch.from_numpy(X_visual_train),
    torch.from_numpy(X_metadata_train),
    torch.from_numpy(y_train)
)
val_dataset = TensorDataset(
    torch.from_numpy(X_text_val),
    torch.from_numpy(X_visual_val),
    torch.from_numpy(X_metadata_val),
    torch.from_numpy(y_val)
)
test_dataset = TensorDataset(
    torch.from_numpy(test_features['text'].astype(np.float32)),
    torch.from_numpy(test_features['visual'].astype(np.float32)),
    torch.from_numpy(test_features['metadata'].astype(np.float32)),
    torch.from_numpy(y_test)
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Late Fusion Model
class LateFusionModel(nn.Module):
    def __init__(self, input_size_text, input_size_visual, input_size_metadata, hidden_size, num_classes):
        super().__init__()
        self.text_encoder = nn.Sequential(
            nn.Linear(input_size_text, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.visual_encoder = nn.Sequential(
            nn.Linear(input_size_visual, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.metadata_encoder = nn.Sequential(
            nn.Linear(input_size_metadata, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.fusion_weights = nn.Parameter(torch.ones(3)/3)

    def forward(self, text, visual, metadata):
        text_out = self.text_encoder(text)
        visual_out = self.visual_encoder(visual)
        metadata_out = self.metadata_encoder(metadata)
        
        weights = F.softmax(self.fusion_weights, dim=0)
        return weights[0] * text_out + weights[1] * visual_out + weights[2] * metadata_out

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Function to train and evaluate model
def train_and_evaluate(model, train_loader, val_loader, test_loader, num_epochs=100):
    print("\nTraining Late Fusion Model...")
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    
    best_val_f1 = 0
    best_model_state = None
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        
        for batch_idx, (text, visual, metadata, labels) in enumerate(train_loader):
            text, visual = text.to(device), visual.to(device)
            metadata, labels = metadata.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(text, visual, metadata)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for text, visual, metadata, labels in val_loader:
                text, visual = text.to(device), visual.to(device)
                metadata, labels = metadata.to(device), labels.to(device)
                
                outputs = model(text, visual, metadata)
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        val_acc = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='macro')
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = model.state_dict()
            print(f"New best model saved! F1-score: {val_f1:.4f}")
        
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"Average Loss: {running_loss/len(train_loader):.4f}")
        print(f"Validation Accuracy: {val_acc:.4f}, F1-score: {val_f1:.4f}")
        print(f"Fusion Weights: {F.softmax(model.fusion_weights, dim=0).cpu().detach().numpy()}")
        print("-" * 50)
        
        scheduler.step()
    
    # Load best model and evaluate on test set
    model.load_state_dict(best_model_state)
    
    # Test phase
    model.eval()
    test_preds = []
    test_true = []
    
    with torch.no_grad():
        for text, visual, metadata, labels in test_loader:
            text, visual = text.to(device), visual.to(device)
            metadata, labels = metadata.to(device), labels.to(device)
            
            outputs = model(text, visual, metadata)
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_true.extend(labels.cpu().numpy())
    
    test_acc = accuracy_score(test_true, test_preds)
    test_f1 = f1_score(test_true, test_preds, average='macro')
    
    print("\nFinal Results:")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test F1-score: {test_f1:.4f}")
    print(f"Final Fusion Weights: {F.softmax(model.fusion_weights, dim=0).cpu().detach().numpy()}")
    
    # Save model
    torch.save(model.state_dict(), 'final_late_fusion_model.pth')
    print("\nModel saved to 'final_late_fusion_model.pth'")
    
    return test_acc, test_f1

# Create and train Late Fusion Model with num_classes=5
model = LateFusionModel(
    input_size_text=train_features['text'].shape[1],
    input_size_visual=train_features['visual'].shape[1],
    input_size_metadata=train_features['metadata'].shape[1],
    hidden_size=256,
    num_classes=5  # Updated to accommodate all unique labels
).to(device)

# Train and evaluate
acc, f1 = train_and_evaluate(model, train_loader, val_loader, test_loader)


Loading data...
Splitting data...
Using device: cpu

Training Late Fusion Model...
New best model saved! F1-score: 0.1403
Epoch [1/100]
Average Loss: 1.5031
Validation Accuracy: 0.5400, F1-score: 0.1403
Fusion Weights: [0.33292145 0.3341523  0.33292618]
--------------------------------------------------
Epoch [2/100]
Average Loss: 1.3605
Validation Accuracy: 0.5400, F1-score: 0.1403
Fusion Weights: [0.33334538 0.33428612 0.3323685 ]
--------------------------------------------------
Epoch [3/100]
Average Loss: 1.2866
Validation Accuracy: 0.5400, F1-score: 0.1403
Fusion Weights: [0.33441737 0.33393323 0.33164942]
--------------------------------------------------
New best model saved! F1-score: 0.1602
Epoch [4/100]
Average Loss: 1.2359
Validation Accuracy: 0.5450, F1-score: 0.1602
Fusion Weights: [0.33564037 0.3333758  0.33098385]
--------------------------------------------------
New best model saved! F1-score: 0.1896
Epoch [5/100]
Average Loss: 1.1998
Validation Accuracy: 0.5550, F1-s

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR

# Load the extracted features
print("Loading data...")
train_features = np.load('train_features.npy', allow_pickle=True).item()
test_features = np.load('test_features.npy', allow_pickle=True).item()
train_indices = np.load('train_indices.npy', allow_pickle=True)
test_indices = np.load('test_indices.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
y_test = y_train[test_indices]

# Ensure labels are within the range [0, 4]
y_train = np.clip(y_train, 0, 4)
y_test = np.clip(y_test, 0, 4)

# Split the training data
print("Splitting data...")
X_text_train, X_text_val, X_visual_train, X_visual_val, X_metadata_train, X_metadata_val, y_train, y_val = train_test_split(
    train_features['text'].astype(np.float32), 
    train_features['visual'].astype(np.float32), 
    train_features['metadata'].astype(np.float32), 
    y_train, 
    test_size=0.2, 
    random_state=42
)

# Create datasets and dataloaders
train_dataset = TensorDataset(
    torch.from_numpy(X_text_train),
    torch.from_numpy(X_visual_train),
    torch.from_numpy(X_metadata_train),
    torch.from_numpy(y_train)
)
val_dataset = TensorDataset(
    torch.from_numpy(X_text_val),
    torch.from_numpy(X_visual_val),
    torch.from_numpy(X_metadata_val),
    torch.from_numpy(y_val)
)
test_dataset = TensorDataset(
    torch.from_numpy(test_features['text'].astype(np.float32)),
    torch.from_numpy(test_features['visual'].astype(np.float32)),
    torch.from_numpy(test_features['metadata'].astype(np.float32)),
    torch.from_numpy(y_test)
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Early Fusion Model
class EarlyFusionModel(nn.Module):
    def __init__(self, input_size_text, input_size_visual, input_size_metadata, hidden_size, num_classes):
        super().__init__()
        self.text_encoder = nn.Sequential(
            nn.Linear(input_size_text, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3)
        )
        
        self.visual_encoder = nn.Sequential(
            nn.Linear(input_size_visual, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3)
        )
        
        self.metadata_encoder = nn.Sequential(
            nn.Linear(input_size_metadata, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3)
        )
        
        self.fusion = nn.Sequential(
            nn.Linear(3 * hidden_size, 2 * hidden_size),
            nn.LayerNorm(2 * hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(2 * hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, text, visual, metadata):
        text_features = self.text_encoder(text)
        visual_features = self.visual_encoder(visual)
        metadata_features = self.metadata_encoder(metadata)
        combined = torch.cat([text_features, visual_features, metadata_features], dim=1)
        return self.fusion(combined)

# Late Fusion Model
class LateFusionModel(nn.Module):
    def __init__(self, input_size_text, input_size_visual, input_size_metadata, hidden_size, num_classes):
        super().__init__()
        self.text_encoder = nn.Sequential(
            nn.Linear(input_size_text, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.visual_encoder = nn.Sequential(
            nn.Linear(input_size_visual, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.metadata_encoder = nn.Sequential(
            nn.Linear(input_size_metadata, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU6(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
        
        self.fusion_weights = nn.Parameter(torch.ones(3)/3)

    def forward(self, text, visual, metadata):
        text_out = self.text_encoder(text)
        visual_out = self.visual_encoder(visual)
        metadata_out = self.metadata_encoder(metadata)
        
        weights = F.softmax(self.fusion_weights, dim=0)
        return weights[0] * text_out + weights[1] * visual_out + weights[2] * metadata_out

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Function to train and evaluate a model
def train_and_evaluate(model_name, model, train_loader, val_loader, test_loader, num_epochs=100):
    print(f"\nTraining {model_name}...")
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    
    best_val_f1 = 0
    best_model_state = None
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        
        for batch_idx, (text, visual, metadata, labels) in enumerate(train_loader):
            text, visual = text.to(device), visual.to(device)
            metadata, labels = metadata.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(text, visual, metadata)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for text, visual, metadata, labels in val_loader:
                text, visual = text.to(device), visual.to(device)
                metadata, labels = metadata.to(device), labels.to(device)
                
                outputs = model(text, visual, metadata)
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        val_acc = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='macro')
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = model.state_dict()
            print(f"New best model saved! F1-score: {val_f1:.4f}")
        
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"Average Loss: {running_loss/len(train_loader):.4f}")
        print(f"Validation Accuracy: {val_acc:.4f}, F1-score: {val_f1:.4f}")
        
        if isinstance(model, LateFusionModel):
            print(f"Fusion Weights: {F.softmax(model.fusion_weights, dim=0).cpu().detach().numpy()}")
        print("-" * 50)
        
        scheduler.step()
    
    # Load best model and evaluate on test set
    model.load_state_dict(best_model_state)
    
    # Test phase
    model.eval()
    test_preds = []
    test_true = []
    
    with torch.no_grad():
        for text, visual, metadata, labels in test_loader:
            text, visual = text.to(device), visual.to(device)
            metadata, labels = metadata.to(device), labels.to(device)
            
            outputs = model(text, visual, metadata)
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_true.extend(labels.cpu().numpy())
    
    test_acc = accuracy_score(test_true, test_preds)
    test_f1 = f1_score(test_true, test_preds, average='macro')
    
    print(f"\n{model_name} Final Results:")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test F1-score: {test_f1:.4f}")
    
    # Save model
    torch.save(model.state_dict(), f'final_{model_name.lower().replace(" ", "_")}_model.pth')
    # print(f"Model saved to 'final_{model_name.lower().replace(" ", "_")}_model.pth'")
    print(f"Model saved to \"final_{model_name.lower().replace(' ', '_')}_model.pth\"")

    return test_acc, test_f1

# Train and evaluate Early Fusion Model
early_fusion = EarlyFusionModel(
    input_size_text=train_features['text'].shape[1],
    input_size_visual=train_features['visual'].shape[1],
    input_size_metadata=train_features['metadata'].shape[1],
    hidden_size=256,
    num_classes=5  # Adjusted to 5 classes
).to(device)

early_acc, early_f1 = train_and_evaluate(
    "Early Fusion", 
    early_fusion, 
    train_loader, 
    val_loader, 
    test_loader
)

# Train and evaluate Late Fusion Model
late_fusion = LateFusionModel(
    input_size_text=train_features['text'].shape[1],
    input_size_visual=train_features['visual'].shape[1],
    input_size_metadata=train_features['metadata'].shape[1],
    hidden_size=256,
    num_classes=5  # Adjusted to 5 classes
).to(device)

late_acc, late_f1 = train_and_evaluate(
    "Late Fusion", 
    late_fusion, 
    train_loader, 
    val_loader, 
    test_loader
)

# Compare results
print("\nModel Comparison:")
print(f"Early Fusion - Accuracy: {early_acc:.4f}, F1-score: {early_f1:.4f}")
print(f"Late Fusion  - Accuracy: {late_acc:.4f}, F1-score: {late_f1:.4f}")


Loading data...
Splitting data...
Using device: cpu

Training Early Fusion...
New best model saved! F1-score: 0.1403
Epoch [1/100]
Average Loss: 1.4076
Validation Accuracy: 0.5400, F1-score: 0.1403
--------------------------------------------------
Epoch [2/100]
Average Loss: 1.3248
Validation Accuracy: 0.5400, F1-score: 0.1403
--------------------------------------------------
New best model saved! F1-score: 0.1598
Epoch [3/100]
Average Loss: 1.2907
Validation Accuracy: 0.5450, F1-score: 0.1598
--------------------------------------------------
New best model saved! F1-score: 0.2477
Epoch [4/100]
Average Loss: 1.2380
Validation Accuracy: 0.5650, F1-score: 0.2477
--------------------------------------------------
New best model saved! F1-score: 0.2959
Epoch [5/100]
Average Loss: 1.1834
Validation Accuracy: 0.5600, F1-score: 0.2959
--------------------------------------------------
New best model saved! F1-score: 0.3113
Epoch [6/100]
Average Loss: 1.1004
Validation Accuracy: 0.5750, F1-

In [None]:
⁠ all_preds = ["your_prediction" for i in df['Utterance_ID']]
all_ids = df["Sr No."]
submission_df = pd.DataFrame({
        'Sr No.': all_ids,
        'Emotion': all_preds
    })
    
# Save the DataFrame to CSV
submission_df.to_csv("submission.csv", index=False) ⁠