## Loading packages

In [None]:
# Mount Google Drive at /content/drive; later cells read data and saved models from paths under it.
from google.colab import drive
drive.mount('/content/drive')
# Install a pinned Keras release (3.3.2) via the shell.
!pip install keras==3.3.2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model
from sklearn.metrics import confusion_matrix, accuracy_score
import librosa
import librosa.display
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from keras.models import load_model
from sklearn.decomposition import PCA
import os
import cv2
import joblib



## Feature Extraction Methods

In [None]:
# Set the TF_ENABLE_ONEDNN_OPTS environment variable to '0'.
# NOTE(review): tensorflow has already been imported by the imports cell at this point;
# presumably this variable is only read when tensorflow is imported -- confirm, and if so
# move this assignment above the imports.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# InceptionV3 with ImageNet weights, no classification head (include_top=False) and
# pooling='avg' applied to the final feature maps.
base_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
# Wrapper with the same input and output as base_model; video_extract_features calls it.
model = Model(inputs=base_model.input, outputs=base_model.output)
# Genre names in label order: a genre's position in this tuple is its integer class id.
_GENRES = (
    "advertisement",
    "drama",
    "entertainment",
    "interview",
    "live_broadcast",
    "movie",
    "play",
    "recitation",
    "singing",
    "speech",
    "vlog",
)

# Genre name -> integer class id.
mapping_dict = {genre: class_id for class_id, genre in enumerate(_GENRES)}

# Integer class id -> genre name (the inverse of mapping_dict).
dict2 = {class_id: genre for class_id, genre in enumerate(_GENRES)}

def audio_features_extractor(file, n_fft=2048, hop_length=512, num_segments=5):
  """Compute one time-averaged MFCC vector per segment of an audio file.

  The file is loaded at 22050 Hz and only the first five seconds
  (22050 * 5 samples) are considered. That window is cut into
  ``num_segments`` equal, consecutive slices; for each slice 40 MFCCs are
  computed and averaged over the frame axis.

  Args:
    file: Path (or other object accepted by ``librosa.load``) of the audio.
    n_fft: FFT window size forwarded to ``librosa.feature.mfcc``.
    hop_length: Hop length forwarded to ``librosa.feature.mfcc``.
    num_segments: Number of equal slices the five-second window is cut into.

  Returns:
    A list with ``num_segments`` entries, each the per-coefficient mean of
    that slice's MFCC matrix.

  NOTE(review): clips shorter than five seconds produce truncated or empty
  trailing slices; how ``librosa.feature.mfcc`` reacts to an empty slice is
  not handled here -- confirm inputs are long enough.
  """
  SAMPLE_RATE = 22050
  SAMPLES_PER_TRACK = SAMPLE_RATE * 5
  samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)

  # Decode and resample once, outside the loop: the loaded signal does not
  # depend on the segment index, so reloading per segment only repeats work.
  audio, sample_rate = librosa.load(file, sr=SAMPLE_RATE)

  features = []
  for d in range(num_segments):
    start = samples_per_segment * d
    finish = start + samples_per_segment
    mfccs_features = librosa.feature.mfcc(y=audio[start:finish], sr=sample_rate, n_mfcc=40, n_fft=n_fft, hop_length=hop_length)
    # Transpose to (frames, coefficients) and average over frames.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    features.append(mfccs_scaled_features)
  return features

def video_extract_features(frame):
    """Run a single image through the module-level ``model`` and return its output.

    The image is resized to 299x299, passed through the InceptionV3
    ``preprocess_input`` helper, given a leading batch axis of size one and
    fed to ``model.predict``.

    NOTE(review): frames coming from ``cv2.VideoCapture`` are presumably in
    BGR channel order and no conversion is done here -- confirm this matches
    how the saved downstream models were trained before changing it.
    """
    resized = cv2.resize(frame, (299, 299))
    batch = np.expand_dims(preprocess_input(resized), axis=0)
    return model.predict(batch)

def compute_average_frame_and_features(video_path):
    """Average every frame of a video and extract features from the mean image.

    Each decoded frame is resized to 299x299 and accumulated in float32; the
    per-pixel mean is cast to uint8 and passed to ``video_extract_features``.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        The value returned by ``video_extract_features`` for the mean frame,
        or ``None`` when no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    total_frame = np.zeros((299, 299, 3), dtype=np.float32)
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            total_frame += cv2.resize(frame, (299, 299)).astype(np.float32)
            frame_count += 1
    finally:
        # Release the capture handle even if decoding or resizing raised.
        cap.release()

    if frame_count == 0:
        return None

    average_frame = (total_frame / frame_count).astype(np.uint8)
    return video_extract_features(average_frame)
def get_key(dictionary, search_value):
    """Return the first key of ``dictionary`` whose value equals ``search_value``.

    Keys are examined in the dictionary's iteration order; ``None`` is
    returned when no value matches.
    """
    for key, value in dictionary.items():
        if value == search_value:
            return key
    return None

class LSTMClassifier(nn.Module):
    """LSTM followed by a linear layer and a softmax.

    Args:
        input_dim: Feature size of each input row.
        hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        # Must be the dunder ``__init__``; a method named ``_init_`` is never
        # invoked by Python, leaving the module without its layers.
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return softmax scores computed over dimension 1 of the linear output.

        NOTE(review): this notebook passes a 2-D tensor, for which dim=1 is
        the class axis. For a 3-D (batch, seq, feature) input dim=1 would be
        the sequence axis -- confirm before reusing with batched sequences.
        """
        lstm_out, _ = self.lstm(x)
        # The linear head is applied to every LSTM output row; no single
        # time step is selected here.
        output = self.fc(lstm_out)
        return torch.softmax(output, dim=1)


In [None]:
# Count the entries listed in the batch_0 training directory on Drive.
!ls /content/drive/MyDrive/CS670/train/batch_0 | wc -l

1669


## Data Load

In [None]:
def load_data():
    """Extract audio features, video features and labels for every .mp4 in the test folder.

    For each ``.mp4`` file the genre is taken from the second ``-``-separated
    field of the file name and mapped through ``mapping_dict``. The audio
    track is written to ``output.wav`` with ffmpeg and passed to
    ``audio_features_extractor``; the video goes through
    ``compute_average_frame_and_features``. A clip is skipped (with a printed
    message) when no frame can be read or when ffmpeg fails, so the three
    returned arrays always stay aligned.

    Returns:
        Tuple ``(audio_features, video_features, labels)`` of numpy arrays
        with one entry per successfully processed clip, in sorted file-name
        order.

    Raises:
        KeyError: If a file name's genre field is not a key of ``mapping_dict``.
    """
    import subprocess

    vid_feat = []
    aud_feat = []
    labels = []
    path = '/content/drive/MyDrive/CS670_Project/test/'
    wav_path = 'output.wav'

    # os.listdir returns entries in arbitrary order; sort for a reproducible sample order.
    for video_file in sorted(os.listdir(path)):
        if not video_file.endswith('.mp4'):
            continue
        video_path = os.path.join(path, video_file)
        label = mapping_dict[video_file.split('-')[1]]

        video_features = compute_average_frame_and_features(video_path)
        if video_features is None:
            # No decodable frame: calling .squeeze() on None would crash.
            print(f"Skipping '{video_file}': no frames could be read.")
            continue

        # Argument-list form avoids shell quoting issues with the path.
        # A non-zero exit is detected so a stale output.wav from the previous
        # clip is never reused for this one.
        try:
            subprocess.run(
                ['ffmpeg', '-y', '-loglevel', 'error', '-i', video_path, '-vn',
                 '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2', wav_path],
                check=True,
            )
        except subprocess.CalledProcessError as err:
            print(f"Skipping '{video_file}': ffmpeg failed with exit code {err.returncode}.")
            continue

        audio_features = audio_features_extractor(wav_path)

        # Append all three together, only after every step succeeded.
        labels.append(label)
        vid_feat.append(video_features.squeeze())
        aud_feat.append(audio_features)
    return np.array(aud_feat), np.array(vid_feat), np.array(labels)


audio_features, video_features, labels = load_data()




ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
# Load a previously saved PCA object from Drive.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code; only load trusted files.
pca = joblib.load('/content/drive/MyDrive/CS670_Project/pca_model.pkl')

# Project the extracted video features with the loaded PCA before they reach the video classifier.
video_features_pca = pca.transform(video_features)


## Model

In [None]:
# Audio classifier saved in Keras HDF5 format.
classifier_audio = load_model('/content/drive/MyDrive/CS670_Project/audio/audiomodel.h5')
# The video model was saved as a whole pickled module, so LSTMClassifier must be
# defined before this line and full unpickling must be requested explicitly
# (newer PyTorch releases default to weights_only=True, which rejects such files).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
classifier_video = torch.load('/content/drive/MyDrive/CS670_Project/lstm_model_fullpca.pth', weights_only=False)
# Inference only: make sure the module is not left in training mode.
classifier_video.eval()

# Fusion classifier that takes both models' class scores as input.
# NOTE(review): joblib.load also unpickles; same trust caveat as above.
gb_classifier = joblib.load('/content/drive/MyDrive/CS670_Project/gb_classifier_model2.pkl')
video_tensor = torch.tensor(video_features_pca, dtype=torch.float32)

with torch.no_grad():
    y_pred_video = classifier_video(video_tensor)
y_pred_audio = classifier_audio.predict(audio_features)

# Fusion input: audio scores followed by video scores, concatenated per clip.
X_combined = np.concatenate((y_pred_audio, y_pred_video.numpy()), axis=1)
y_pred = gb_classifier.predict(X_combined)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step


In [None]:
# Display the shapes of the audio and video score matrices that get concatenated for the fusion model.
y_pred_audio.shape, y_pred_video.shape

((3, 11), torch.Size([3, 11]))

## Genre Prediction

In [None]:
# For every clip, print the true genre next to the genre chosen by each model.
for idx, fused_label in enumerate(y_pred):
    print("Original Genre: ", dict2[labels[idx]])
    print("Genre predicted by audio model: ", dict2[np.argmax(y_pred_audio[idx])])
    print("Genre predicted by video model: ", dict2[np.argmax(y_pred_video[idx].numpy())])
    print("Genre predicted by fusion model: ", dict2[fused_label])
    print("\n")

Original Genre:  interview
Genre predicted by audio model:  speech
Genre predicted by video model:  interview
Genre predicted by fusion model:  interview


Original Genre:  vlog
Genre predicted by audio model:  interview
Genre predicted by video model:  vlog
Genre predicted by fusion model:  vlog


Original Genre:  live_broadcast
Genre predicted by audio model:  live_broadcast
Genre predicted by video model:  live_broadcast
Genre predicted by fusion model:  live_broadcast




In [None]:
# Print the layer-by-layer summary of the loaded audio classifier.
classifier_audio.summary()

# Additional Testing

In [None]:
# Load arrays saved earlier on Drive.
# NOTE(review): judging by the file names these are presumably precomputed audio features,
# video features and integer genre labels with matching row order -- confirm.
X_audio2=np.load('/content/drive/MyDrive/CS670_Project/final_audio_feat2.npy')
X_video2=np.load('/content/drive/MyDrive/CS670_Project/final_video_feat2.npy')
y=np.load('/content/drive/MyDrive/CS670_Project/final_labels2.npy')

In [None]:
# NOTE(review): presumably the source file name of each sample, in the same row order as the
# feature arrays -- confirm.
names=np.load('/content/drive/MyDrive/CS670_Project/final_names2.npy')

In [None]:
num_samples = len(X_audio2)
test_size = 0.2
SPLIT_SEED = 42

# Seeded generator so the held-out subset is the same on every run.
split_rng = np.random.default_rng(SPLIT_SEED)
test_indices = split_rng.choice(num_samples, size=int(test_size * num_samples), replace=False)

# Row k of each X_test_* / Y_test array corresponds to original sample test_indices[k].
X_test_audio = X_audio2[test_indices]
X_test_video = X_video2[test_indices]
Y_test = y[test_indices]

# Complement of the test set via a boolean mask: linear time, instead of testing
# every index for membership in the test_indices array.
is_test = np.zeros(num_samples, dtype=bool)
is_test[test_indices] = True
train_indices = np.flatnonzero(~is_test)

X_train_audio = X_audio2[train_indices]
X_train_video = X_video2[train_indices]
y_train = y[train_indices]

In [None]:
# Score the held-out split with both single-modality models, then with the fusion model.
# NOTE(review): X_test_video is fed to the video model without pca.transform --
# presumably the saved features are already PCA-projected; confirm.
video_tensor2 = torch.tensor(X_test_video, dtype=torch.float32)

with torch.no_grad():
    y_pred_video2 = classifier_video(video_tensor2)
y_pred_audio2 = classifier_audio.predict(X_test_audio)

# Audio scores first, video scores second, stacked column-wise per sample.
X_combined2 = np.hstack([y_pred_audio2, y_pred_video2.numpy()])
y_pred2 = gb_classifier.predict(X_combined2)

[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 225ms/step


In [None]:
import shutil
import os

def copy_file(source_dir, destination_dir, filename):
    """Find ``filename`` anywhere under ``source_dir`` and copy it into ``destination_dir``.

    The directory tree is walked with ``os.walk``; the first directory that
    contains ``filename`` supplies the file. ``destination_dir`` is created
    if it does not exist.

    Args:
        source_dir: Root directory to search recursively.
        destination_dir: Directory that receives the copy.
        filename: Base name of the file to look for.

    Returns:
        The path of the copied file, or ``None`` when ``filename`` was not
        found under ``source_dir`` (a message is printed in that case).
    """
    for root, _dirs, files in os.walk(source_dir):
        if filename not in files:
            continue

        source_file = os.path.join(root, filename)
        destination_file = os.path.join(destination_dir, filename)

        # Create the destination directory only once a file is actually found.
        os.makedirs(destination_dir, exist_ok=True)
        shutil.copyfile(source_file, destination_file)

        print(f"File '{filename}' copied successfully!")
        return destination_file

    # Report the miss instead of silently doing nothing.
    print(f"File '{filename}' not found under '{source_dir}'.")
    return None

# Source and destination directories
# Directory searched for the original video files.
source_dir = '/content/drive/MyDrive/CS670/train/video/'
# Directory that receives the copies; it is the same folder load_data reads its .mp4 files from.
destination_dir = '/content/drive/MyDrive/CS670_Project/test/'




# Results where the video model is wrong but the audio and fusion models are correct

In [None]:
# Row i of y_pred2 / y_pred_audio2 / y_pred_video2 belongs to the held-out sample
# test_indices[i], so the ground truth is Y_test[i] (= y[test_indices[i]]), not y[i].
for i in range(len(y_pred2)):
  true_label = Y_test[i]
  audio_label = np.argmax(y_pred_audio2[i])
  video_label = np.argmax(y_pred_video2[i].numpy())
  # Keep cases where fusion and audio are correct but video is wrong.
  if (true_label == y_pred2[i] and true_label != video_label and true_label == audio_label):
    #copy_file(source_dir, destination_dir, names[test_indices[i]])
    print("Original Genre: ",dict2[true_label])
    print("Genre predicted by audio model: ",dict2[audio_label])
    print("Genre predicted by video model: ",dict2[video_label])
    print("Genre predicted by fusion model: ",dict2[y_pred2[i]])
    print("\n")

Original Genre:  vlog
Genre predicted by audio model:  vlog
Genre predicted by video model:  interview
Genre predicted by fusion model:  vlog


Original Genre:  vlog
Genre predicted by audio model:  vlog
Genre predicted by video model:  entertainment
Genre predicted by fusion model:  vlog


Original Genre:  speech
Genre predicted by audio model:  speech
Genre predicted by video model:  interview
Genre predicted by fusion model:  speech


Original Genre:  interview
Genre predicted by audio model:  interview
Genre predicted by video model:  speech
Genre predicted by fusion model:  interview


Original Genre:  interview
Genre predicted by audio model:  interview
Genre predicted by video model:  vlog
Genre predicted by fusion model:  interview


Original Genre:  interview
Genre predicted by audio model:  interview
Genre predicted by video model:  speech
Genre predicted by fusion model:  interview


Original Genre:  interview
Genre predicted by audio model:  interview
Genre predicted by vid

# Results where the audio model is wrong but the video and fusion models are correct

In [None]:
# Row i of y_pred2 / y_pred_audio2 / y_pred_video2 belongs to the held-out sample
# test_indices[i], so the ground truth is Y_test[i] (= y[test_indices[i]]), not y[i].
for i in range(len(y_pred2)):
  true_label = Y_test[i]
  audio_label = np.argmax(y_pred_audio2[i])
  video_label = np.argmax(y_pred_video2[i].numpy())
  # Keep cases where fusion and video are correct but audio is wrong.
  if (true_label == y_pred2[i] and true_label == video_label and true_label != audio_label):
    print("Original Genre: ",dict2[true_label])
    print("Genre predicted by audio model: ",dict2[audio_label])
    print("Genre predicted by video model: ",dict2[video_label])
    print("Genre predicted by fusion model: ",dict2[y_pred2[i]])
    print("\n")

Original Genre:  vlog
Genre predicted by audio model:  interview
Genre predicted by video model:  vlog
Genre predicted by fusion model:  vlog


Original Genre:  interview
Genre predicted by audio model:  entertainment
Genre predicted by video model:  interview
Genre predicted by fusion model:  interview


Original Genre:  live_broadcast
Genre predicted by audio model:  entertainment
Genre predicted by video model:  live_broadcast
Genre predicted by fusion model:  live_broadcast


Original Genre:  interview
Genre predicted by audio model:  vlog
Genre predicted by video model:  interview
Genre predicted by fusion model:  interview


Original Genre:  vlog
Genre predicted by audio model:  live_broadcast
Genre predicted by video model:  vlog
Genre predicted by fusion model:  vlog


Original Genre:  interview
Genre predicted by audio model:  live_broadcast
Genre predicted by video model:  interview
Genre predicted by fusion model:  interview


Original Genre:  interview
Genre predicted by au

# Results where both the audio and video models are wrong but the fusion model is correct

In [None]:
# Row i of y_pred2 / y_pred_audio2 / y_pred_video2 belongs to the held-out sample
# test_indices[i], so the ground truth is Y_test[i] (= y[test_indices[i]]), not y[i].
for i in range(len(y_pred2)):
  true_label = Y_test[i]
  audio_label = np.argmax(y_pred_audio2[i])
  video_label = np.argmax(y_pred_video2[i].numpy())
  # Keep cases where only the fusion model is correct.
  if (true_label == y_pred2[i] and true_label != video_label and true_label != audio_label):
    print("Original Genre: ",dict2[true_label])
    print("Genre predicted by audio model: ",dict2[audio_label])
    print("Genre predicted by video model: ",dict2[video_label])
    print("Genre predicted by fusion model: ",dict2[y_pred2[i]])
    print("\n")

Original Genre:  speech
Genre predicted by audio model:  interview
Genre predicted by video model:  vlog
Genre predicted by fusion model:  speech


Original Genre:  speech
Genre predicted by audio model:  vlog
Genre predicted by video model:  singing
Genre predicted by fusion model:  speech


Original Genre:  speech
Genre predicted by audio model:  live_broadcast
Genre predicted by video model:  vlog
Genre predicted by fusion model:  speech


Original Genre:  advertisement
Genre predicted by audio model:  vlog
Genre predicted by video model:  vlog
Genre predicted by fusion model:  advertisement


Original Genre:  vlog
Genre predicted by audio model:  speech
Genre predicted by video model:  speech
Genre predicted by fusion model:  vlog


Original Genre:  vlog
Genre predicted by audio model:  live_broadcast
Genre predicted by video model:  speech
Genre predicted by fusion model:  vlog


Original Genre:  entertainment
Genre predicted by audio model:  vlog
Genre predicted by video model:  