1. **Data Loading**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel
import torch
from torchvision import models, transforms
import cv2
import os

# Load the textual data CSV
textual_data_path = '/content/drive/MyDrive/ra_task/Sample.csv'
text_data = pd.read_csv(textual_data_path)

# Function to get all video paths from a directory
def get_video_paths(directory_path):
    video_extensions = ('.mp4', '.avi', '.mov')
    video_paths = [os.path.join(directory_path, fname) for fname in os.listdir(directory_path) if fname.endswith(video_extensions)]
    return video_paths

# Specify the directory path
directory_path = '/content/drive/MyDrive/ra_task/sample'
video_paths = get_video_paths(directory_path)

# Check if all video paths are loaded
print(f"Total videos found: {len(video_paths)}")
print(video_paths)

ground_df= pd.read_csv('/content/drive/MyDrive/ra_task/ground-truth.csv')
ground_df.head()

Total videos found: 150
['/content/drive/MyDrive/ra_task/sample/2194673.mp4', '/content/drive/MyDrive/ra_task/sample/2142915.mp4', '/content/drive/MyDrive/ra_task/sample/1702851.mp4', '/content/drive/MyDrive/ra_task/sample/1671980.mp4', '/content/drive/MyDrive/ra_task/sample/2381477.mp4', '/content/drive/MyDrive/ra_task/sample/1749291.mp4', '/content/drive/MyDrive/ra_task/sample/1667694.mp4', '/content/drive/MyDrive/ra_task/sample/2764983.mp4', '/content/drive/MyDrive/ra_task/sample/2807978.mp4', '/content/drive/MyDrive/ra_task/sample/3212463.mp4', '/content/drive/MyDrive/ra_task/sample/1488315.mp4', '/content/drive/MyDrive/ra_task/sample/2379465.mp4', '/content/drive/MyDrive/ra_task/sample/3037506.mp4', '/content/drive/MyDrive/ra_task/sample/3149347.mp4', '/content/drive/MyDrive/ra_task/sample/3414303.mp4', '/content/drive/MyDrive/ra_task/sample/1702594.mp4', '/content/drive/MyDrive/ra_task/sample/3119347.mp4', '/content/drive/MyDrive/ra_task/sample/1471363.mp4', '/content/drive/MyDri

Unnamed: 0,Timestamp,creative_data_id,"Is there a call to go online (e.g., shop online, visit the Web)?","Is there online contact information provided (e.g., URL, website)?","Is there a visual or verbal call to purchase (e.g., buy now, order now)?","Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?","Is there an incentive to buy (e.g., a discount, a coupon, a sale or ""limited time offer"")?","Is there offline contact information provided (e.g., phone, mail, store location)?",Is there mention of something free?,"Does the ad mention at least one specific product or service (e.g., model, type, item)?",...,Was there a famous person in this ad?,"If yes to the above, write the name of the famous person, if known.",What happened in this ad? (Answer in 2-3 sentences each),What was/were the company's goal(s) with this ad? Choose (potentially multiple) from:,How successful was the ad in achieving its goal(s)?,"How much did you like the ad? (1. Strongly dislike, 2. Dislike, 3. Neither Like or Dislike, 4. Like, 5. Strongly Like)","What was the slogan presented in the ad, if any?","After addressing the specific survey items, write a general description of the ad. You can use answers to the questions above to formulate your answer. Your description should include:\nBrand and Product Identification: \nSpecify the brand and whether a product is being advertised. (1 sentence)\nVisual Elements: Describe what is seen on the screen, including setting, characters, and any text or graphics. (max 2 sentences)\nAuditory Elements: Note what is heard, such as dialogue, voice-over, music, or sound effects. (max 2 sentences)\n",Any additional feedback or things we should be aware of?,Please enter the video identifier one more time (e.g. 123456789.mp4)
0,5/16/2024 8:00:14,1471363,No,Yes,No,No,No,No,No,Yes,...,No,,During this ad we had a man and a woman taking...,Change how consumers feel about the product/br...,3,3,,Product Identification: Mini Countryman SUV\nM...,I believe the text legibility should be improv...,1471363.mp4
1,5/23/2024 2:35:55,1471363,No,Yes,No,No,No,No,No,Yes,...,No,,We watch Mini USA new Countryman driver around...,Directly persuade consumers to purchase,2,2,,Mini USA is advertising their new Countryman c...,,1471363.mp4
2,5/15/2024 18:09:53,1488315,No,No,No,No,No,No,No,No,...,No,,The end of the world seems to have been brough...,"Increase awareness of product/brand, Make cons...",3,3,"Their usual slogan was not presented, but ""Tom...",The brand being advertised is Jose Cuervo tequ...,,1488315.mp4
3,5/16/2024 6:18:33,1488315,No,No,No,No,No,No,No,Yes,...,No,,This ad is set in a bar in the middle of a des...,"Increase awareness of product/brand, Change ho...",4,4,"Cuervo, ""Tomorrow is Overrated""","This ad is for the brand Jose Cuervo, advertis...",,1488315.mp4
4,5/24/2024 10:00:26,1488315,No,No,No,No,No,No,No,Yes,...,No,,It started with a disaster occurring and the n...,Increase awareness of product/brand,4,3,Tomorrow is overrated,The brand being advertised was Jose Cuervo teq...,,1488315.mp4


2. **Data Preprocessing**

In [3]:
# Fill missing values with a default value (e.g., 'No')
ground_df.fillna('No', inplace=True)

In [4]:
# Drop columns with any None values
ground_df = ground_df.dropna(axis=1, how='any')

In [5]:
# Define the question columns with trimmed spaces
question_columns = [
    'Is there a call to go online (e.g., shop online, visit the Web)?',
    'Is there online contact information provided (e.g., URL, website)?',
    'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
    'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?',
    'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?',
    'Is there offline contact information provided (e.g., phone, mail, store location)?',
    'Is there mention of something free?',
    'Does the ad mention at least one specific product or service (e.g., model, type, item)?',
    'Is there any verbal or visual mention of the price?',
    'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.',
    'Does the ad show the brand or trademark exactly once at the end of the ad?',
    'Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)',
    'Does the ad give you a positive feeling about the brand?',
    'Does the ad have a story arc, with a beginning and an end?',
    'Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?',
    'Does the ad have relatable characters?',
    'Is the ad creative/clever?',
    'Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.)',
    'Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)?',
    'Is the ad visually pleasing?',
    'Does the ad have cute elements like animals, babies, animated, characters, etc?'
]

# Trim the spaces from the DataFrame column names
ground_df.columns = ground_df.columns.str.strip()

# Select the columns with trimmed spaces from the DataFrame
labels = ground_df[question_columns].values

3. **Feature Extraction**

In [6]:
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Advanced Text Feature Extraction with Fine-tuned BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to preprocess text
def preprocess_text(text):
    """
    Preprocess a text by converting it to lowercase, removing punctuation, numbers, and stopwords.
    """
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Function to extract BERT features from text
def extract_bert_features(texts):
    text_features = []
    for text in texts:
        inputs = tokenizer(preprocess_text(text), return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        with torch.no_grad():
            outputs = model(**inputs)
        text_features.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())
    return np.vstack(text_features)

# Extract text features from descriptions and speeches
description_features = extract_bert_features(text_data['creative_data_description'].tolist())
speech_features = extract_bert_features(text_data['speech'].tolist())

# Combine text features
text_features = np.hstack((description_features, speech_features))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [27]:
print("Shape of text features:", text_features.shape)

Shape of text features: (150, 1536)


In [8]:
# Advanced Visual Feature Extraction with EfficientNet
efficientnet = models.efficientnet_b0(pretrained=True)
efficientnet.eval()

def extract_frames(video_path, max_frames=30):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = max(1, frame_count // max_frames)
    for i in range(frame_count):
        ret, frame = cap.read()
        if not ret:
            break
        if i % frame_interval == 0:
            frames.append(frame)
        if len(frames) >= max_frames:
            break
    cap.release()
    return frames

preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_visual_features(video_paths, max_frames=30):
    visual_features = []
    for video_path in video_paths:
        frames = extract_frames(video_path, max_frames)
        for frame in frames:
            input_tensor = preprocess(frame).unsqueeze(0)
            with torch.no_grad():
                output = efficientnet(input_tensor)
            visual_features.append(output.numpy().flatten())
    return np.array(visual_features)

# Extract visual features
visual_features = extract_visual_features(video_paths)



In [28]:
print("Shape of visual features:", visual_features.shape)

Shape of visual features: (150, 1000)


In [10]:
# Ensure consistent sample size
n_samples = min(len(text_features), len(visual_features))
text_features = text_features[:n_samples]
visual_features = visual_features[:n_samples]
labels = ground_df[question_columns].values[:n_samples]

In [11]:
# Combine Features
combined_features = np.hstack((text_features, visual_features)).astype(np.float32)

In [40]:
print("Shape of labels:", labels.shape)
print("Shape of combined features:", combined_features.shape)

Shape of labels: (150, 21)
Shape of combined features: (150, 2536)


4. **Model Training**

In [33]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Hyperparameter Tuning and Model Training with GridSearchCV
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

best_classifiers = []
for i in range(labels.shape[1]):
    clf = GradientBoostingClassifier()
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train[:, i])
    best_classifiers.append(grid_search.best_estimator_)

# Predict and evaluate
y_pred = np.zeros_like(y_test)
for i, clf in enumerate(best_classifiers):
    y_pred[:, i] = clf.predict(X_test)



5. **Prediction**

In [34]:
# Create a DataFrame for the predicted answers
predicted_answers = []  # This should come from the prediction loop

# Ensure each creative_data_id is predicted only once
predicted_ids = set()
for i in range(combined_features.shape[0]):
    video_answers = []
    if text_data['creative_data_id'].values[i] in predicted_ids:
        continue
    predicted_ids.add(text_data['creative_data_id'].values[i])
    for clf in best_classifiers:
        pred = clf.predict(combined_features[i].reshape(1, -1))[0]
        video_answers.append(pred)
    predicted_answers.append(video_answers)

predicted_answers_df = pd.DataFrame(predicted_answers, columns=question_columns)

# Add video IDs to the DataFrame
video_ids = text_data['creative_data_id'].values[:predicted_answers_df.shape[0]]
predicted_answers_df.insert(0, 'creative_data_id', video_ids)

# Save the predicted answers to a CSV file
predicted_answers_df.to_csv('predicted_answers.csv', index=False)

6. **Evaluation** 

In [41]:
# Calculate metrics
precision_scores = []
recall_scores = []
f1_scores = []
agreement_percentages = []

for i in range(y_test.shape[1]):
    precision = precision_score(y_test[:, i], y_pred[:, i], average='macro')
    recall = recall_score(y_test[:, i], y_pred[:, i], average='macro')
    f1 = f1_score(y_test[:, i], y_pred[:, i], average='macro')
    agreement_percentage = np.mean(y_test[:, i] == y_pred[:, i]) * 100

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    agreement_percentages.append(agreement_percentage)

# Print metrics
for i, question in enumerate(question_columns):
    print(f'Question: {question}')
    print(f'  Precision: {precision_scores[i]:.2f}')
    print(f'  Recall: {recall_scores[i]:.2f}')
    print(f'  F1 Score: {f1_scores[i]:.2f}')
    print(f'  Agreement Percentage: {agreement_percentages[i]:.2f}%\n')

Question: Is there a call to go online (e.g., shop online, visit the Web)?
  Precision: 0.48
  Recall: 0.49
  F1 Score: 0.44
  Agreement Percentage: 60.00%

Question: Is there online contact information provided (e.g., URL, website)?
  Precision: 0.47
  Recall: 0.47
  F1 Score: 0.47
  Agreement Percentage: 46.67%

Question: Is there a visual or verbal call to purchase (e.g., buy now, order now)?
  Precision: 0.56
  Recall: 0.55
  F1 Score: 0.51
  Agreement Percentage: 53.33%

Question: Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?
  Precision: 0.38
  Recall: 0.33
  F1 Score: 0.35
  Agreement Percentage: 43.33%

Question: Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?
  Precision: 0.61
  Recall: 0.58
  F1 Score: 0.57
  Agreement Percentage: 60.00%

Question: Is there offline contact information provided (e.g., phone, mail, store location)?
  Precision: 0.62
  Recall: 0.63
  F1 Score: 0.62
  Agr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
