In [1]:
import os
import csv
import pandas as pd

In [2]:
# Directory that downloaded GIF files are written to.
GIF_SAVE_PATH = "/home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs"
# CSV file that the per-GIF metadata (ID, file name, caption) is written to.
BLIP_CSV_PATH = "/home/jecroisp/Thesis/processed_data/TestingData/test_gif_metadata.csv"


## Downloading Gifs From Giphy

In [15]:
import os
import csv
import requests
from PIL import Image
from io import BytesIO

# Giphy API key. It is read from the environment so the secret is never saved
# inside the notebook; export GIPHY_API_KEY before starting the kernel.
# NOTE(review): a key was previously hard-coded here and should be rotated.
GIPHY_API_KEY = os.environ.get("GIPHY_API_KEY")
if not GIPHY_API_KEY:
    raise RuntimeError("Set the GIPHY_API_KEY environment variable before running this cell.")

# Query parameters for movie GIFs
SEARCH_TERM = "movies"
MAX_RESULTS = 1000  # Total number of GIFs to collect across all requests
BATCH_LIMIT = 50  # GIFs per API request
GIF_SAVE_PATH = "/home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs"
BLIP_CSV_PATH = "/home/jecroisp/Thesis/processed_data/TestingData/test_gif_metadata.csv"

# Ensure save directory exists
os.makedirs(GIF_SAVE_PATH, exist_ok=True)

def fetch_gifs_from_giphy(search_term, limit=50, max_results=1000):
    """Fetch GIF search results from Giphy in batches.

    Pages through the Giphy search endpoint with an increasing ``offset``
    until ``max_results`` distinct GIFs have been collected, the API returns
    an empty page, or a request fails.

    Args:
        search_term: Query string sent as the ``q`` parameter.
        limit: Maximum number of results requested per API call.
        max_results: Upper bound on the number of GIFs returned.

    Returns:
        A list of ``(gif_id, gif_url, caption)`` tuples. ``caption`` is the
        item's ``title``, or ``"No Caption"`` when the title is missing or
        empty. Whatever was collected before a failure is still returned.
    """
    url = "https://api.giphy.com/v1/gifs/search"
    gif_data = []
    seen_ids = set()  # IDs already collected, so repeated results are skipped
    offset = 0

    while len(gif_data) < max_results:
        params = {
            "api_key": GIPHY_API_KEY,
            "q": search_term,
            "limit": min(limit, max_results - len(gif_data)),
            "offset": offset,
            "rating": "pg",  # Adjust rating if needed
            "lang": "en"
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # network errors end the paging loop instead of raising.
        try:
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error fetching data from Giphy: {exc}")
            break

        if response.status_code != 200:
            print("Error fetching data from Giphy")
            break

        # A payload without "data" is treated like an empty page.
        items = response.json().get("data", [])
        if not items:
            break  # Stop if no more results

        for item in items:
            gif_id = item["id"]
            if gif_id in seen_ids:
                continue
            seen_ids.add(gif_id)
            gif_url = item["images"]["original"]["url"]
            # `or` also covers an empty title, which .get()'s default would not
            caption = item.get("title") or "No Caption"
            gif_data.append((gif_id, gif_url, caption))

        # Advance by the page size actually returned, duplicates included.
        offset += len(items)

    return gif_data

def download_gif(url, save_path, gif_id):
    """Download one GIF and store it as ``<gif_id>.gif`` inside ``save_path``.

    Args:
        url: Address of the GIF to download.
        save_path: Existing directory the file is written into.
        gif_id: Identifier used as the file's base name.

    Returns:
        The path of the written file, or ``None`` when the request fails or
        the server answers with a non-200 status.
    """
    # A timeout plus exception handling means one unreachable URL is reported
    # and skipped instead of aborting the caller's whole download loop.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to download: {url}")
        return None

    gif_name = os.path.join(save_path, f"{gif_id}.gif")  # Unique filename

    with open(gif_name, "wb") as f:
        f.write(response.content)
    print(f"Downloaded: {gif_name}")
    return gif_name

# Query Giphy for the list of (id, url, caption) records
gif_data = fetch_gifs_from_giphy(SEARCH_TERM, BATCH_LIMIT, MAX_RESULTS)

# Download every GIF and record one CSV row per successful download
with open(BLIP_CSV_PATH, mode="w", newline="", encoding="utf-8") as csv_file:
    metadata_writer = csv.writer(csv_file)
    metadata_writer.writerow(["GIF_ID", "File_Name", "Caption"])  # CSV Headers

    for gif_id, gif_url, caption in gif_data:
        saved_path = download_gif(gif_url, GIF_SAVE_PATH, gif_id)
        if not saved_path:
            continue  # failed downloads get no metadata row
        metadata_writer.writerow([gif_id, saved_path, caption])

print("Download complete! GIF metadata saved in CSV.")


Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/FEXGAddVac7K4VkY9C.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/UQvU6egcKE1eOd66Ss.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/UfVBxLG7IghCE.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/l0ErRtQDgjMtQcjsI.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/d2YWTOsVtuHgOHhC.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/njv4XzvImIOZxL3ddq.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/5caVuZgHsdr8I.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/yNZGTpQj4NN5K.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/9HWi6sWPRJfnG.gif
Downloaded: /home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs/vaRCdgM0fLNrW.gif
Downloaded: /home/jecroisp/Thesis/proc

## Preprocess Testing Data Set 

In [16]:
# Output path for the de-duplicated metadata CSV written by the cleanup step.
CLEANED_CSV_FILE_PATH = "/home/jecroisp/Thesis/processed_data/TestingData/cleaned_test_gif_metadata.csv"

In [None]:
def clean_csv_and_gifs(csv_path, gifs_folder, output_path=None):
    """De-duplicate the metadata CSV and delete GIF files it no longer references.

    Rows are de-duplicated on ``GIF_ID`` (first occurrence kept). Every regular
    file in ``gifs_folder`` whose name does not match the base name of a
    remaining ``File_Name`` entry is deleted from disk.

    Args:
        csv_path: Metadata CSV with at least ``GIF_ID`` and ``File_Name`` columns.
        gifs_folder: Directory holding the downloaded GIF files.
        output_path: Where the cleaned CSV is written. Defaults to the
            module-level ``CLEANED_CSV_FILE_PATH``.
    """
    if output_path is None:
        output_path = CLEANED_CSV_FILE_PATH

    # Load CSV file into a DataFrame
    df = pd.read_csv(csv_path)

    # Remove duplicate GIF_ID entries (keeping first occurrence)
    df_cleaned = df.drop_duplicates(subset=["GIF_ID"], keep="first")

    # Compare by base name rather than by full path string: a trailing slash or
    # a relative/absolute mismatch between `gifs_folder` and the stored paths
    # would otherwise make every file look unreferenced and delete all of them.
    keep_names = {os.path.basename(str(name)) for name in df_cleaned["File_Name"]}

    # Delete extra GIFs not in the cleaned list
    for gif_file in os.listdir(gifs_folder):
        gif_path = os.path.join(gifs_folder, gif_file)
        if not os.path.isfile(gif_path):
            continue  # os.remove cannot delete directories; leave them alone
        if gif_file not in keep_names:
            os.remove(gif_path)
            print(f"Deleted duplicate GIF: {gif_path}")

    # Save cleaned CSV file
    df_cleaned.to_csv(output_path, index=False)
    print(f"Cleaned metadata saved to: {output_path}")

# Run the cleanup on the downloaded test set. This is destructive: it deletes
# files in GIF_SAVE_PATH that the de-duplicated CSV does not reference.
clean_csv_and_gifs(BLIP_CSV_PATH, GIF_SAVE_PATH)

Cleaned metadata saved to: /home/jecroisp/Thesis/processed_data/TestingData/cleaned_test_gif_metadata.csv


## Label Each Gif

### Attempt to Use BLIP Instead

In [31]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load the BLIP processor and captioning model from the
# "Salesforce/blip-image-captioning-base" checkpoint. Both are module-level
# globals that generate_caption_blip reads.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def generate_caption_blip(image_path):
    """Generate a caption for the image at ``image_path`` using BLIP.

    The image is converted to RGB, run through the module-level ``processor``
    and ``model``, and the first generated sequence is decoded to text.
    NOTE(review): for animated GIFs this captions only the frame PIL exposes
    on open (presumably the first) -- confirm that is intended.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption, or ``"No caption generated"`` if any step fails
        (the error is printed, not raised).
    """
    try:
        # Image.open keeps the underlying file open; the context manager
        # closes it once the RGB copy exists, so captioning many files in a
        # row does not accumulate open handles.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")  # Convert to RGB
        inputs = processor(image, return_tensors="pt")

        output = model.generate(**inputs)
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the batch.
        print(f"Error processing {image_path}: {e}")
        return "No caption generated"


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [32]:
# `df` is not defined at notebook scope by any earlier cell (the frame inside
# clean_csv_and_gifs is local), so this cell raised NameError after
# Restart & Run All. Load the metadata explicitly.
# NOTE(review): assumes the cleaned CSV is the intended input -- confirm.
df = pd.read_csv(CLEANED_CSV_FILE_PATH)

# Caption every GIF file listed in the metadata and persist the result
df["Enhanced_Caption"] = df["File_Name"].apply(generate_caption_blip)
df.to_csv("gif_metadata_blip.csv", index=False)

print("BLIP-based captioning completed! Data saved.")


BLIP-based captioning completed! Data saved.


### Sentiment Analysis: Assign an Emotion to Each GIF Based on Its Caption

In [33]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon data, then build one analyzer instance that
# classify_emotion_vader reuses for every caption.
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

def classify_emotion_vader(caption):
    """Turn a caption's VADER compound score into a coarse emotion label.

    Returns "Happiness" when the compound score is at least 0.5, "Neutral"
    when it is above -0.2 (and below 0.5), and "Sadness" otherwise.
    """
    compound = sia.polarity_scores(caption)["compound"]

    if compound >= 0.5:
        return "Happiness"  # clearly positive
    if compound > -0.2:
        return "Neutral"  # mildly negative through mildly positive
    return "Sadness"  # everything else


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jecroisp/nltk_data...


In [34]:
from transformers import pipeline

# Load a text-classification pipeline backed by the
# "distilbert-base-uncased-finetuned-sst-2-english" checkpoint;
# classify_emotion_bert calls it once per caption.
sentiment_pipeline = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def classify_emotion_bert(caption):
    """Label a caption via the module-level ``sentiment_pipeline``.

    The top prediction's label is translated to an emotion: "POSITIVE" becomes
    "Happiness", "NEGATIVE" becomes "Sadness", and any other label "Neutral".
    """
    top_prediction = sentiment_pipeline(caption)[0]  # first (top) result

    # Lookup table instead of an if/elif chain; unknown labels fall back to Neutral
    label_to_emotion = {"POSITIVE": "Happiness", "NEGATIVE": "Sadness"}
    return label_to_emotion.get(top_prediction["label"], "Neutral")


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [35]:
# Load the dataset with BLIP captions.
# A dedicated name is used on purpose: rebinding BLIP_CSV_PATH here would make
# the earlier cleanup cell read this file instead of the downloaded metadata
# if it were re-run in the same kernel.
BLIP_CAPTIONS_CSV = "gif_metadata_blip.csv"
OUTPUT_CSV_FILE = "gif_metadata_sentiment.csv"

df = pd.read_csv(BLIP_CAPTIONS_CSV)

# Apply sentiment models (one label column per classifier)
df["VADER_Emotion"] = df["Enhanced_Caption"].apply(classify_emotion_vader)
df["BERT_Emotion"] = df["Enhanced_Caption"].apply(classify_emotion_bert)

# Save updated dataset
df.to_csv(OUTPUT_CSV_FILE, index=False)

print(f"Sentiment classification completed! Data saved to: {OUTPUT_CSV_FILE}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sentiment classification completed! Data saved to: gif_metadata_sentiment.csv


### DistilBERT GoEmotions Model for Alternative Labeling

In [38]:
from transformers import pipeline
import pandas as pd

# Load a text-classification pipeline from the
# "joeddav/distilbert-base-uncased-go-emotions-student" checkpoint. It
# classifies text, and predict_gif_emotion applies it to GIF captions.
emotion_pipeline = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student")

# Define function to predict emotion for each GIF using the external model
def predict_gif_emotion(gif_caption):
    """Classify a GIF caption with the module-level ``emotion_pipeline``.

    Returns a ``(label, score)`` pair taken from the pipeline's first result.
    If anything raises, the error is printed and ``("Unknown", 0.0)`` is
    returned instead.
    """
    try:
        top_result = emotion_pipeline(gif_caption)[0]  # first (top) result
        return top_result["label"], top_result["score"]
    except Exception as e:
        print(f"Error processing: {gif_caption} - {e}")
        return "Unknown", 0.0


config.json:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [39]:
# Load the BLIP-enhanced GIF dataset
CSV_FILE_PATH = "gif_metadata_blip.csv"  # Use latest dataset
OUTPUT_CSV_FILE = "gif_sentiment_labels.csv"

df = pd.read_csv(CSV_FILE_PATH)

# Apply sentiment model to each GIF caption.
# The (label, score) pairs are collected first and split into columns
# afterwards; unpacking `zip(*...)` directly raises ValueError when the
# dataset has no rows.
predictions = [predict_gif_emotion(caption) for caption in df["Enhanced_Caption"]]
df["GIF_Emotion"] = [label for label, _ in predictions]
df["Confidence_Score"] = [score for _, score in predictions]

# Save new sentiment labels to CSV
df.to_csv(OUTPUT_CSV_FILE, index=False)

print(f"Sentiment labeling completed! Data saved to: {OUTPUT_CSV_FILE}")


Sentiment labeling completed! Data saved to: gif_sentiment_labels.csv


In [42]:
# Define mapping dictionary
# Core emotion category -> the GoEmotions labels that collapse into it
_CORE_EMOTION_GROUPS = {
    "Anger": ("anger", "annoyance", "disapproval"),
    "Disgust": ("disgust",),
    "Fear": ("fear", "nervousness"),
    "Happiness": (
        "joy", "amusement", "excitement", "admiration", "gratitude",
        "love", "optimism", "pride", "relief",
    ),
    "Neutral": ("neutral", "curiosity", "realization", "approval"),
    "Sadness": ("sadness", "grief", "disappointment", "remorse", "embarrassment"),
}

# Flattened lookup: GoEmotions label -> core emotion category
emotion_mapping = {
    fine_label: core_label
    for core_label, fine_labels in _CORE_EMOTION_GROUPS.items()
    for fine_label in fine_labels
}

def map_goemotions_label(goemotions_label):
    """Translate a GoEmotions label into one of the six core emotion categories.

    Labels absent from ``emotion_mapping`` are reported as "Neutral".
    """
    try:
        return emotion_mapping[goemotions_label]
    except KeyError:
        return "Neutral"  # fallback for labels without an explicit mapping


In [43]:


# Input: GoEmotions predictions. Output: the same rows plus the mapped core emotion.
SENTIMENT_FILE = "gif_sentiment_labels.csv"
OUTPUT_MAPPED_FILE = "gif_sentiment_mapped.csv"

# Read the predictions and derive the coarse label column
df = pd.read_csv(SENTIMENT_FILE)
df = df.assign(Mapped_Emotion=df["GIF_Emotion"].apply(map_goemotions_label))

# Persist the mapped dataset
df.to_csv(OUTPUT_MAPPED_FILE, index=False)

print(f"Mapped sentiment labels saved to: {OUTPUT_MAPPED_FILE}")


Mapped sentiment labels saved to: gif_sentiment_mapped.csv


### Human Test

In [49]:
import os
import ipywidgets as widgets
import imageio
from IPython.display import Image as IPImage, display, clear_output
from PIL import Image as PILImage
import pandas as pd

# Define directories and output file
GIF_FOLDER = "/home/jecroisp/Thesis/processed_data/TestingData/downLoadedGifs"  # Path to your GIFs
LABELS_FILE = "manual_labels.csv"

# Emotion labels
LABELS = ["Anger", "Disgust", "Fear", "Happiness", "Neutral", "Sadness"]

# Load existing labels to avoid duplicates.
# save_label appends bare "file_name,label" rows and never writes a header, so
# the file has to be read header-less with explicit column names. Reading it
# with the default header inference consumed the first labelled row as the
# header and then raised KeyError on "File_Name". An empty file is treated
# like a missing one because pandas cannot parse it.
if os.path.exists(LABELS_FILE) and os.path.getsize(LABELS_FILE) > 0:
    labeled_df = pd.read_csv(LABELS_FILE, header=None, names=["File_Name", "Label"])
    labeled_files = set(labeled_df["File_Name"].astype(str))
else:
    labeled_files = set()

# Get GIFs to label. Sorted so the labelling order is the same on every run
# (os.listdir returns entries in arbitrary order).
gif_files = sorted(
    f for f in os.listdir(GIF_FOLDER) if f.endswith(".gif") and f not in labeled_files
)
index = 0  # Track current GIF index

def load_gif():
    """Show the middle frame of the current GIF together with the label buttons.

    Reads the module-level ``index`` and ``gif_files``. When every GIF has
    been handled, the output area is cleared and a completion message is
    printed instead. Otherwise the middle frame of ``gif_files[index]`` is
    written to ``temp_frame.jpg`` and rendered in ``output`` above the
    filename label and the button row.
    """
    if index >= len(gif_files):
        with output:
            clear_output()
            print("🎉 All GIFs labeled! 🎉")
        return

    gif_path = os.path.join(GIF_FOLDER, gif_files[index])
    gif = imageio.mimread(gif_path)  # reads every frame of the GIF
    middle_frame = gif[len(gif) // 2]

    # Convert to PIL Image
    frame_image = PILImage.fromarray(middle_frame)
    # JPEG has no alpha channel. Converting only RGBA left other alpha modes
    # (e.g. LA for grayscale + alpha) to fail in save(), so normalise every
    # mode except plain RGB and grayscale.
    if frame_image.mode not in ("RGB", "L"):
        frame_image = frame_image.convert("RGB")

    # Save as JPEG
    frame_image.save("temp_frame.jpg", "JPEG")

    with output:
        clear_output()
        display(IPImage(filename="temp_frame.jpg"))
        display(filename_label)
        display(buttons_box)

    filename_label.value = f"Labeling: {gif_files[index]}"



def save_label(label):
    """Append the chosen label for the current GIF to the CSV and show the next GIF.

    Writes one ``file_name,label`` row to ``LABELS_FILE``, advances the
    module-level ``index`` and calls ``load_gif``. Does nothing when there is
    no current GIF left to label.

    Args:
        label: Emotion label selected for ``gif_files[index]``.
    """
    global index

    # Guard against a click arriving after the last GIF was labelled.
    if index >= len(gif_files):
        return

    # csv.writer quotes file names containing commas or quotes, which the
    # previous string formatting wrote verbatim and thereby corrupted the row.
    # lineterminator="\n" keeps the row layout identical for ordinary names.
    with open(LABELS_FILE, "a", newline="", encoding="utf-8") as file:
        csv.writer(file, lineterminator="\n").writerow([gif_files[index], label])

    index += 1
    load_gif()

# Widgets shared by load_gif and save_label
output = widgets.Output()
filename_label = widgets.Label()


def _make_label_handler(label_text):
    """Build a click callback that records ``label_text`` for the current GIF."""
    def _handle_click(_button):
        save_label(label_text)
    return _handle_click


# One button per emotion, each bound to its own label
buttons = []
for label_text in LABELS:
    label_button = widgets.Button(description=label_text)
    label_button.on_click(_make_label_handler(label_text))
    buttons.append(label_button)

buttons_box = widgets.HBox(buttons)

# Render the first GIF into the output area, then show that area
load_gif()

display(output)


Output()