**Dependencies:**<br>
pip install moviepy <br>
pip install audiofile - this will give you SoX, another dependency.<br>
pip install audeer<br>
pip install audonnx<br>
pip install ffmpeg<br>
pip install transformers pandas numpy - these are also imported by the code below.<br>

In [None]:
#basic
import os
import pandas as pd
import numpy as np
import glob
import re

#media processing
from moviepy.editor import VideoFileClip

#models
from transformers import pipeline
import audiofile

import audeer
import audonnx
import ffmpeg
import time

# Root folders: the source movie files and the per-task ISC analysis outputs.
vid_path = "/Volumes/Scraplab/psypose_fmri/movie_files/"
isc_outs = '/Volumes/Scraplab/psypose_fmri/isc_analysis/'

# Task identifiers, as used for the per-task folders and file names under isc_outs.
tasknames = [
    '12yearsaslave',
    '500daysofsummer',
    'backtothefuture',
    'citizenfour',
    'littlemisssunshine',
    'pulpfiction',
    'split',
    'theprestige',
    'theshawshankredemption',
    'theusualsuspects',
]

# Movie file stems under vid_path, listed in the same order as tasknames.
vidnames = [
    '12_years_a_slave',
    '500_days_of_summer',
    'back_to_the_future',
    'citizenfour',
    'little_miss_sunshine',
    'pulp_fiction',
    'split',
    'the_prestige',
    'the_shawshank_redemption',
    'the_usual_suspects',
]

# Pair the two lists position by position: task identifier -> movie file stem.
# Note that zippedlist is a one-shot iterator and is exhausted once the map is built.
zippedlist = zip(tasknames, vidnames)
tasktovidmap = {task_id: video_stem for task_id, video_stem in zippedlist}

# Load the pretrained model
#
# The model archive is fetched into ./cache and unpacked into ./model, then
# loaded with audonnx. Downloading and unpacking are skipped when their result
# is already on disk, so re-running this cell does not fetch the archive again.
# If a previous run was interrupted, delete ./cache and ./model to start clean.

url = 'https://zenodo.org/record/6221127/files/w2v2-L-robust-12.6bc4a7fd-1.1.0.zip'
cache_root = audeer.mkdir('cache')
model_root = audeer.mkdir('model')

# download_url() is given a folder, so the file is stored there under the
# URL's basename; that is the path checked for an existing download.
archive_path = os.path.join(cache_root, os.path.basename(url))

# audeer.mkdir() always creates model_root, so "already extracted" is judged by
# whether the folder has any content rather than by whether it exists.
if not os.listdir(model_root):
    if not os.path.exists(archive_path):
        archive_path = audeer.download_url(url, cache_root, verbose=True)
    audeer.extract_archive(archive_path, model_root)

model = audonnx.load(model_root)

# Cut audio clips for the model to score
#
# For every task, read its cleaned diarization table and, for each row whose
# duration is strictly between 3 and 11, write the audio of that stretch of the
# movie to <isc_outs>/<task>/emo_clips/<start>_<stop>_clip.wav.

for task in tasknames:
    task_to_vid = tasktovidmap[task]
    diar = pd.read_csv(isc_outs+task+os.sep+task+"_diarization_cleaned.csv")

    # The output folder is not guaranteed to exist; create it up front so the
    # first write does not fail on a fresh directory tree.
    clip_dir = isc_outs+task+os.sep+"emo_clips/"
    os.makedirs(clip_dir, exist_ok=True)

    video = VideoFileClip(vid_path+task_to_vid+'.mp4')
    try:
        for seg_start, seg_stop, seg_duration in zip(diar["start"], diar["stop"], diar["duration"]):
            if not 3 < seg_duration < 11:
                continue
            # Boundaries are truncated to whole numbers; the same values are
            # used for the cut and for the file name.
            start, stop = int(seg_start), int(seg_stop)
            clip = video.subclip(start, stop)
            clip.audio.write_audiofile(clip_dir+str(start)+"_"+str(stop)+'_clip.wav')
    finally:
        # Release the movie's reader before moving on to the next movie.
        video.close()

# Generate Arousal and Valence Scores

In [None]:
# Score every clip with the model and write one annotation CSV per task.
#
# Output: <isc_outs>/<task>/<task>_emotion_annotations5.csv with the columns
# start, stop, arousal, valence (one row per clip; start/stop are taken from
# the clip's file name).
print("started emotion annotation at:",time.strftime("%I:%M %p"))

for task in tasknames:
    task_clips = glob.glob(os.path.join(isc_outs+task+os.sep+"emo_clips/*_clip.wav"))
    # Order clips by their numeric start/stop so rows come out chronologically;
    # a plain string sort would place "100_105" before "20_25".
    task_clips.sort(
        key=lambda path: tuple(float(part) for part in os.path.basename(path).split("_")[:2])
    )

    rows = []
    for clip in task_clips:
        audio, sampling_rate = audiofile.read(clip,always_2d=True)
        # Collapse the first axis by summation to get a 1-D signal.
        # NOTE(review): presumably axis 0 is the channel axis, so this sums
        # (rather than averages) the channels - confirm this is intended.
        audio_file = np.sum(audio,axis=0)
        # NOTE(review): no resampling happens here; the clips are passed at
        # whatever rate they were written with. Confirm that this matches the
        # sampling rate the model expects.
        out = model(audio_file, sampling_rate)

        # File names look like "<start>_<stop>_clip.wav".
        start,stop,_ = os.path.basename(clip).split("_")
        rows.append({
            "start": start,
            "stop": stop,
            # Logit 0 is stored as arousal and logit 2 as valence; logit 1 is
            # not used (presumably dominance - verify against the model card).
            "arousal": out['logits'][0][0],
            "valence": out['logits'][0][2],
        })

    # Passing the columns explicitly keeps the header even when a task has no clips.
    df = pd.DataFrame(rows, columns=["start","stop","arousal","valence"])
    df.to_csv(isc_outs+task+os.sep+task+"_emotion_annotations5.csv",index=False)

    # Reported once per task, so progress is visible while later tasks still run.
    print(task,"finished emotion annotation at:",time.strftime("%I:%M %p"))