In [None]:
## First step 
## Download the MELD corpus to a specific location
## This program converts each video file (.mp4) into a mono, 16 kHz audio file (.wav)

#Read All files and convert video files to wav file format
import os
import subprocess

#Specify path of input folder
audio_path = '/home/jay_kejriwal/MELD/MELD.Raw/dev_splits_complete'
#Specify path of output folder
output_path = '/home/jay_kejriwal/MELD/MELD.Raw/dev_splits_resampled'
#Do this for all files i.e. train,val sets  


def build_ffmpeg_command(source, destination):
    """Return the ffmpeg argument list that converts one video to mono 16 kHz wav.

    The command is a list rather than a shell string, so paths containing
    spaces or shell metacharacters are passed to ffmpeg unchanged.
    """
    return ['ffmpeg', '-i', source,
            '-ac', '1', '-ar', '16000', '-f', 'wav', '-vn',
            destination]


def convert_videos_to_wav(input_dir, output_dir):
    """Convert every ``.mp4`` found under ``input_dir`` to a ``.wav`` in ``output_dir``.

    Parameters
    ----------
    input_dir : str
        Folder that is walked recursively for ``.mp4`` files.
    output_dir : str
        Folder receiving the ``.wav`` files; created when the first video is found.

    Returns
    -------
    list of str
        Paths of the wav files whose ffmpeg run exited with status 0.

    Raises
    ------
    FileNotFoundError
        If the ``ffmpeg`` executable is not on the PATH.
    """
    converted = []
    for root, _dirs, files in os.walk(input_dir):
        for file in files:
            if not file.endswith('.mp4'):
                continue
            source = os.path.join(root, file)
            destination = os.path.join(output_dir, os.path.splitext(file)[0] + '.wav')
            # ffmpeg does not create missing output folders itself
            os.makedirs(output_dir, exist_ok=True)
            command = build_ffmpeg_command(source, destination)
            print(' '.join(command))
            completed = subprocess.run(command)
            if completed.returncode == 0:
                converted.append(destination)
            else:
                # Keep going, but make the failure visible instead of dropping it
                print(f"ffmpeg failed with exit code {completed.returncode} for {source}")
    return converted


converted_files = convert_videos_to_wav(audio_path, output_path)




In [None]:
## Second step
## This program reads all csv files and merges them into one big csv file
## The csv file has information about Utterance, Season, Episode, Speaker information 

#Pre-process text files
import glob
import pandas as pd
import os 
import numpy as np

# Specify path of dataset folder
csv_files = glob.glob(r'D:\D drive\MELD corpus\MELD_Dyadic\*.csv',recursive=True)

# Specify output path
out_name=r'D:\D drive\MELD corpus\output\MELD_Dyadic_all.csv'

# Folder holding the resampled audio of each split, keyed by the csv base name
split_audio_dirs = {
    'train_sent_emo_dya': r'D:\D drive\MELD corpus\train_splits_resampled',
    'dev_sent_emo_dya': r'D:\D drive\MELD corpus\dev_splits_complete_resampled',
    'test_sent_emo_dya': r'D:\D drive\MELD corpus\output_repeated_splits_test_resampled',
}

if not csv_files:
    # Fail with a clear message instead of an obscure error further down
    raise FileNotFoundError('No csv files matched the dataset folder pattern')

# Read every csv, remember which file each row came from, and concatenate once.
# (DataFrame.append was removed in pandas 2.0 and copied the frame on each call.)
frames = []
for file in csv_files:
    df = pd.read_csv(file)
    df['mainfile'] = os.path.splitext(os.path.basename(file))[0]
    frames.append(df)
df_csv_append = pd.concat(frames, ignore_index=True)

df_csv_append = df_csv_append.sort_values(['Season', 'Episode'], ascending=[True, True])

# One boolean mask per known split, with the matching audio folder as its value
conditions = [df_csv_append['mainfile'] == split_name for split_name in split_audio_dirs]
values = list(split_audio_dirs.values())

# Rows from an unrecognised csv keep the historical '0' placeholder. The default
# must be a string: newer NumPy refuses to mix string choices with the implicit
# integer default 0.
df_csv_append['File_path'] = np.select(conditions, values, default='0')

id_columns = ['Old_Dialogue_ID', 'Old_Utterance_ID', 'Season', 'Episode']
df_csv_append[id_columns] = df_csv_append[id_columns].astype(str)

# Full path of the wav file of each utterance: <split folder>\dia<D>_utt<U>.wav
df_csv_append["Full_path"] = (
    df_csv_append['File_path']
    + r'\dia' + df_csv_append['Old_Dialogue_ID']
    + '_utt' + df_csv_append['Old_Utterance_ID']
    + '.wav'
)
df_csv_append.to_csv(out_name, index=False,header=True,sep='\t')

In [4]:
## Third step
## This program concatenates the emotional states of adjacent turns in a dyad.
## For instance, if speaker A's turn is labelled with a positive emotional state and speaker B's turn with a
## negative emotional state, the program concatenates the two labels and writes them as positive-negative

#Program to concatenate adjacent emotional states
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import BertTokenizer, BertModel
import torch
from functools import reduce
import csv
from sklearn.metrics.pairwise import cosine_similarity
import random
import subprocess
import re
import glob
import os,sys
import numpy as np
import soundfile as sf
import wave
import json

from sklearn.metrics.pairwise import cosine_similarity
import pickle


#Specify input 
file_name=r"D:\D drive\MELD corpus\output\MELD_Dyadic_all.csv"
#Specify output 
output_filename=r'D:\D drive\MELD corpus\output\MELD_Dyadic_all_pairdistance.csv'

# Columns written to the output; each value is "<turn i>-<turn i+1>"
pair_columns = ['Old_Dialogue_ID', 'Old_Utterance_ID', 'Speaker', 'Emotion', 'Sentiment']

df = pd.read_csv(file_name,delimiter='\t')
# Turn every cell into its str() form (missing values become 'nan').
# Series.map is used because DataFrame.applymap is deprecated.
df = df.apply(lambda column: column.map(str))

# Row i of `following` holds the turn that comes right after row i
following = df.shift(-1)

# Keep row i only when the next row exists, belongs to the same dialogue
# and is spoken by a different speaker.
is_dyadic_pair = (
    following['Old_Dialogue_ID'].notna()
    & (df['Old_Dialogue_ID'] == following['Old_Dialogue_ID'])
    & (df['Speaker'] != following['Speaker'])
)

current_turns = df.loc[is_dyadic_pair, pair_columns]
next_turns = following.loc[is_dyadic_pair, pair_columns]
df1 = (current_turns + '-' + next_turns).reset_index(drop=True)
df1.to_csv(output_filename, index=False,sep='\t')

In [2]:
## Fourth step
## This program extracts acoustic-prosodic features from each turn. 
## The program extracts features from file generated in Step 2  
## The program uses Praat script and extracts pitch, intensity, and voice quality features

#Program to extract acoustic-prosodic feature
import pandas as pd
import csv
import subprocess
import re
import glob
import os,sys
import numpy as np

#Specify input
file_name=r"D:\D drive\MELD corpus\output\MELD_Dyadic_all.csv"
#Specify output
output_filename=r"D:\D drive\MELD corpus\output\MELD_Dyadic_all_ap_new_MELD_dyadic.txt"
#Specify Praat path
praat = 'D:\\D drive\\Praat.exe'
#Specify Praat script path
script= r"D:\D drive\MELD corpus\Program\Python_script\pitch,jitter,shimmer,intensity_emotion.praat"
df = pd.read_csv(file_name,delimiter='\t')
# Turn every cell into its str() form; Series.map replaces the deprecated
# DataFrame.applymap.
df = df.apply(lambda column: column.map(str))

# Run the Praat script once per wav file. Files for which Praat exits with a
# non-zero status are collected, because the next step merges the Praat output
# with the csv and needs to know which turns are missing.
failed_paths = []
for wav_path in df["Full_path"]:
    return_code = subprocess.call([praat, '--run', script, wav_path, output_filename])
    if return_code != 0:
        failed_paths.append(wav_path)

if failed_paths:
    print(f"Praat failed for {len(failed_paths)} of {len(df)} files:")
    for wav_path in failed_paths:
        print(wav_path)

In [None]:
## Fifth step

## Merge the files generated in Step 2 (MELD_Dyadic_all.csv) and Step 4 (MELD_Dyadic_all_ap_new_MELD_dyadic) manually using Microsoft Excel 
## The merged file can be named as Merged_dyadic_ap.csv

## After merging the files execute this code
## This program measures speech rate and performs z-score normalization 

#This function estimates the number of syllables in an utterance
def syllable_count_english(word):
    """Count the vowel groups in ``word`` as a rough syllable estimate.

    The text is lower-cased and split on single spaces. Within each piece,
    every maximal run of the letters ``aeiouy`` counts as one syllable.

    Parameters
    ----------
    word : str
        A word or a whole utterance.

    Returns
    -------
    int
        Total number of vowel groups found.
    """
    vowels = "aeiouy"
    total = 0

    for token in word.lower().split(" "):
        previous_is_vowel = False
        for character in token:
            is_vowel = character in vowels
            # Only the first vowel of a run starts a new syllable
            if is_vowel and not previous_is_vowel:
                total += 1
            previous_is_vowel = is_vowel
    return total

#Specify input file name
ap_filename=r'D:\D drive\MELD corpus\output\Merged_dyadic_ap.csv'
#Specify output file name
op_filename=r'D:\D drive\MELD corpus\output\final_merge1.csv'

# Features that get z-score normalised; 'speechrate' is derived below
columns_to_scale = (
    'mean_pitch', 'min_pitch', 'max_pitch',
    'mean_intensity', 'min_intensity', 'max_intensity',
    'jitter_local', 'shimmer_local', 'mean_nhr',
    'speechrate',
)

with open(ap_filename) as csv_file:
    df = pd.read_csv(csv_file)

# Make sure the text columns are plain strings before they are processed
df['Utterance'] = df['Utterance'].astype(str)
df['Speaker'] = df['Speaker'].astype(str)

# Speech rate = estimated syllables in the utterance / value of the 'duration' column
syllables_per_turn = df['Utterance'].map(syllable_count_english)
df['speechrate'] = syllables_per_turn / df['duration']

# Normalise on a copy so that `df` keeps the raw feature values
scaled_data = df.copy()
for col in columns_to_scale:
    raw_values = df[col]
    scaled_data[col] = (raw_values - raw_values.mean()) / raw_values.std()

scaled_data.to_csv(op_filename, index=False,sep=',')

In [None]:
## Sixth step
## Measure adjacent score on each prosodic feature
## Program measures the absolute distance between adjacent turns and concatenates the emotional states of the dyad

import pandas as pd
from functools import reduce
import csv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#Specify input file
file_name=r'D:\D drive\MELD corpus\output\final_merge1.csv'

#Specify output file
output_filename=r'D:\D drive\MELD corpus\output\final_ap_pairdistance_abs_prop.csv'

# Label columns written to the output; each value is "<turn i>-<turn i+1>"
label_columns = ['Old_Dialogue_ID', 'Old_Utterance_ID', 'Speaker', 'Emotion', 'Sentiment']

# Feature column in the input -> name of its distance column in the output
distance_columns = {
    'mean_pitch': 'Mean Pitch distance',
    'min_pitch': 'Min Pitch distance',
    'max_pitch': 'Max Pitch distance',
    'mean_intensity': 'Mean Intensity distance',
    'min_intensity': 'Min Intensity distance',
    'max_intensity': 'Max Intensity distance',
    'jitter_local': 'Jitter distance',
    'shimmer_local': 'Shimmer distance',
    'mean_nhr': 'Mean Nhr distance',
    'speechrate': 'Speechrate distance',
}

df = pd.read_csv(file_name,delimiter=',')

# Labels are compared and joined as text; Series.map(str) replaces the
# deprecated DataFrame.applymap(str).
labels = df[label_columns].apply(lambda column: column.map(str))
# Convert the features once, up front. The built-in float replaces np.float,
# which was removed from NumPy.
features = df[list(distance_columns)].astype(float)

# Row i of the shifted frames holds the turn that comes right after row i
next_labels = labels.shift(-1)
next_features = features.shift(-1)

# Keep row i only when the next row exists, belongs to the same dialogue
# and is spoken by a different speaker.
is_dyadic_pair = (
    next_labels['Old_Dialogue_ID'].notna()
    & (labels['Old_Dialogue_ID'] == next_labels['Old_Dialogue_ID'])
    & (labels['Speaker'] != next_labels['Speaker'])
)

label_pairs = labels[is_dyadic_pair] + '-' + next_labels[is_dyadic_pair]
# Absolute difference between the two turns of each pair, per feature
distances = (features - next_features).abs()[is_dyadic_pair].rename(columns=distance_columns)

df1 = pd.concat([label_pairs, distances], axis=1).reset_index(drop=True)

df1.to_csv(output_filename, index=False,sep='\t')

In [None]:
## Lastly, analysis can be done using R or JASP
## R file is provided in the repository 