In [9]:
## First step 
## Download the columbia games corpus to a specific location
## Later, execute the code in sequence 
## This program reads each words file and creates a new words file with additional information about "speaker" and "filename".
## The program reads the .words file from each folder and creates a new words file with the extension .words_with_speaker
## For instance, s01.cards.1.A.words will be processed, and the new file will be created on the same path with s01.cards.1.A.words_with_speaker

#Program to add speaker and filename column in each word file
import os
import sys
import glob
import os.path
import pandas as pd
import re

list_of_files = glob.glob(r'D:\Jay\columbia-games-corpus\data\**\*.words',recursive=True) 

for file_name in list_of_files:    
    out_name=file_name+'_with_speaker'
    csv_input = pd.read_csv(file_name,delimiter=' ',header=None)
    csv_input['Speaker'] = file_name[-7]
    csv_input['filename'] = file_name
    csv_input.to_csv(out_name, index=False,header=False,sep=' ')

In [10]:
#Second step 
## The program reads each words_with_speaker file processed in the first step and combines the text files of both speakers in each session.
## For instance, s01.cards.1.A.words_with_speaker and s01.cards.1.B.words_with_speaker will be read and combined. 
## The output will be saved as s01.objects.1.A.words_with_speaker_txt
## The program also converts each flac file into wav file format
## For instance, s01.cards.1.A.flac will be transformed to s01.objects.1.A.wav

#Program to merge Speaker text files and convert flac files
import os
import fnmatch
import sys
import subprocess

OLD_BASE = r'D:\Jay\columbia-games-corpus\data'
NEW_BASE = r'D:\Jay\columbia-games-corpus\jkdataex'

def merge_files(infiles):
    output=os.path.basename(infiles[0][:-9])+'txt'
    outfiletxt = os.path.join(new_dir, output)
    with open(outfiletxt, 'wb') as fo:
        for infile in infiles:
            with open(infile, 'rb') as fi:
                fo.write(fi.read())

def convert_files(infiles):
    output_channel1=os.path.basename(infiles[0][:-5])+'.wav'
    output_channel2=os.path.basename(infiles[1][:-5])+'.wav'
    outfile1 = os.path.join(new_dir, output_channel1)
    outfile2 = os.path.join(new_dir, output_channel2)
    ffp=r"C:\ffmpeg-master-latest-win64-gpl\bin\ffmpeg"
    cmd2wav1 = ffp+' -i ' + infiles[0] + ' ' + outfile1
    cmd2wav2 = ffp+' -i ' + infiles[1] + ' ' + outfile2
    #print(cmd2wav)
    subprocess.call(cmd2wav1, shell=True)
    subprocess.call(cmd2wav2, shell=True)

for (dirpath, dirnames, filenames) in os.walk(OLD_BASE):
    base, tail = os.path.split(dirpath)
    if base != OLD_BASE: continue  # Don't operate on OLD_BASE, only children directories

    # Build infiles list for flac objects
    object_flac = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*objects*.flac")])
    # Build infiles list for words objects
    object_text = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*objects*.words_with_speaker")])

    # Build infiles list for flac objects
    object_flac1 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.1*.flac")])
    # Build infiles list for words objects
    object_text1 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.1*.words_with_speaker")])

    # Build infiles list for flac objects
    object_flac2 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.2*.flac")])
    # Build infiles list for words objects
    object_text2 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.2*.words_with_speaker")])

    # Build infiles list for flac objects
    object_flac3 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.3*.flac")])
    # Build infiles list for words objects
    object_text3 = sorted([os.path.join(dirpath, filename) for filename in filenames if fnmatch.fnmatch(filename,"*cards.3*.words_with_speaker")])

    # Create output directory
    new_dir =  os.path.join(NEW_BASE, tail)
    os.mkdir(new_dir)  # This will raise an OSError if the directory already exists    

    # Merge
    convert_files(object_flac)
    merge_files(object_text)
    convert_files(object_flac1)
    merge_files(object_text1)
    convert_files(object_flac2)
    merge_files(object_text2)
    convert_files(object_flac3)
    merge_files(object_text3)





In [11]:
#Third step 
## The program reads each concatenated words_with_speaker_txt file processed in the second step and further processes it in the following way.
## Each word consecutively spoken by a speaker is merged and converted into a turn.
## Similarly, start and end times are adjusted accordingly
## For instance, s01.objects.1.A.words_with_speaker_txt will be read and s01.objects.1.txt will be generated in same folder. 

#Program to extract turns
import os
import sys
import glob
import os.path
import pandas as pd
import re
list_of_files = glob.glob(r'D:\Jay\columbia-games-corpus\jkdataex\**\*.words_wittxt',recursive=True) 

for file_name in list_of_files:    
    out_name=file_name[:-14]+'txt'
    headers= ['start','end','words','Speaker','filename']
    csv_input = pd.read_csv(file_name,delimiter=' ')
    csv_input.columns = headers
    #csv_input
    cgc2=csv_input.sort_values(['start'])    
    cgc3=cgc2[cgc2["words"].str.contains("#")==False]
    cgc3[['start', 'end']] = cgc3[['start', 'end']].astype(str)
    cgc3['obj1_count'] = (cgc3['Speaker'].ne(cgc3['Speaker'].shift())).cumsum()
    df3=cgc3.groupby('obj1_count').agg(lambda x: ' '.join(x))
    df3['start'] = df3['start'].map(lambda x: x.split(" ")[0])
    df3['end'] = df3['end'].map(lambda x: x.split(" ")[-1])
    df3['Speaker'] = df3['Speaker'].map(lambda x: x.split(" ")[-1])
    df3['filename'] = df3['filename'].map(lambda x: x.split(" ")[-1])
    df3['filename'] = df3['filename'].str.replace('words','wav')
    df3['filename'] = df3['filename'].str.replace('data','jkdataex')
    df3['session']=re.search('(\d\d)',file_name)[0]
    if df3.iloc[-1]['Speaker'] == df3.iloc[0]['Speaker']:
        df4=df3.drop(df3.index[len(df3)-1])
        df4.to_csv(out_name, index=False,header=False,sep='\t')
        del df4
    else:
        df3.to_csv(out_name, index=False,header=False,sep='\t')
    del csv_input,cgc2,cgc3,df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cgc3[['start', 'end']] = cgc3[['start', 'end']].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cgc3['obj1_count'] = (cgc3['Speaker'].ne(cgc3['Speaker'].shift())).cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cgc3[['start', 'end']] = cgc3[['start', 'end']].astype(str)
A value i

In [12]:
#Fourth step 
## Program to delete words_with_speaker_txt file
## The program deletes all the words_with_speaker_txt files generated in second step
## The program allows better readability of the generated output
## After executing the program, Each session folder will have two wav files and one txt file
## For instance, s01.objects.1.A.wav, s01.objects.1.B.wav and s01.objects.1.txt 

import glob

#Delete files with unwanted extension
for f in glob.glob(r'D:\Jay\columbia-games-corpus\jkdataex\**\*.words_wittxt',recursive=True):
    os.remove(f)

In [None]:
#Fifth step 
## Program to extract semantic and auditory embeddings from each turn
## The program reads each text file generated in the third step and extracts TRILL vectors and semantic embeddings from each turn
## The program saves embeddings in pkl file format
#Program to extract audio embeddings
import pandas as pd
import torch
from functools import reduce
import csv
import random
import subprocess
import re
import glob
import os,sys
import numpy as np
import soundfile as sf
import wave
import json
import tensorflow as tf1
import tensorflow_hub as hub
# Import TF 2.X and make sure we're running eager.
import tensorflow.compat.v2 as tf
#import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
modulev3=None
modulev3_graph=None

tf.enable_v2_behavior()
assert tf.executing_eagerly()

##Function to load TRILL vector model    
def get_TRILLv3_signal(signal,samplerate):
    global modulev3
    if modulev3==None:
        #Specify the path of TRILL vector model
        print('******************\nLoading model ...\n******************')    
        modulev3 = hub.load(r'D:\Jay\columbia-games-corpus\trill_extraction_v2\v3')
    
    
    max_int16 = 2**15
    chunks_cnt=int(signal.shape[0]/(samplerate*10.0))#10 seconds max in chunk
    if chunks_cnt==0:
        chunks=[signal]
    else:
        chunks=np.array_split(signal, chunks_cnt)
    
    trillv3_emb_all=np.empty(shape=(0,512))
    
    for chunk in chunks:
        trillv3 = modulev3(samples=chunk, sample_rate=samplerate)
        trillv3_emb = trillv3['embedding']
        trillv3_emb_all=np.concatenate((trillv3_emb_all, trillv3_emb))

    trillv3_emb_avg = np.mean(trillv3_emb_all, axis=0, keepdims=False)

    return (trillv3_emb_avg.tolist())    
##Function to check if audio files are in proper format
def check_wav_format(wav_file, start, end):
    wf = wave.open(wav_file)
    nchannels, sampwidth, framerate, nframes, comptype, compname = wf.getparams()
    print(nchannels, sampwidth, framerate, nframes, comptype, compname)
    wav_length = float(nframes) / float(framerate)
    print(wav_length)						
    return(framerate)

##Function to extract TRILL embeddings from specific file name with start and end time as parameters
def get_TRILLv3_audiofile_from_to(wav_file,start,end):
    print('get_TRILLv3_signal:',wav_file,start,end)
    samplerate=check_wav_format(wav_file, start, end)
    if samplerate<0:
        return(null)    
    startsample=int(start*samplerate)
    endsample=int(end*samplerate)
    signal, samplerate = sf.read(wav_file,start=startsample, stop=endsample)
    print(len(signal),samplerate)    

    trill=get_TRILLv3_signal(signal,samplerate)
    return(trill)

#Specify path where auditory embeddings needs to be saved
output_path = 'D:\\Jay\\\columbia-games-corpus\\jddataex_output'

##Specify input path from where text files needs to be read
#Specify path for files generated in previous step
path = 'D:\\Jay\\\columbia-games-corpus\\jkdataex\\'
all_files = os.listdir(path)
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r') as f:
                out_name= os.path.basename(file)
                output=os.path.join(output_path, out_name)
                audioembeddings=[]
                text = f.readlines()
                for line in text:
                    n1=line.split('\t')[0]
                    n2=line.split('\t')[1]
                    n3=line.split('\t')[4]
                    x=get_TRILLv3_audiofile_from_to(n3,float(n1),float(n2))
                    audioembeddings.append(x)

            #Merge consecutive utterance of Speaker A and B
            out = reduce(lambda x, y: x+y, audioembeddings)

            #Each consecutive utterance is of size 1024 i.e 512 for each utterance
            chunks = [out[x:x+1024] for x in range(0, len(out)-512, 512)]

            #Convert list to array
            arr = np.asarray(chunks)
            with open(output, 'w') as fcsv:
                writer = csv.writer(fcsv)
                writer.writerows(arr)
            audioembeddings = []
            audio_vectors = []
            arr=None
