In [1]:
## First step 
## Download the columbia games corpus to a specific location
## Later, execute the code in sequence 
## This program reads each words file and creates a new words file with additional information about "speaker" and "filename".
## The program reads the .words file from each folder and creates a new words file with the extension .words_with_speaker
## For instance, s01.cards.1.A.words will be processed, and the new file will be created on the same path with s01.cards.1.A.words_with_speaker

#Program to add speaker and filename column in each word file
import os
import sys
import glob
import os.path
import pandas as pd
import re

def add_speaker_columns(words_path):
    """Write a copy of one ``.words`` file with speaker and source-path columns appended.

    Parameters
    ----------
    words_path : str
        Path to a space-delimited, header-less ``.words`` file whose name ends
        in ``.<speaker>.words`` (for example ``s01.cards.1.A.words``).

    Returns
    -------
    str
        Path of the file that was written: ``words_path + '_with_speaker'``.
    """
    out_name = words_path + '_with_speaker'
    # keep_default_na=False: transcript tokens such as "null" or "NA" must stay
    # literal text. With the pandas defaults they are read as NaN and written
    # back out as empty fields, i.e. the word is silently lost.
    words = pd.read_csv(words_path, delimiter=' ', header=None,
                        keep_default_na=False)
    # The speaker label is the dot-separated field just before the extension.
    words['Speaker'] = os.path.basename(words_path).split('.')[-2]
    words['filename'] = words_path
    words.to_csv(out_name, index=False, header=False, sep=' ')
    return out_name


#Specify the path of the dataset
list_of_files = glob.glob(r'D:\Jay\columbia-games-corpus\data\**\*.words',recursive=True)

for file_name in list_of_files:
    add_speaker_columns(file_name)

In [3]:
#Second step 
## The program reads each words_with_speaker file produced in the first step and combines the text files of both speakers in each session.
## Only the "objects" game files are processed. For instance, s01.objects.1.A.words_with_speaker and s01.objects.1.B.words_with_speaker will be read and combined.
## The output will be saved as s01.objects.1.A.words_with_speaker_txt
## The program also converts each objects flac file into wav file format
## For instance, s01.objects.1.A.flac will be transformed to s01.objects.1.A.wav

#Program to merge Speaker text files and convert flac files
import os
import fnmatch
import sys
import subprocess

# Root folder of the dataset; the directory walk below only processes its
# direct child folders.
OLD_BASE = r'D:\Jay\columbia-games-corpus\data'

# Root folder for the output of this step. One sub-folder is created inside it
# for every processed child folder of OLD_BASE, using the same folder name.
NEW_BASE = r'D:\Jay\columbia-games-corpus\jkdata'

def merge_files(infiles, out_dir=None):
    """Concatenate the given files, in order, into a single file.

    The merged file is named after the first input's base name with ``_txt``
    appended. Files are copied byte-for-byte, so every input is expected to
    end with a line terminator.

    Parameters
    ----------
    infiles : list of str
        Paths of the files to concatenate. An empty list is a no-op.
    out_dir : str, optional
        Folder that receives the merged file. When omitted, the module-level
        ``new_dir`` set by the directory walk is used (the original behavior).

    Returns
    -------
    None
    """
    if not infiles:
        # Nothing to merge; previously this raised IndexError on infiles[0].
        return
    target_dir = new_dir if out_dir is None else out_dir
    outfiletxt = os.path.join(target_dir, os.path.basename(infiles[0]) + '_txt')
    with open(outfiletxt, 'wb') as fo:
        for infile in infiles:
            with open(infile, 'rb') as fi:
                fo.write(fi.read())

def convert_files(infiles, out_dir=None,
                  ffmpeg=r"C:\ffmpeg-master-latest-win64-gpl\bin\ffmpeg"):
    """Convert every input audio file to ``.wav`` by calling ffmpeg.

    Each output keeps the input's base name with its extension replaced by
    ``.wav``.

    Parameters
    ----------
    infiles : list of str
        Paths of the audio files to convert. Any number of files is accepted;
        an empty list is a no-op.
    out_dir : str, optional
        Folder that receives the ``.wav`` files. When omitted, the module-level
        ``new_dir`` set by the directory walk is used (the original behavior).
    ffmpeg : str, optional
        Path of the ffmpeg executable.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the ffmpeg executable cannot be started.
    """
    for infile in infiles:
        target_dir = new_dir if out_dir is None else out_dir
        stem = os.path.splitext(os.path.basename(infile))[0]
        outfile = os.path.join(target_dir, stem + '.wav')
        # Pass the arguments as a list without a shell so that paths that
        # contain spaces or shell metacharacters reach ffmpeg intact.
        # '-y' overwrites an existing output instead of waiting at ffmpeg's
        # interactive prompt, which cannot be answered from a notebook.
        status = subprocess.call([ffmpeg, '-y', '-i', infile, outfile])
        if status != 0:
            print('ffmpeg exited with status %s for %s' % (status, infile))

# Walk the dataset and process the direct child folders of OLD_BASE one by one:
# convert the "objects" flac files to wav and merge the "objects" word files.
for (dirpath, dirnames, filenames) in os.walk(OLD_BASE):
    base, tail = os.path.split(dirpath)
    if base != OLD_BASE: continue  # Don't operate on OLD_BASE, only children directories

    # Build infiles list for flac objects
    object_flac = sorted(os.path.join(dirpath, filename)
                         for filename in fnmatch.filter(filenames, "*objects*.flac"))
    # Build infiles list for words objects
    object_text = sorted(os.path.join(dirpath, filename)
                         for filename in fnmatch.filter(filenames, "*objects*.words_with_speaker"))

    # A folder without any "objects" files has nothing to convert or merge;
    # skip it instead of creating an empty output folder and failing later.
    if not object_flac and not object_text:
        continue

    # Create output directory. new_dir is read as a global by merge_files and
    # convert_files, so it must be assigned before they are called.
    new_dir = os.path.join(NEW_BASE, tail)
    if os.path.isdir(new_dir):
        # Makes the cell re-runnable: folders handled by an earlier run are
        # left untouched. Delete the folder to force it to be rebuilt.
        print('Skipping %s: output folder already exists' % new_dir)
        continue
    os.makedirs(new_dir)  # also creates NEW_BASE itself when it is missing

    # Convert and merge
    if object_flac:
        convert_files(object_flac)
    if object_text:
        merge_files(object_text)

In [None]:
#Third step 
## The program reads each concatenated words_with_speaker_txt file processed in the second step and further processes it in the following way.
## Each word consecutively spoken by a speaker is merged and converted into a turn.
## Similarly, start and end times are adjusted accordingly
## For instance, s01.objects.1.A.words_with_speaker_txt will be read and s01.objects.1.txt will be generated in same folder. 

#Program to extract turns
import os
import sys
import glob
import os.path
import pandas as pd

import re  # needed for the session id below; imported here so this step runs on its own

# Column layout of the merged word files: the three original .words columns
# followed by the two columns appended in the first step.
TURN_COLUMNS = ['start', 'end', 'words', 'Speaker', 'filename']


def words_to_turns(words_df, session):
    """Collapse a word-level table into speaker turns.

    Rows are ordered by start time, rows whose word contains ``#`` (or is
    missing/empty) are removed, and each run of consecutive rows by the same
    speaker becomes one turn: the words are joined with single spaces, the
    start is taken from the first row of the run and the end from the last.

    Parameters
    ----------
    words_df : pandas.DataFrame
        Table with the columns listed in ``TURN_COLUMNS``.
    session : str
        Value stored in the ``session`` column of every turn.

    Returns
    -------
    pandas.DataFrame
        One row per turn with the ``TURN_COLUMNS`` columns plus ``session``.
        In the ``filename`` column every ``words`` is replaced by ``flac``.
        When the last turn has the same speaker as the first turn, the last
        turn is dropped. Empty when no usable words remain.
    """
    ordered = words_df.sort_values('start')
    tokens = ordered['words'].astype(str)
    keep = (ordered['words'].notna()
            & ~tokens.str.contains('#', regex=False)
            & tokens.ne(''))
    # assign() builds a new frame, so the filtered slice is never written to.
    spoken = ordered.loc[keep].assign(words=tokens[keep])
    if spoken.empty:
        return pd.DataFrame(columns=TURN_COLUMNS + ['session'])

    # A new turn starts on every row whose speaker differs from the previous row.
    turn_id = spoken['Speaker'].ne(spoken['Speaker'].shift()).cumsum().rename('turn_id')
    turns = spoken.groupby(turn_id).agg(
        start=('start', 'first'),
        end=('end', 'last'),
        words=('words', lambda run: ' '.join(run)),
        Speaker=('Speaker', 'last'),
        filename=('filename', 'last'),
    )
    turns['filename'] = turns['filename'].str.replace('words', 'flac', regex=False)
    turns['session'] = session

    # Drop the final turn when it is by the same speaker as the first turn.
    if turns.iloc[-1]['Speaker'] == turns.iloc[0]['Speaker']:
        turns = turns.iloc[:-1]
    return turns


#Add path for output files saved in second step
list_of_files = glob.glob(r'D:\Jay\columbia-games-corpus\jkdata\**\*.words_with_speaker_txt',recursive=True)

for file_name in list_of_files:
    # Strip the trailing 24 characters ("A.words_with_speaker_txt") and append "txt".
    out_name = file_name[:-24] + 'txt'
    # header=None: the merged files have no header row (the first step writes
    # them with header=False), so without it the first word would be consumed
    # as column names and lost.
    # keep_default_na=False: words such as "null" or "NA" must stay text.
    csv_input = pd.read_csv(file_name, delimiter=' ', header=None,
                            names=TURN_COLUMNS, keep_default_na=False)
    # Take the session number from the file name itself so that digits in a
    # parent folder cannot be picked up; fall back to the full path as before.
    session_match = (re.search(r'\d\d', os.path.basename(file_name))
                     or re.search(r'\d\d', file_name))
    if session_match is None:
        raise ValueError('No two-digit session number found in %r' % file_name)
    turns = words_to_turns(csv_input, session_match[0])
    turns.to_csv(out_name, index=False, header=False, sep='\t')

In [None]:
#Fourth step 
## The program reads each processed file from previous step and extracts semantic embeddings
##Extract semantic embeddings from files

import pandas as pd
import glob
import os
import sys
import torch
import numpy as np
from functools import reduce
import csv
from sentence_transformers import SentenceTransformer
sen_w_feats = []
sentence_embeddings = []

#Initialize transformer model for semantic feature extraction
# Load the pretrained sentence-transformers model used to encode each utterance.
print('Loading Transformer...')
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-dot-v5')

##Specify input path from where text files needs to be read
#Specify path for files generated in third step
list_of_files = glob.glob(r'D:\Jay\columbia-games-corpus\jkdata\**\*.txt',recursive=True)

#Specify path where semantic embeddings needs to be saved
output_path = r'D:\Jay\DNN\Columbia\Embeddings\Text_semantic'
os.makedirs(output_path, exist_ok=True)  # make sure the output folder exists

for file_name in list_of_files:
    out_name = os.path.join(output_path, os.path.basename(file_name))
    # Column 2 of the tab-separated turn file holds the utterance text.
    # dtype=str / keep_default_na=False keep every utterance as text, even one
    # that pandas would otherwise parse as NaN (e.g. "null").
    csv_input = pd.read_csv(file_name, usecols=[2], names=['utterance'],
                            delimiter='\t', header=None,
                            dtype=str, keep_default_na=False)
    sen_w_feats = csv_input['utterance'].tolist()

    # At least two utterances are needed to form one consecutive pair.
    if len(sen_w_feats) < 2:
        print('Skipping %s: fewer than two utterances' % file_name)
        continue

    # One embedding row per utterance.
    sentence_embeddings = np.asarray(model.encode(sen_w_feats), dtype=np.float64)

    # Pair every utterance with the one that follows it: row i of the result
    # is embedding i followed by embedding i+1, giving n-1 rows that are twice
    # the embedding width (1536 values for 768-dimensional embeddings).
    arr = np.hstack([sentence_embeddings[:-1], sentence_embeddings[1:]])

    # newline='' is required by the csv module; without it every row is
    # followed by an extra blank line on Windows.
    with open(out_name, 'w', newline='') as fcsv:
        writer = csv.writer(fcsv)
        writer.writerows(arr.tolist())

In [3]:
#Fifth step 
## The program reads each processed file from previous step and converts them into h5 format

import csv
import h5py
import numpy as np
import pandas as pd
import glob
import random
import pdb

SEED=448
frac_train = 1.0  # fraction of the sorted session files that go into the train set

# Create h5 files

##Specify input path from where embeddings files needs to be read
#Specify path for files generated in fourth step
sessList= sorted(glob.glob(r'D:\Jay\DNN\Columbia\Embeddings\Text_semantic\*.txt',recursive=True))

num_files_all = len(sessList)
num_files_train = int(np.ceil((frac_train*num_files_all)))

sessTrain = sessList[:num_files_train]

# Create Train Data file

# NOTE(review): read_csv is called without header=None, so the first line of
# every embeddings file is taken as column names and never reaches X_train.
# The sixth step splits rows purely by position (even/odd), so adding
# header=None here would shift which rows land in which group — confirm the
# intended pairing before changing it.
session_arrays = [np.array(pd.read_csv(sess_file)) for sess_file in sessTrain]

# Stack all sessions once instead of growing the array file by file.
if session_arrays:
    X_train = np.vstack(session_arrays)
else:
    X_train = np.empty(shape=(0, 0), dtype='float64')

X_train = X_train.astype('float64')

#Specify path where h5 embeddings needs to be saved
# The context manager closes the file even if writing the dataset fails.
with h5py.File(r'D:\Jay\DNN\Columbia\h5\semantic\train_nonorm.h5', 'w') as hf:
    hf.create_dataset('textdataset', data=X_train)




In [7]:
#Sixth step 
#Program to split the h5 data into two groups
##The program reads the h5 file generated in the previous step and splits its rows into two groups
##The first group (Even) has utterance spoken by speaker A followed by speaker B
##The second group (Odd) has utterance spoken by speaker B followed by speaker A
import h5py
import numpy as np
import pandas as pd

# Input: the h5 file written in the fifth step.
path= r'D:\Jay\DNN\Columbia\h5\semantic\train_nonorm.h5'

# Outputs: one file for the even-indexed rows, one for the odd-indexed rows.
new_path = r'D:\Jay\DNN\Columbia\h5\semantic\train_nonorm_even.h5'
new_path1 = r'D:\Jay\DNN\Columbia\h5\semantic\train_nonorm_odd.h5'

# Read the whole dataset once, then take alternating rows.
with h5py.File(path, 'r') as source:
    all_rows = source['textdataset'][...]

new_data_even = all_rows[0::2]  # rows 0, 2, 4, ...
new_data_odd = all_rows[1::2]   # rows 1, 3, 5, ...

# Write each group to its own file under the same dataset name.
for target, rows in ((new_path, new_data_even), (new_path1, new_data_odd)):
    with h5py.File(target, 'w') as sink:
        sink.create_dataset('textdataset', data=rows)