In [30]:
## First step
## Download the Gaze dataset to a specific location
## Later, execute the code in sequence 
## This program reads each baseline (BL) file in the dataset and removes them from the folder.
## Program to remove list of files with BL in the gaze data
import os
import glob
import re

# Dataset location: every file (any extension) below Audio_Textgrid, searched recursively.
fileList = glob.glob(r'D:\Jay\GAZE\dataset\Audio_Textgrid\**\*.*', recursive=True)

# A path containing the '-BL-' marker is treated as a baseline recording and deleted.
baseline_marker = re.compile('-BL-')
for _file in fileList:
    is_baseline = baseline_marker.search(_file) is not None
    if is_baseline:
        os.remove(_file)


In [31]:
## Second step
## The program renames all text grid and wav files by removing empty spaces between them
# Rename list of files in the gaze dataset
import os
import glob
import re

#Specify path of the dataset
fileList = glob.glob(r'D:\Jay\GAZE\dataset\Audio_Textgrid\**\*.*', recursive=True)

for _file in fileList:
    folder, base_name = os.path.split(_file)
    # Only whitespace in the file name itself triggers a rename; a space in a
    # directory name must not truncate the path.
    if not re.search(r'\s', base_name):
        continue
    stem, extension = os.path.splitext(base_name)
    # Keep the part of the name before the first whitespace character and put the
    # file's own extension back, so .wav files stay .wav and .TextGrid files stay
    # .TextGrid (previously every renamed file received the '.TextGrid' suffix).
    shortened_stem = re.split(r'\s', stem, maxsplit=1)[0]
    new_name = os.path.join(folder, shortened_stem + extension)
    os.rename(_file, new_name)

In [1]:
#Third step 
## The program reads each textgrid file in the dataset folder and returns a text file with information about start, end and speaker
## For instance, GA-CO-AMO.TextGrid will be read and GA-CO-AMO.txt will be generated in same folder

# Program extracts start, stop and tier information from textgrid files
import os
import sys
import glob
import os.path
import pandas as pd
import re
import textgrid
import numpy as np
# To install textgrid package
# git clone https://github.com/kylerbrown/textgrid.git
# cd textgrid
# pip install .

#Specify path of the dataset
list_of_files = glob.glob(r'D:\Jay\GAZE\dataset\Audio_Textgrid\**\*.TextGrid',recursive=True)

for file_name in list_of_files:
    # The output is written next to the source: GA-CO-AMO.TextGrid -> GA-CO-AMO.txt
    out_name = os.path.splitext(file_name)[0] + '.txt'
    tgrid = textgrid.read_textgrid(file_name)
    df1 = pd.DataFrame(tgrid)
    # Treat empty labels as missing and drop those rows. The result is assigned back
    # instead of using replace(..., inplace=True) on the selected column, which does
    # not modify the frame when pandas copy-on-write is active.
    df1['name'] = df1['name'].replace('', np.nan)
    df1 = df1.dropna(subset=['name'])
    # Rows whose label contains "conversation" are not speaker turns; discard them.
    df1 = df1[~df1['name'].str.contains("conversation")]
    # The label column is not written; the remaining columns are saved ordered by start time.
    df1 = df1.drop(columns='name').sort_values(by=['start'])
    df1.to_csv(out_name, index=False, header=False, sep='\t')
    del tgrid, df1


In [None]:
#Fourth step 
## The program converts each wav file with sample rate of 8k into wav file with 16k format

import subprocess
import glob
import os

#Specify input path of wav files
list_of_wav_files = sorted(glob.glob(r'D:\Jay\GAZE\dataset\Audio_Textgrid\**\*.wav',recursive=True))

#Specify output path of wav files
new_path_resampled = r'D:\Jay\GAZE\dataset\Audio_resampled'

def resample_wavfiles(infiles):
    """Convert each wav file to mono 16 kHz with ffmpeg.

    Every input is written under its own base name into ``new_path_resampled``.

    Parameters
    ----------
    infiles : sequence of str
        Paths of the wav files to convert.
    """
    ffp = r"C:\ffmpeg-master-latest-win64-gpl\bin\ffmpeg"
    for infile in infiles:
        current_file = os.path.basename(infile)
        print("processing file", current_file)
        outfile2 = os.path.join(new_path_resampled, current_file)

        # Pass the arguments as a list and without a shell: paths that contain
        # spaces or shell metacharacters then reach ffmpeg unchanged instead of
        # being split or interpreted by the command interpreter.
        cmd2wav2 = [ffp, '-i', infile, '-ac', '1', '-ar', '16000', outfile2]
        print(' '.join(cmd2wav2))
        subprocess.call(cmd2wav2)

resample_wavfiles(list_of_wav_files)

In [None]:
#Fifth step 
## Program to extract utterance (text), semantic and auditory embeddings from each text file
## The program read text file and extract embeddings from corresponding wav file
## The program reads each text file generated in the third step and extracts TRILL vectors, semantic and textual embeddings from each turn
## The output of embeddings is saved in pkl file format

#Get utterance from text file and extract DNN embeddings 
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import BertTokenizer, BertModel
import torch
from functools import reduce
from sklearn.metrics.pairwise import cosine_similarity
import os,sys
import numpy as np
import soundfile as sf
import wave
import tensorflow as tf1
import tensorflow_hub as hub
# Import TF 2.X and make sure we're running eager.
import tensorflow.compat.v2 as tf
#import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
modulev3=None
modulev3_graph=None
import pickle
tf.enable_v2_behavior()
assert tf.executing_eagerly()
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

##Function to compute one averaged TRILL embedding for a signal (loads the TRILL model on first use)
def get_TRILLv3_signal(signal,samplerate):
    """Return the mean TRILL embedding of ``signal`` as a plain list.

    The signal is cut into ``int(len(signal) / (samplerate * 10))`` roughly equal
    chunks (a single chunk when it is shorter than ten seconds), each chunk is run
    through the TRILL model, and all returned embedding rows are averaged.

    Parameters
    ----------
    signal : numpy array of audio samples (its first dimension is the sample axis).
    samplerate : sampling rate of ``signal`` in Hz, forwarded to the model.

    Returns
    -------
    list
        The averaged embedding converted with ``tolist()``.
    """
    global modulev3
    if modulev3 is None:
        #Specify the path of TRILL vector model
        print('******************\nLoading model ...\n******************')
        modulev3 = hub.load(r'D:\Jay\columbia-games-corpus\trill_extraction_v2\v3')

    # One chunk per full ten seconds of audio; array_split spreads the samples
    # evenly, so each chunk holds at least ten seconds of signal.
    chunks_cnt = int(signal.shape[0] / (samplerate * 10.0))
    if chunks_cnt == 0:
        chunks = [signal]
    else:
        chunks = np.array_split(signal, chunks_cnt)

    # Embedding rows of all chunks are stacked here; the accumulator is 512 wide.
    trillv3_emb_all = np.empty(shape=(0, 512))
    for chunk in chunks:
        trillv3 = modulev3(samples=chunk, sample_rate=samplerate)
        trillv3_emb_all = np.concatenate((trillv3_emb_all, trillv3['embedding']))

    trillv3_emb_avg = np.mean(trillv3_emb_all, axis=0, keepdims=False)
    return trillv3_emb_avg.tolist()

##Function to check if audio files are in proper format 
def check_wav_format(wav_file, start, end):
    """Validate a wav file and a requested [start, end] segment.

    Parameters
    ----------
    wav_file : path of the wav file to inspect.
    start, end : segment boundaries in seconds.

    Returns
    -------
    int
        The frame rate (16000) when every check passes, otherwise a negative code:
        -1 more than one channel, -2 frame rate is not 16000, -3 sample width is
        not 2 bytes, -4 file shorter than ``end``, -5 ``start`` below 0,
        -6 ``start`` not smaller than ``end``.
    """
    # Only the header is needed; the context manager closes the file right away
    # instead of leaving the handle open until garbage collection.
    with wave.open(wav_file) as wf:
        nchannels, sampwidth, framerate, nframes, comptype, compname = wf.getparams()
    print(nchannels, sampwidth, framerate, nframes, comptype, compname)
    wav_length = float(nframes) / float(framerate)
    print(wav_length)

    if nchannels!=1:
        print('Error: Incoming audio file has more then 1 channel')
        return(-1)

    if framerate!=16000:
        print('Error: Incoming audio file sampling frequency must be 16000')
        return(-2)

    if sampwidth!=2:
        print('Error: Incoming audio file sample width must be 16 bit')
        return(-3)

    if wav_length<end:
        print('Error: The duration of the audio file is shorter than the required end time')
        return(-4)

    if start<0.0:
        print('Error: start time is lower then 0.0')
        return(-5)

    if start>=end:
        print('Error: start time is larger then end time')
        return(-6)

    return(framerate)

##Function to extract TRILL embeddings from specific file name with start and end time as parameters
##The function also uses facebook model to extract utterance from audio file i.e., speech to text
# Speech-to-text tokenizer and model; loaded on the first call and reused afterwards
# so they are not re-created for every single turn.
_wav2vec2_tokenizer = None
_wav2vec2_model = None

def get_utterance_audiofile_from_to(wav_file,start,end):
    """Transcribe one segment of a wav file and compute its TRILL embedding.

    Parameters
    ----------
    wav_file : path of the wav file.
    start, end : segment boundaries in seconds.

    Returns
    -------
    tuple
        ``(transcription, trill)``: the decoded wav2vec2 text of the segment and the
        value returned by ``get_TRILLv3_signal`` for the same samples.

    Raises
    ------
    ValueError
        When ``check_wav_format`` rejects the file or the requested segment.
    """
    global _wav2vec2_tokenizer, _wav2vec2_model
    print('get_signal:',wav_file,start,end)
    samplerate=check_wav_format(wav_file, start, end)
    if samplerate<0:
        # The previous `return(null)` failed with a NameError; report the real cause.
        raise ValueError(
            'Cannot process %s from %s to %s: check_wav_format returned %s'
            % (wav_file, start, end, samplerate))
    startsample=int(start*samplerate)
    endsample=int(end*samplerate)
    signal, samplerate = sf.read(wav_file,start=startsample, stop=endsample)
    print(len(signal),samplerate)

    if _wav2vec2_tokenizer is None or _wav2vec2_model is None:
        _wav2vec2_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h",do_lower_case=True)
        _wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    input_values = _wav2vec2_tokenizer(signal, return_tensors = "pt").input_values
    # Inference only: no gradients are needed for the argmax decoding below.
    with torch.no_grad():
        logits = _wav2vec2_model(input_values).logits
    prediction = torch.argmax(logits, dim = -1)
    transcription = _wav2vec2_tokenizer.batch_decode(prediction)[0]
    trill=get_TRILLv3_signal(signal,samplerate)
    return(transcription,trill)

#Specify path of dataset
path = 'D:\\Jay\\GAZE\\dataset\\'

#Specify path of audio files generated in previous step
audio_path=r'D:\Jay\GAZE\dataset\Audio_resampled'

#Specify path of text files where data will be saved
text_path=r'D:\Jay\GAZE\dataset\Text'

#Specify path of embeddings where data will be saved in pkl file format
#Specify path where auditory embeddings needs to be saved
audio_embedding_path = r'D:\Jay\GAZE\Pickle\audio'

#Specify path where utterance embeddings needs to be saved
text_embedding_path = r'D:\Jay\GAZE\Pickle\text'

#Specify path where semantic embeddings needs to be saved
semantic_embedding_path = r'D:\Jay\GAZE\Pickle\semantic'

#Initialize transformer model for semantic feature extraction
print('Loading Transformer...')
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v4")


def _same_folder(first, second):
    """Return True when both paths name the same folder after normalisation."""
    return (os.path.normcase(os.path.abspath(first))
            == os.path.normcase(os.path.abspath(second)))


for root, dirs, files in os.walk(path):
    # text_path lies inside the walked tree and receives this loop's own output
    # (tab-separated files WITH a header row). Prune it so those files are never
    # picked up as input.
    # NOTE(review): any other folder under `path` that holds .txt files in a
    # different layout would still be visited - confirm the folder structure.
    dirs[:] = [d for d in dirs if not _same_folder(os.path.join(root, d), text_path)]

    for file in files:
        if not file.endswith('.txt'):
            continue
        current_file = os.path.join(root, file)
        # splitext removes only the final extension, so extra dots in a name are safe.
        stem = os.path.splitext(file)[0]
        out_text_file = os.path.join(text_path, file)
        out_audio_embedding = os.path.join(audio_embedding_path, stem)
        out_text_embedding = os.path.join(text_embedding_path, stem)
        out_semantic_embedding = os.path.join(semantic_embedding_path, stem)
        # The resampled recording has the same base name as the text file.
        n3 = os.path.join(audio_path, stem + '.wav')

        with open(current_file, 'r', encoding="utf8") as turns_file:
            text = turns_file.readlines()

        sentence = []
        audioembeddings = []
        for line in text:
            # read_csv below ignores blank lines as well, so the row counts stay aligned.
            if not line.strip():
                continue
            fields = line.split('\t')
            # Columns 0 and 1 hold the start and end time of the turn.
            x, y = get_utterance_audiofile_from_to(n3, float(fields[0]), float(fields[1]))
            sentence.append(x)
            audioembeddings.append(y)
        sentence_embeddings_tensor = model.encode(sentence, convert_to_tensor=False)
        sentence_embeddings_tensor = sentence_embeddings_tensor.tolist()

        with open(out_audio_embedding + '.pkl', 'wb') as pkl_file:
            pickle.dump(audioembeddings, pkl_file)
        with open(out_semantic_embedding + '.pkl', 'wb') as pkl_file:
            pickle.dump(sentence_embeddings_tensor, pkl_file)
        with open(out_text_embedding + '.pkl', 'wb') as pkl_file:
            pickle.dump(sentence, pkl_file)

        # Re-read the turns as a table, add the transcriptions and a 1-based turn number.
        df1 = pd.read_csv(current_file, delimiter='\t', header=None, names=['start','end','Speaker'])
        df1['Utterance'] = sentence
        df1.insert(0, 'Turn_number', range(1, 1 + len(df1)))
        df1.to_csv(out_text_file, index=False, header=True, sep='\t')

In [None]:
#Sixth step 
## Program to extract seven acoustic prosodic features using PRAAT toolkit
## The program read text file and extract prosodic features from corresponding wav file
## The program reads each text file generated in the fifth step and extracts pitch mean and max, intensity mean and max, jitter, shimmer and NHR from each turn
## The output is saved in text file format

#Extract acoustic prosodic features
import pandas as pd
import csv
import subprocess
import re
import glob
import os,sys
import numpy as np

#Specify path of the text files generated in previous step 
path = 'D:\\Jay\\GAZE\\dataset\\Text'

#Specify path of audio files generated in fourth step
audio_path=r'D:\Jay\GAZE\dataset\Audio_resampled'

#Specify output folder
output_path=r'D:\Jay\GAZE\dataset\Audio_ap'

#Specify PRAAT path
praat = 'D:\\D drive\\Praat.exe'

#Specify PRAAT script path
script= 'D:\\Jay\\GAZE\\dataset\\Praat script\\pitch,jitter,shimmer,intensity_new.praat'


def _is_number(value):
    """Return True when ``value`` can be parsed as a float."""
    try:
        float(value)
    except ValueError:
        return False
    return True


for root, dirs, files in os.walk(path):
    for file in files:
        if not file.endswith('.txt'):
            continue
        current_file = os.path.join(root, file)
        with open(current_file, 'r', encoding="utf8") as f:
            text = f.readlines()
        output_filename = os.path.join(output_path, file)
        # The recording shares the text file's base name; splitext strips only the
        # final extension.
        n3 = os.path.splitext(os.path.join(audio_path, file))[0] + '.wav'
        for line in text:
            fields = line.split('\t')
            if len(fields) < 3:
                continue
            # Columns 1 and 2 hold the start and end time of the turn.
            n1, n2 = fields[1], fields[2]
            # The files written in the fifth step begin with a header row
            # ("start"/"end" in these columns); only numeric rows go to Praat.
            if not (_is_number(n1) and _is_number(n2)):
                continue
            print(n3)
            subprocess.call([praat, '--run', script, n3,n1,n2, output_filename])

In [3]:
#Seventh step 
## Program to merge text transcripts
## The program read all text files and merge them into one big file
## The program reads each text file generated in the fifth step and adds File_name, Condition and Participant columns derived from the file name
## One output file is generated and is saved in csv file format
## The output file has 8 columns namely Participant, Turn_number, start, end, Speaker, Utterance, File_name, Condition

#Merge Text transcript
import pandas as pd
import glob

#Specify path for input files generated in Fifth step
list_of_files = glob.glob(r'D:\Jay\GAZE\dataset\Text\*',recursive=True)

#Specify path for output file 
out_name = r'D:\Jay\GAZE\dataset\merged_text_transcript'

li = []
for filename in list_of_files:
    # Session name = file name without folder and extension
    file_name = os.path.splitext(os.path.basename(filename))[0]
    df = pd.read_csv(filename, index_col=None, header=0, delimiter='\t').assign(
        File_name=file_name,
        # Characters 0-1 of the session name give the condition, 6-8 the participant
        Condition=lambda table: table['File_name'].str.slice(0,2),
        Participant=lambda table: table['File_name'].str.slice(6,9),
    )
    # Rotate the column order so that the last column (Participant) comes first
    cols = list(df.columns)
    df = df[[cols[-1]] + cols[:-1]]
    li.append(df)

# Stack all sessions into one table and save it as comma-separated text
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv(out_name+'.csv', index=False,sep=',')

In [10]:
#Eighth step 
## Program to merge acoustic-prosodic files generated in sixth step
## The program read all text files and merge them into one big file
## The program reads each text file generated in the sixth step and combines them and save it in text file format
## The output file has 11 columns namely 
# filename, duration, mean_pitch, min_pitch, 
# max_pitch, mean_intensity, min_intensity, 
# max_intensity, jitter_local, shimmer_local, mean_nhr

import pandas as pd
import glob

#Specify path of acoustic-prosodic features file generated in sixth step
list_of_files = glob.glob(r'D:\Jay\GAZE\dataset\Audio_ap\*',recursive=True)

#Specify path of output
out_name = r'D:\Jay\GAZE\dataset\merged_audio_ap'

# Labels given to the tab-separated columns of every input file (the files have no header row)
ap_columns = ['filename','Full_path','duration','mean_pitch','min_pitch',
              'max_pitch','mean_intensity','min_intensity','max_intensity',
              'jitter_local','shimmer_local','mean_nhr']

li = []
for filename in list_of_files:
    # Session name = file name without folder and extension; it overwrites the 'filename' column
    file_name = os.path.splitext(os.path.basename(filename))[0]
    df = (
        pd.read_csv(filename, delimiter='\t', header=None, names=ap_columns)
        .assign(filename=file_name)
        .drop(['Full_path'], axis=1)
    )
    li.append(df)

# Stack all sessions into one table and save it as comma-separated text
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv(out_name+'.csv', index=False,sep=',')

In [None]:
#Ninth step
##Using excel, merge files manually generated in seventh and eighth step
##Save the merged file and provide it as an input to current program
##This program measures speech rate and perform z-score normalization
#Measure speech rate and perform z-score normalization

from scipy.stats import zscore
import numpy as np
def syllable_count_english(word):
    """Estimate the number of syllables in a piece of English text.

    The text is lower-cased and split on single spaces; within each piece every
    run of consecutive vowel letters (a, e, i, o, u, y) counts as one syllable.

    Parameters
    ----------
    word : str
        A word or a whole utterance.

    Returns
    -------
    int
        Total number of vowel runs over all pieces.
    """
    vowels = "aeiouy"
    total = 0
    for token in word.lower().split(" "):
        previous_is_vowel = False
        for letter in token:
            current_is_vowel = letter in vowels
            # Only the first vowel of a run starts a new syllable
            if current_is_vowel and not previous_is_vowel:
                total += 1
            previous_is_vowel = current_is_vowel
    return total

#Provide path for merged file as input
ap_filename=r'D:\Jay\GAZE\dataset\Merged.csv'

#Provide path for output file
output=r'D:\Jay\GAZE\dataset\final_merge.csv'

# Features that are cast to float and then z-normalised per speaker, in output order.
feature_columns = [
    'mean_pitch', 'min_pitch', 'max_pitch',
    'mean_intensity', 'min_intensity', 'max_intensity',
    'jitter_local', 'shimmer_local', 'mean_nhr', 'speechrate',
]

with open(ap_filename,encoding='utf-8') as csv_file:
    df = pd.read_csv(csv_file)
df.drop(['File_name'], axis=1,inplace=True)
# Cells holding the text '--undefined--' are replaced by 0 so the columns can be cast to float.
df.replace('--undefined--', 0,inplace=True)
# Rows labelled 'participant' get the participant id as speaker label.
df['Speaker'] = np.where(df['Speaker'] == 'participant', df['Participant'], df['Speaker'])
df.Utterance=df.Utterance.astype(str)
# Speech rate = estimated syllables of the utterance divided by the turn duration.
df['speechrate'] = df['Utterance'].map(syllable_count_english)/(df['duration'])

# The builtin float replaces the np.float alias, which no longer exists in NumPy >= 1.24.
for column in feature_columns:
    df[column] = df[column].astype(float)

#Perform z score normalization
# Each feature is standardised within each speaker (sample standard deviation, ddof=1).
for column in feature_columns:
    df['z_' + column] = df.groupby(['Speaker'])[column].transform(zscore, ddof=1)

df.to_csv(output, index=False,sep=',')


In [None]:
#Tenth step
##The program measures absolute distance on two adjacent turns (same file, different speakers) on 10 acoustic-prosodic features.
##The program reads file generated in previous step and measures L1 distance on each prosodic feature separately
##The file will have 13 columns namely
#file_name, Condition, Participant, Mean Pitch distance, Min Pitch distance	
#Max Pitch distance, Mean Intensity distance, Min Intensity distance,Max Intensity distance,	
#Jitter distance, Shimmer distance, Mean Nhr distance and Speechrate distance


import pandas as pd
from functools import reduce
import csv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#Specify input 
file_name=r'D:\Jay\GAZE\dataset\final_merge.csv'

#Specify output
output_filename=r'D:\Jay\GAZE\dataset\final_ap_pairdistance_abs.csv'

# (output column, z-normalised input column), in the order they are written.
distance_columns = [
    ('Mean Pitch distance', 'z_mean_pitch'),
    ('Min Pitch distance', 'z_min_pitch'),
    ('Max Pitch distance', 'z_max_pitch'),
    ('Mean Intensity distance', 'z_mean_intensity'),
    ('Min Intensity distance', 'z_min_intensity'),
    ('Max Intensity distance', 'z_max_intensity'),
    ('Jitter distance', 'z_jitter_local'),
    ('Shimmer distance', 'z_shimmer_local'),
    ('Mean Nhr distance', 'z_mean_nhr'),
    ('Speechrate distance', 'z_speechrate'),
]

df = pd.read_csv(file_name,delimiter=',')

# File and speaker labels are compared as strings, so missing values compare as
# the text 'nan'.
session = df['filename'].map(str)
talker = df['Speaker'].map(str)

# Row i is kept when row i+1 belongs to the same file but to a different speaker.
# shift(-1) leaves the last row without a partner, so it is never selected.
is_exchange = (session == session.shift(-1)) & (talker != talker.shift(-1))

# The features are converted to float once (builtin float: the np.float alias no
# longer exists in NumPy >= 1.24) instead of on every loop iteration, and the
# absolute difference to the following row is computed for all rows at once.
z_values = df[[source for _, source in distance_columns]].astype(float)
distances = (z_values - z_values.shift(-1)).abs()[is_exchange]

df1 = pd.DataFrame({'file_name': session[is_exchange]})
# Characters 0-1 of the file name give the condition, 6-8 the participant.
df1['Condition'] = df1['file_name'].str.slice(0,2)
df1['Participant'] = df1['file_name'].str.slice(6,9)
for label, source in distance_columns:
    df1[label] = distances[source]
df1 = df1.reset_index(drop=True)

df1.to_csv(output_filename, index=False,sep='\t')

In [3]:
#Eleventh step
##The program initializes function to measure lexical and syntactic similarities using n-gram sequence methodology


# ------------------------------------------------------------------------
# Name : Syntactic_and_Lexical_similarity_en.py
# Author : Jay Kejriwal
# Date   : 21-03-2022
# Description : Program to measure lexical and syntactic similarity between two sentences
# ------------------------------------------------------------------------

import stanza
import re
import math
from collections import Counter
import nltk
from nltk.util import ngrams
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def get_cosine(vec1, vec2):      #Function to measure cosine similarity
    """Cosine similarity of two sparse count vectors.

    Parameters
    ----------
    vec1, vec2 : mappings from term to numeric count (for example ``Counter``).

    Returns
    -------
    float
        Dot product over the shared terms divided by the product of both vector
        lengths; ``0.0`` when that product is zero.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)
    length1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    length2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    denominator = length1 * length2
    if denominator:
        return float(dot_product) / denominator
    # At least one vector is empty or all-zero
    return 0.0

def text_to_vector(text,ngram):  #Function to turn a sentence into n-gram counts
    """Count the word n-grams of ``text``.

    Parameters
    ----------
    text : str, tokenised with ``nltk.word_tokenize``.
    ngram : int, length of the n-grams.

    Returns
    -------
    collections.Counter
        Maps each n-gram (its tokens joined by one space) to its frequency.
    """
    tokens = nltk.word_tokenize(text)
    return Counter([' '.join(gram) for gram in ngrams(tokens, ngram)])


def process_lex_text(text):         #Function to remove punctuation and extract lemmas
    """Return the lemmas of ``text`` joined by single spaces.

    The text is lower-cased, the characters . , " - ? : ! ; are removed, and the
    result is analysed with the stanza pipeline ``nlp``.
    """
    cleaned = re.sub(r'[.,"\-?:!;]', '', text.lower())
    analysed = nlp(cleaned)
    lemmas = []
    for sent in analysed.sentences:
        for word in sent.words:
            lemmas.append(str(word.lemma))
    return ' '.join(lemmas)


def process_syn_text(text):         #Function to remove punctuation and extract POS tags
    """Return the universal POS tags of ``text`` joined by single spaces.

    The text is lower-cased, the characters . , " - ? : ! ; are removed, and the
    result is analysed with the stanza pipeline ``nlp``.
    """
    cleaned = re.sub(r'[.,"\-?:!;]', '', text.lower())
    analysed = nlp(cleaned)
    tags = []
    for sent in analysed.sentences:
        for word in sent.words:
            tags.append(str(word.upos))
    return ' '.join(tags)

def calculate_lexical_similarity(text1, text2):
    """Lexical similarity of two texts.

    Both texts are reduced to their lemmas, turned into unigram count vectors and
    compared with cosine similarity.

    Returns
    -------
    float
        The value returned by ``get_cosine`` for the two unigram vectors.
    """
    lemma_counts = [
        text_to_vector(process_lex_text(raw_text), ngram=1)
        for raw_text in (text1, text2)
    ]
    return get_cosine(lemma_counts[0], lemma_counts[1])

# def calculate_syntactic_similarity(text1, text2):
#     process1 = process_syn_text(text1)
#     process2 = process_syn_text(text2)
#     vector1 = text_to_vector(process1,ngram=2)
#     vector2 = text_to_vector(process2,ngram=2)
#     cosin = get_cosine(vector1, vector2)
#     return cosin

2023-08-29 11:24:57 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-08-29 11:24:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2023-08-29 11:24:59 INFO: Using device: cpu
2023-08-29 11:24:59 INFO: Loading: tokenize
2023-08-29 11:24:59 INFO: Loading: pos
2023-08-29 11:25:00 INFO: Loading: lemma
2023-08-29 11:25:00 INFO: Done loading processors!


In [2]:
#Twelfth step
##The program initializes function to measure syntactic similarities on textual features using edit distance

#Measure syntax similarity using edit distance
# ------------------------------------------------------------------------
# Name : Syntax_similarity.py
# Author : Jay Kejriwal
# Date   : 01-01-2022
# Original__author__ = 'reihane'
# Original__code__ = https://github.com/USC-CSSL/CASSIM
# Description : Program re-written in Python 3.8.8 to measure syntax similarity between two sentences using Core-NLP
# ------------------------------------------------------------------------

from scipy.optimize import linear_sum_assignment as su
import numpy as np
import sys
import os
import nltk
nltk.download('punkt')
from nltk.tree import ParentedTree
from zss import simple_distance, Node
from nltk.parse import stanford
from nltk.parse import CoreNLPParser
from nltk.tokenize import sent_tokenize
from nltk.data import load
from collections import OrderedDict
numnodes = 0

#Specify path of dictionary
# Raw string: the Windows path contains backslash sequences (\J, \L, \s, \e) that are
# invalid escapes in a normal string literal and trigger a warning on recent Python
# versions. The resulting value is unchanged.
sent_detector = load(r'file:D:\Jay\Lexical_Syntactic\stanford\english.pickle')
#Specify path of jar files
jar_path = r'D:\Jay\Lexical_Syntactic\stanford\stanford-corenlp-4.5.0\stanford-corenlp-4.5.0.jar'
models_jar_path = r'D:\Jay\Lexical_Syntactic\stanford\stanford-corenlp-4.5.0\stanford-corenlp-4.5.0-models.jar'
#Specify path of model
model_path= r'D:\Jay\Lexical_Syntactic\stanford\englishPCFG.ser.gz'
# Parser used by calculate_syntactic_similarity to build one parse tree per sentence
parser = stanford.StanfordParser(path_to_jar = jar_path, path_to_models_jar = models_jar_path, model_path = model_path)

def convert_mytree(nltktree,pnode):
    """Copy the labelled structure of an nltk tree below the zss node ``pnode``.

    Every direct or nested child of ``nltktree`` increments the global counter
    ``numnodes``. Children that are ``nltk.ParentedTree`` instances are added to
    ``pnode`` as new ``Node`` objects carrying their label and are processed
    recursively; all other children are only counted.

    Returns
    -------
    Node
        ``pnode`` itself, with the converted children attached.
    """
    global numnodes
    for child in nltktree:
        numnodes += 1
        if type(child) is not nltk.ParentedTree:
            # Not a subtree: counted above, nothing to attach
            continue
        child_node = Node(child.label())
        pnode.addkid(child_node)
        convert_mytree(child, child_node)
    return pnode

def calculate_syntactic_similarity(doc1, doc2, average=True): #syntax similarity of two single documents
    """Syntactic similarity of two documents based on parse-tree edit distance.

    Both documents are split into sentences and parsed. For every sentence pair the
    tree edit distance is divided by the summed node counts of the two trees, which
    gives a cost matrix (rows: sentences of ``doc1``, columns: sentences of ``doc2``).

    Parameters
    ----------
    doc1, doc2 : str
        The documents to compare.
    average : bool
        ``True``: return ``1 - mean(cost matrix)``.
        Otherwise: match sentences with the assignment solver, add the cheapest
        available cost for every unmatched row and column, and return
        ``1 - total / max(number of sentences)``.

    Returns
    -------
    float or str
        The similarity, or the string ``"NA"`` when a sentence has more than 100
        whitespace-separated tokens or the parser raises an exception.
    """
    global numnodes
    doc1sents = sent_detector.tokenize(doc1.strip())
    doc2sents = sent_detector.tokenize(doc2.strip())
    # to handle unusual long sentences.
    for s in list(doc1sents) + list(doc2sents):
        if len(s.split())>100:
            return "NA"
    try: #to handle parse errors. Parser errors might happen in cases where there is an unusually long word in the sentence.
        doc1parsed = parser.raw_parse_sents((doc1sents))
        doc2parsed = parser.raw_parse_sents((doc2sents))
    except Exception as e:
        sys.stderr.write(str(e))
        return "NA"

    # Each parsed sentence is an iterable of candidate trees; keep the first one.
    doc1parsed = [list(candidates)[0] for candidates in doc1parsed]
    doc2parsed = [list(candidates)[0] for candidates in doc2parsed]

    costMatrix = []
    for tree1 in doc1parsed:
        # convert_mytree counts the visited nodes in the global `numnodes`
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(tree1)
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = convert_mytree(sentencedoc1,tempnode)
        sen1nodes = numnodes
        temp_costMatrix = []
        for tree2 in doc2parsed:
            numnodes=0.0
            sentencedoc2 = ParentedTree.convert(tree2)
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = convert_mytree(sentencedoc2,tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            # Normalise the edit distance by the size of both trees
            ED = ED / (numnodes + sen1nodes)
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)

    if average==True:
        return 1-np.mean(costMatrix)

    # linear_sum_assignment returns two parallel arrays (row indices, column
    # indices); they have to be paired up to obtain the matched cells.
    row_indices, column_indices = su(costMatrix)
    total = 0
    rowMarked = [0] * len(doc1parsed)
    colMarked = [0] * len(doc2parsed)
    for row, column in zip(row_indices, column_indices):
        total += costMatrix[row][column]
        rowMarked[row] = 1
        colMarked[column] = 1
    # Sentences left without a partner contribute their cheapest available cost
    for k in range(len(rowMarked)):
        if rowMarked[k]==0:
            total+= np.min(costMatrix[k])
    for c in range(len(colMarked)):
        if colMarked[c]==0:
            total+= np.min(costMatrix[:,c])
    maxlengraph = max(len(doc1parsed),len(doc2parsed))
    return 1-(total/maxlengraph)

print(calculate_syntactic_similarity("Her face lit up.","Her candle lit up itself."))

[nltk_data] Downloading package punkt to C:\Users\kejri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  parser = stanford.StanfordParser(path_to_jar = jar_path, path_to_models_jar = models_jar_path, model_path = model_path)


0.935483870967742


In [None]:
#Thirteenth Step
## Program to measure entrainment distance using textual, semantic and auditory embeddings
## The program reads pkl file generated in the Fifth step and measures adjacent and self scores at lexical, syntactic, semantic and acoustic levels
## Only adjacent scores were used for analysis
## The program generates a text file for each session with the following column information
###Adjacent score on each linguistic level
# Same_pair_lexical 	Same_pair_syntactic	Same_pair_semantic	Same_pair_audio	
###Self score of each speaker at each linguistic level
#Speaker1_lexical_self_distance	Speaker2_lexical_self_distance	
#Speaker1_syntactic_self_distance	Speaker2_syntactic_self_distance	
#Speaker1_semantic_self_distance	Speaker2_semantic_self_distance	
#Speaker1_audio_self_distance	Speaker2_audio_self_distance

import pandas as pd
import csv
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import util
import random
import subprocess
import re
import glob
import os,sys
import numpy as np
import pickle

#Define path of embedding files and output
audio_embedding_path = r'D:\Jay\GAZE\Pickle\audio'
text_embedding_path = r'D:\Jay\GAZE\Pickle\text'
semantic_embedding_path = r'D:\Jay\GAZE\Pickle\semantic'

output_path = r'D:\Jay\GAZE\output'

# Linguistic levels, in the order their scores are computed for each pair of turns
LEVELS = ('lexical', 'syntactic', 'semantic', 'audio')


def _list_pickles(folder):
    """Return the names of the .pkl files in `folder`, sorted alphabetically.

    os.listdir() gives no ordering guarantee, so sorting keeps the positional
    pairing of the audio/text/semantic folders deterministic.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(".pkl"))


def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    NOTE: pickle.load can execute arbitrary code; only point this at files
    produced by this pipeline, never at untrusted input.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _turn_scores(sentences, semantic_embeddings, audio_embeddings, first, second):
    """Score turn `first` against turn `second` on every linguistic level.

    Returns a tuple ordered like LEVELS: (lexical, syntactic, semantic, audio).
    """
    return (
        calculate_lexical_similarity(sentences[first], sentences[second]),
        calculate_syntactic_similarity(sentences[first], sentences[second]),
        util.cos_sim(semantic_embeddings[first], semantic_embeddings[second]).item(),
        util.cos_sim(audio_embeddings[first], audio_embeddings[second]).item(),
    )


audio_files = _list_pickles(audio_embedding_path)
text_files = _list_pickles(text_embedding_path)
semantic_files = _list_pickles(semantic_embedding_path)

# zip() silently stops at the shortest listing, so make a count mismatch visible
if not (len(audio_files) == len(text_files) == len(semantic_files)):
    print("Warning: embedding folders hold different numbers of .pkl files "
          "(audio={}, text={}, semantic={}); sessions may be mispaired or skipped".format(
              len(audio_files), len(text_files), len(semantic_files)))

for audiofile, textfile, semanticfile in zip(audio_files, text_files, semantic_files):
    # Output file is named after the audio pickle, without its extension
    output_filename = os.path.splitext(os.path.join(output_path, os.path.basename(audiofile)))[0]

    #Open audio, text and semantic pickle files
    audio_embeddings_tensor = _load_pickle(os.path.join(audio_embedding_path, audiofile))
    sentence_embeddings_tensor = _load_pickle(os.path.join(text_embedding_path, textfile))
    semantic_embeddings_tensor = _load_pickle(os.path.join(semantic_embedding_path, semanticfile))

    # One list per output column, created in the column order of the output file:
    # the four adjacent scores first, then Speaker1/Speaker2 self scores per level
    scores = {'Same_pair_' + level: [] for level in LEVELS}
    for level in LEVELS:
        for speaker in ('Speaker1', 'Speaker2'):
            scores[speaker + '_' + level + '_self_distance'] = []

    turn_count = len(sentence_embeddings_tensor)

    # Adjacent (partner) scores: every turn against the turn that follows it
    for i in range(turn_count - 1):
        pair_scores = _turn_scores(sentence_embeddings_tensor, semantic_embeddings_tensor,
                                   audio_embeddings_tensor, i, i + 1)
        for level, value in zip(LEVELS, pair_scores):
            scores['Same_pair_' + level].append(value)

    # Self scores: every turn against the turn two positions later;
    # even indices are stored under Speaker1 and odd indices under Speaker2
    for n in range(turn_count - 2):
        speaker = 'Speaker1' if n % 2 == 0 else 'Speaker2'
        self_scores = _turn_scores(sentence_embeddings_tensor, semantic_embeddings_tensor,
                                   audio_embeddings_tensor, n, n + 2)
        for level, value in zip(LEVELS, self_scores):
            scores[speaker + '_' + level + '_self_distance'].append(value)

    # Columns have different lengths; wrapping each list in a Series pads the shorter ones with NaN
    df = pd.DataFrame({column: pd.Series(values) for column, values in scores.items()})
    df.to_csv(output_filename, index=False, sep='\t')
    # All per-session state is rebuilt at the top of the loop, so no explicit cleanup is needed

         

In [5]:
#Final step 
## Program to merge the output files generated in the previous step
# The program concatenates the adjacent and self scores obtained for each individual session into one file

import glob
import os

import pandas as pd

#Specify path for files generated in previous step
# Sorted so the row order of the merged file does not depend on directory listing order
list_of_files = sorted(glob.glob(r'D:\Jay\GAZE\output\*'))

#Specify output path
out_name = r'D:\Jay\GAZE\merged_stats'

# Columns derived from the file name; they are placed first in the merged file
metadata_columns = ['File_name', 'Condition', 'Participant']

if not list_of_files:
    raise ValueError(r'No session files found in D:\Jay\GAZE\output; run the previous step first')

session_frames = []
for filename in list_of_files:
    session = pd.read_csv(filename, index_col=None, header=0, delimiter='\t')
    file_name = os.path.splitext(os.path.basename(filename))[0]
    session['File_name'] = file_name
    # Condition and participant codes are read from fixed character positions of the file name
    session['Condition'] = file_name[0:2]
    session['Participant'] = file_name[6:9]
    #Re-arrange columns with filename (and the other metadata) as first columns
    score_columns = [column for column in session.columns if column not in metadata_columns]
    session_frames.append(session[metadata_columns + score_columns])

frame = pd.concat(session_frames, axis=0, ignore_index=True)

frame.to_csv(out_name+'.csv', index=False,sep=',')

In [None]:
#Analyze the data using python or JASP.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

#Specify input file
file_name=r'D:\Jay\GAZE\merged_stats.csv'
# The merge step writes merged_stats.csv comma-separated, so read it with the default separator
data = pd.read_csv(file_name)
# Formula terms cannot contain spaces, so give the dependent variable an identifier-safe name
data = data.rename(columns={'Mean Pitch distance': 'Mean_Pitch_distance'})
# NOTE(review): the merge step in this notebook writes Same_pair_* and Speaker*_self_distance
# columns; confirm the input really has a 'Mean Pitch distance' column or substitute the measure to analyse.
# `groups` is a mixed-model argument (OLS has no grouping), so fit a linear mixed model
# with Condition as fixed effect and observations grouped by Participant.
md = smf.mixedlm("Mean_Pitch_distance ~  Condition", data=data, groups=data["Participant"])
mdf = md.fit()
print(mdf.summary())