# Make a CSV with subject-wise trial details, RT, and acoustic feature distances

Step 1: Load packages

In [3]:
import parselmouth
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import librosa
import pickle
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Apply seaborn's default theme with the "ticks" axes style.
# The earlier sequence set "white" and then immediately replaced it with
# "ticks", so only the final style ever took effect.
sns.set(style="ticks")

Step 2: Init paths

In [4]:
# Root folder of the listening-test stimuli, its per-language sub-folders, and
# the filename suffixes that distinguish the two stimulus types.
path_wav_files = './lisTest/tcdDecision/playback/STIMULI/'
wav_folder_types = ['chinese','english']
file_types = ['nspkrs_1','nspkrs_2']

# Where the per-stimulus feature pickles live, and where figures are written.
path_store_feats = './dataAnalysis/data/feats/PRAAT/'
path_store_figure = './figures/'

# Create the feature folder if it is absent. makedirs builds the missing parent
# folders too (plain mkdir raises FileNotFoundError on a nested path), and
# exist_ok avoids the check-then-create race.
os.makedirs(path_store_feats, exist_ok=True)

Step 3: Read listening test stimulus set filenames 

In [5]:
# Collect the stimulus wav paths as a nested list:
#   wav_files[language index][file-type index] -> list of matching paths,
# following the order of wav_folder_types and file_types.
wav_files = [
    [
        glob.glob(f"{path_wav_files}{lang_folder}/wavFilesTest/*{type_suffix}.wav")
        for type_suffix in file_types
    ]
    for lang_folder in wav_folder_types
]

Step 4: Read stimulus feature pickle file and compute the feature distance

In [6]:
# Euclidean distance between the two segments of every stimulus, one value per
# acoustic feature.
feats_column_name = ['pitch','mfcc', 'mel','scentroid','harm','intensity','xvec']

# feats_distance is a list of rows; row 0 is the header
# (fname, lang, changeIns, then one column per feature) and each later row
# describes one stimulus. Later cells index these rows by position.
feats_distance = [['fname', 'lang', 'changeIns'] + list(feats_column_name)]
cnt = 1  # index of the next data row (row 0 is the header)
for lang_idx, files_by_type in enumerate(wav_files):
    for files_of_type in files_by_type:
        for wav_path in files_of_type:
            head, tail = os.path.split(wav_path)
            stim_name = tail[:-4]  # drop the 4-character ".wav" extension

            # load the pre-computed features of this stimulus
            with open(path_store_feats + stim_name + '.pickle', 'rb') as f:
                feats = pickle.load(f)

            # The two '_'-separated tokens just before '_nspkrs' in the
            # filename are joined with '.' to give the change instant.
            name_tokens = tail.split('_nspkrs')[0].split('_')
            changeIns = float(name_tokens[-2] + '.' + name_tokens[-1])

            row = [stim_name, lang_idx, changeIns]
            for feats_name in feats_column_name:
                if feats_name == 'xvec':
                    # 'xvec' entries are compared directly, with no masking
                    # or averaging.
                    dist = np.linalg.norm(feats[feats_name+'_1'] - feats[feats_name+'_2'])
                else:
                    # Keep only the entries selected by the 'voiced_*' index
                    # (presumably voiced frames -- confirm against the feature
                    # extraction), average over axis 0, then take the norm of
                    # the difference of the two means.
                    # NOTE: an LDA-based inter/intra-class scatter ratio was
                    # previously sketched here as an alternative distance.
                    mean_1 = np.mean(feats[feats_name+'_1'][feats['voiced_1']], axis=0)
                    mean_2 = np.mean(feats[feats_name+'_2'][feats['voiced_2']], axis=0)
                    dist = np.linalg.norm(mean_1 - mean_2)
                row.append(dist)

            feats_distance.append(row)
            cnt += 1

# Same table as a DataFrame: promote row 0 to the column header.
df = pd.DataFrame(feats_distance)
new_header = df.iloc[0]   # first row carries the column names
df = df[1:]               # keep only the data rows
df.columns = new_header

Step 4: Read and append the response CSV of each subject

In [7]:
# Read every subject's response file (keys.csv) for both audio types and stack
# them into a single DataFrame `df`, tagging each row with a numeric subID.
audio_type = ['chin','eng']
sub_IDs = ['S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14','S15',
          'S16','S17','S18','S19','S20','S21','S22','S23','S24','S25','S26','S27','S28','S29']

# Accumulators kept from the original cell; they are not filled here.
miss = []
fa = []
hit = []
rt_miss = []
rt_fa = []
rt_hit = []

fa_names = []
miss_names = []
hit_names = []
rt_correct = []
rt_incorrect = []

# Switch between the local checkout and the online copy of the recordings.
read_from_local = True
if read_from_local:
    recordings_root = './lisTest/tcdDecision/recordings/New_recordings/'
else:
    recordings_root = 'https://raw.githubusercontent.com/neerajww/lang_tcd/venkat201097-test1/code/lisTest/tcdDecision/recordings/New_recordings/'

# Collect the per-file frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
response_frames = []
for sub_number, sub_id in enumerate(sub_IDs, start=1):
    for audio in audio_type:
        url = recordings_root + audio + '_' + sub_id + '/keys.csv'
        # the response files have no header row
        temp = pd.read_csv(url, header=None)
        # subID is the 1-based position in sub_IDs (so 'S2' -> 1), not the
        # number embedded in the label.
        temp['subID'] = sub_number
        response_frames.append(temp)

# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(response_frames, ignore_index=True) if response_frames else pd.DataFrame()

Step 5: Make a CSV pooling all subject response data and corresponding stimulus-wise feature distances

In [8]:
# Attach the per-stimulus feature distances, language index and change instant
# to every response row, then write the pooled table to CSV.
DF = df.copy()

# Index the feature rows by stimulus name for constant-time lookup. Row 0 of
# feats_distance is the header and is skipped. If a name occurs twice, the
# later row wins, as in the previous full scan.
feats_by_name = {row[0]: row for row in feats_distance[1:]}

# Column 1 of the response data holds the stimulus path; strip the folder and
# the 4-character extension to obtain the key used in feats_distance.
stim_paths = DF[1].values if len(DF) else []
matched_rows = [
    feats_by_name.get(os.path.split(fname)[1][:-4]) for fname in stim_paths
]

# Responses whose stimulus has no feature row keep the value 0. The distance
# and changeIns columns are built as floats from the start, rather than
# writing floats into int-initialised columns.
for k, feat_name in enumerate(feats_column_name):
    DF[feat_name] = [0.0 if row is None else row[3 + k] for row in matched_rows]
DF['lang'] = [0 if row is None else row[1] for row in matched_rows]
DF['changeIns'] = [0.0 if row is None else row[2] for row in matched_rows]

DF.rename(columns={0: "fIndex", 1: "fname",2:'label',3:'response',4:'session',5:'t_start',6:"t_end",7:"t_resp"},
          inplace = True)

# save csv (create the output folder first; to_csv does not create it)
out_csv = './dataAnalysis/data/csvs/praat_pooled_subject_response_acoustic_feats_data_euc.csv'
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
DF.to_csv(out_csv,index=False)