In [15]:
# Load the GoogleNews word2vec embeddings (binary word2vec format) as gensim KeyedVectors.
# `model[word]` is used below to look up the vector for a lower-cased word.
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('/home1/esolo/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [26]:
# Build a dictionary of all possible words in the task and their 300-length word2vec vectors,
# then pickle it so later cells can load the features without the full embedding model.

## iEEG_FR_nouns.txt and wordpool.txt get you 390 unique words.
## I've found 461 unique words going through FR1 (after removing Spanish words)

import numpy as np
import pickle as pk

# One word per line. Strip only the line terminator: the previous `w[:-1]` slice
# chopped a real character off the last word whenever the file had no trailing newline.
with open('FR1_english_words.txt', 'r') as wordpool_file:  # iEEG_FR_nouns.txt
    wordpool1 = [w.rstrip('\n') for w in wordpool_file]
# NOTE(review): presumably entry 8 is 'AXE', which the embedding lacks under that
# spelling (later cells rename 'AXE' -> 'AX' by value) -- confirm the index is still right.
wordpool1[8] = 'AX'

# Keys keep the wordpool's original casing; the embedding lookup uses the lower-cased word.
wordpool_feats = {}
for w in wordpool1:
    wordpool_feats[w] = model[w.lower()]

# wordpool2 = open('wordpool.txt', 'r').readlines()
# wordpool2 = [w[:-1] for w in wordpool2]; 
# for w in wordpool2:
#     wordpool_feats[w] = model[w.lower()]

# Close the output file deterministically so the pickle is fully flushed to disk.
with open('wordpool_feats_461.pk', 'wb') as feats_file:
    pk.dump(wordpool_feats, feats_file)

In [28]:
# Scratch check: range(1, 12) and range(14, 25) each contain 11 values.
len(range(1, 12))
len(range(14,25))

11

11

In [30]:
import numpy as np
from copy import copy
from gensim.models import KeyedVectors
from cmlreaders import CMLReader, get_data_index
import itertools
from sklearn.decomposition import PCA
from scipy.stats import zscore
from scipy.spatial.distance import euclidean
import os
import pickle as pk
# Data index for the "r1" protocol; later cells filter it by the
# 'subject', 'experiment' and 'session' columns and read 'localization' / 'montage'.
df = get_data_index("r1")

In [40]:
## Example subject semantic clustering 

# Word -> feature-vector dictionary. Use a context manager so the file handle is closed.
with open('/home1/esolo/notebooks/Semantic_dimensions/wordpool_feats.pk', 'rb') as feats_file:
    model = pk.load(feats_file) # dictionary of words

#Load subjects information
arg = ['R1001P', 'FR1', 1]
s = arg[0]
exp = arg[1]
sess = arg[2]
loc = int(df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]['localization'])
mont = int(df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]['montage'])

sessions = df[np.logical_and(df["subject"] == s, df['experiment']==exp)]['session'].unique()

#Get task events
reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
evs = reader.load("events")
word_evs = evs[evs['type']=='WORD']

ndim = 1 # number of PC dimensions (Ethan usually found only 1 worked for theta/FC)
listnum = 4 # select list to look at for this session

#Get info from one list
list_dat = word_evs[word_evs['list']==listnum]
words = list(list_dat['item_name'])

#Project semantic features for this list onto ndim principal components
feats = np.array([model[w] 
                  for w in list_dat['item_name']
                 ])  #construct feature matrix from one list; # WORDs X 300 vecs
# Honour the ndim setting above (it was previously ignored in favour of a hard-coded 1).
pca = PCA(n_components=ndim)
pcs = pca.fit_transform(feats) # (# WORDs x ndim) array of PC scores

# get recalls (correct recalls only: intrusion == 0)
rec_evs = evs[(evs['type']=='REC_WORD') & (evs['list']==listnum) & (evs['intrusion']==0)]

# Get semantic cluster transition pairwise values 
serial_pos = [int(list_dat[list_dat['item_name']==w]['serialpos'])-1 
              for w in rec_evs['item_name']
             ] # serialpos starting at 0
serial_pos, repeats_removed = remove_repeats(serial_pos)
semantic_transition_scores = get_recall_clustering(pcs, serial_pos)
print('Semantic:')
semantic_transition_scores

# get temporal pairwise transition scores (item "position" is simply its serial position)
temporal_transition_scores = get_recall_clustering(np.arange(len(pcs)), serial_pos)
print('Temporal:')
temporal_transition_scores
print('0-indexed serial position:')
serial_pos
words

Semantic:


[0.8181818181818181]

Temporal:


[0.4545454545454546]

0-indexed serial position:


array([2, 7])

['MULE',
 'STORM',
 'HEN',
 'EGG',
 'SKI',
 'PEN',
 'FORT',
 'BEAK',
 'SAIL',
 'FARM',
 'FUR',
 'TRAIN']

In [42]:
pcs

array([[-0.35210073],
       [-1.0532515 ],
       [ 1.3383746 ],
       [ 1.6368098 ],
       [-1.5043539 ],
       [ 0.39463693],
       [-0.95285213],
       [ 1.7572211 ],
       [-1.1632018 ],
       [ 0.01127505],
       [ 1.1254106 ],
       [-1.2379668 ]], dtype=float32)

In [39]:
def get_recall_clustering(recall_cluster_values, recall_serial_pos):
    """Score every recall transition by how close it was relative to the alternatives.

    For each recall except the last, the distance actually travelled to the next
    recalled item is ranked against the distances from the current item to every
    item that has not been recalled yet. The score is
    ``1 - percentileofscore(candidates, travelled, kind='strict') / 100``, so 1.0
    means no available item was closer than the one recalled next.

    Parameters
    ----------
    recall_cluster_values : array-like, shape (n_items,) or (n_items, n_dims)
        Position of every list item along the clustering dimension(s), e.g.
        serial positions (temporal) or PC scores (semantic). Not modified.
    recall_serial_pos : sequence of int
        Zero-indexed item indices in recall order, e.g. [0, 2, 3, 5, 9, 6].
        Repeated recalls should be removed first: a recalled item is blanked
        out with NaN, so transitioning to it again has an undefined distance.

    Returns
    -------
    list of float
        One score per transition (``len(recall_serial_pos) - 1`` entries);
        an empty list when there are fewer than two recalls.
    """
    from scipy.stats import percentileofscore

    # np.array makes a private float copy, so NaN-ing recalled items below
    # never touches the caller's data.
    values = np.array(recall_cluster_values, dtype=float)
    if values.ndim == 1:
        # Scalar positions become 1-D points so both input shapes share one
        # distance computation (no reliance on scipy accepting scalar "vectors").
        values = values[:, np.newaxis]

    all_pcts = []
    for ridx in range(len(recall_serial_pos) - 1):  # every recall except the last
        current = recall_serial_pos[ridx]
        upcoming = recall_serial_pos[ridx + 1]

        # Euclidean distance from the current item to every list item.
        # Items recalled earlier are NaN rows and therefore give NaN distances.
        dists_from_current = np.sqrt(((values - values[current]) ** 2).sum(axis=1))
        true_trans = dists_from_current[upcoming]

        # Candidate transitions: every other item that is still available.
        dists = np.delete(dists_from_current, current)
        dists = dists[np.isfinite(dists)]

        pctrank = 1. - percentileofscore(dists, true_trans, kind='strict') / 100.
        all_pcts.append(pctrank)  # percentile rank within this list

        # The current item can no longer be a transition target.
        values[current] = np.nan

    return all_pcts

def remove_repeats(recall_serial_pos):
    """Drop repeated recalls, keeping only the first occurrence of each item.

    recall_serial_pos: sequence of serial positions in recall order.

    Returns a tuple ``(kept, idx_removed)``: ``kept`` is a numpy array of the
    positions with later duplicates filtered out (order preserved), and
    ``idx_removed`` is the list of indices into the input that were dropped.
    """
    already_recalled = []
    idx_removed = []
    keep_flags = []
    for where, item in enumerate(recall_serial_pos):
        is_repeat = item in already_recalled
        if is_repeat:
            idx_removed.append(where)
        keep_flags.append(not is_repeat)
        already_recalled.append(item)

    keep_mask = np.array(keep_flags, dtype=bool)
    return np.array(recall_serial_pos)[keep_mask], idx_removed

In [46]:
def get_clustering_scores(arg, ndim=12):
    """Semantic clustering scores for one session, plus a permutation null.

    Relies on notebook-level names: ``df`` (data index), ``model`` (word ->
    feature-vector dictionary, e.g. loaded from wordpool_feats.pk) and the
    helpers ``get_recall_clustering`` / ``remove_repeats``.

    Parameters
    ----------
    arg : sequence
        ``[subject, experiment, session]``.
    ndim : int, default 12
        Number of principal components kept when projecting each list's word
        features.

    For every list with at least 3 correct recalls, iteration 0 scores the
    observed recall order and iterations 1-250 score a random draw of the same
    number of list positions. One mean score per list and iteration is saved to
    ``/scratch/esolo/grids/<subject>/<session>/semantic_clustering_<ndim>dims.npy``.
    Returns None; failures are reported with ``print`` instead of raising.
    """

    import numpy as np
    import pickle as pk
    from copy import copy
    from cmlreaders import CMLReader, get_data_index
    import itertools
    from sklearn.decomposition import PCA
    from scipy.stats import zscore
    from scipy.spatial.distance import euclidean
    import os

    try:

        #Load subjects information
        s = arg[0]
        exp = arg[1]
        sess = arg[2]
        session_row = df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]
        loc = int(session_row['localization'])
        mont = int(session_row['montage'])

        #Get task events
        reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
        evs = reader.load("events")
        word_evs = evs[evs['type']=='WORD']

        all_sem = []

        # The projection depends only on the list, not on the permutation index,
        # so fit it once per list instead of 251 times.
        # NOTE(review): assumes PCA.fit_transform is deterministic for these
        # inputs -- confirm for the SVD solver in use.
        pcs_by_list = {}

        for i in range(251):  # i == 0: observed recall order; i >= 1: null draws
            list_sem_sc = []

            for listnum in word_evs['list'].unique():
                try:
                    #Get info from one list
                    list_dat = word_evs[word_evs['list']==listnum]
                    words = np.array(list_dat['item_name'])
                    if 'AXE' in words:
                        words[words=='AXE']='AX'  #seems to not have this spelling of ax

                    #Project semantic features for this list onto ndim components
                    if listnum not in pcs_by_list:
                        feats = np.array([model[w] for w in words])  #construct feature matrix from one list
                        pcs_by_list[listnum] = PCA(n_components=ndim).fit_transform(feats)
                    pcs = pcs_by_list[listnum]

                    #Get recall events and their serial positions
                    rec_evs = evs[(evs['type']=='REC_WORD') & (evs['list']==listnum) & (evs['intrusion']==0)]
                    if len(rec_evs)<3: #don't use lists with fewer than 3 recalls
                        continue
                    serial_pos = [int(list_dat[list_dat['item_name']==w]['serialpos'])-1 for w in rec_evs['item_name']]
                    serial_pos, repeats_removed = remove_repeats(serial_pos)

                    #Semantic clustering, randomly draw same number of recalls
                    if i == 0:
                        recall_order = serial_pos
                    else:
                        foo = np.arange(len(words))  # all list positions (was hard-coded to 12)
                        np.random.shuffle(foo)
                        recall_order = foo[:len(serial_pos)]

                    # Pass the (n_words, ndim) matrix itself: ravel() turned every PC
                    # value into its own fake "item" whenever ndim > 1.
                    # np.mean collapses a per-transition list to one score per list and
                    # leaves an already-averaged scalar unchanged, keeping all_sem rectangular.
                    list_sem_sc.append(np.mean(get_recall_clustering(pcs, recall_order)))

                    #Temporal clustering (shuffle the actual recall order) is currently disabled:
#                     foo = copy(serial_pos)
#                     if i == 0:
#                         list_temp_sc.append(get_recall_clustering(np.arange(len(pcs)), serial_pos))
#                     else:
#                         np.random.shuffle(foo)
#                         tmp = foo
#                         list_temp_sc.append(get_recall_clustering(np.arange(len(pcs)), tmp))

                except Exception as err:
                    # Best effort: skip lists that cannot be scored, but say why once
                    # (the same failure would otherwise repeat on all 251 iterations).
                    if i == 0:
                        print('Skipping list '+str(listnum)+': '+repr(err))
                    continue

            all_sem.append(list_sem_sc)
            print(i)

        #Create output directory if needed (also creates missing parents; no error if it exists)
        out_dir = '/scratch/esolo/grids/'+s+'/'+str(sess)+'/'
        os.makedirs(out_dir, exist_ok=True)

        #Save full output
        np.save(out_dir+'semantic_clustering_'+str(ndim)+'dims.npy', np.array(all_sem)) 
    except Exception as err:
        # Keep the original "never raise" contract, but do not hide the reason.
        print('get_clustering_scores failed for '+str(arg)+': '+repr(err))
        return

In [3]:
# Build one [subject, 'FR1', session] job argument per FR1 row of the data index.
# `args` is what the cluster `view.map(...)` calls below iterate over.
FR_subs = df[df['experiment']=='FR1']
args = []
for i in range(len(FR_subs)):
    # Row-wise lookup by integer position; each row contributes one session.
    s = FR_subs.iloc()[i]['subject']
    sess = FR_subs.iloc()[i]['session']
    args.append([s, 'FR1', sess])

In [6]:
def dims_par_explainedVar(arg):
    """Save the PCA explained-variance ratios of one session's word features.

    Parameters
    ----------
    arg : sequence
        ``[subject, experiment, session]``.

    A 25-component PCA is fit on the feature vectors of every WORD event in the
    session (session-level wordpool), and its ``explained_variance_ratio_`` is
    saved to
    ``/scratch/esolo/Semantic_dimensions/<subject>/<session>/explainedVar_session.npy``.
    Returns None. If loading or fitting fails, the reason is printed and
    nothing is saved.
    """

    import numpy as np
    import pickle as pk
    from copy import copy
    from cmlreaders import CMLReader, get_data_index
    import itertools
    from sklearn.decomposition import PCA
    from scipy.stats import zscore
    from scipy.spatial.distance import euclidean
    import os

    df = get_data_index("r1")
    with open('/home1/esolo/notebooks/Semantic_dimensions/wordpool_feats.pk', 'rb') as feats_file:
        model = pk.load(feats_file)
    all_explainedVar = []

    for d_ in [25]:

        ndim = d_

        try:

            #Load subjects information
            s = arg[0]
            exp = arg[1]
            sess = arg[2]
            session_row = df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]
            loc = int(session_row['localization'])
            mont = int(session_row['montage'])

            #Get task events
            reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
            evs = reader.load("events")
            word_evs = evs[evs['type']=='WORD']

            #Get PCA dims for *session-level* wordpool
            words = np.array(word_evs['item_name'])
            if 'AXE' in words:
                words[words=='AXE']='AX'  #seems to not have this spelling of ax
            word_mat = np.array([model[w] for w in words])
            pca = PCA(n_components=ndim)
            pca.fit(word_mat)
            all_explainedVar.append(copy(pca.explained_variance_ratio_))

        except Exception as err:
            # Best effort per session: report why this one was skipped, then give up on it.
            print('dims_par_explainedVar failed for '+str(arg)+': '+repr(err))
            return

    # An earlier variant of this analysis fit the PCA separately on each list
    # (word_evs['list'].unique()) instead of on the whole session.

    # np.save does not create directories, so make sure the target exists first.
    out_dir = '/scratch/esolo/Semantic_dimensions/'+s+'/'+str(sess)+'/'
    os.makedirs(out_dir, exist_ok=True)
    np.save(out_dir+'explainedVar_session.npy', np.array(all_explainedVar))

In [7]:
# Map dims_par_explainedVar over every entry of `args` through an SGE cluster view
# (queue RAM.q, num_jobs=300, one core per job). The return value of view.map is discarded;
# the function itself writes its result to disk.
import cluster_helper.cluster
with cluster_helper.cluster.cluster_view(scheduler="sge", queue="RAM.q", num_jobs=300, cores_per_job=1) as view:
    view.map(dims_par_explainedVar, args)

277 Engines running
Sending a shutdown signal to the controller and engines.


OSError: [Errno 16] Device or resource busy: '.nfs0000002601db588200000017'

In [5]:
def dims_par_clustering(arg):
    """Run list-level semantic clustering for one session at several PCA dimensionalities.

    arg is a [subject, experiment, session] triple. For every d_ in range(1, 12)
    a fresh get_clustering_scores closure (with ndim defaulting to d_) is defined
    and called. Each call saves one array per ndim under
    /scratch/esolo/Semantic_dimensions/<subject>/<session>/.
    All imports and helpers live inside the function body, presumably so it is
    self-contained when mapped onto cluster engines -- TODO confirm.
    Returns None.
    """

    for d_ in range(1, 12):

        def get_clustering_scores(arg, ndim=d_):
            # Scores every usable list of the session 251 times: iteration 0 uses the
            # observed recall order, iterations 1-250 a shuffled copy of that order.

            import numpy as np
            import pickle as pk
            from copy import copy
            from cmlreaders import CMLReader, get_data_index
            import itertools
            from sklearn.decomposition import PCA
            from scipy.stats import zscore
            from scipy.spatial.distance import euclidean
            import os

            def get_recall_clustering(positions, recalls):
                from scipy.stats import percentileofscore
                #Get temporal/semantic clustering scores. 

                #Positions: array of semantic/temporal values
                #Recalls: array of indices for true recall sequence (zero indexed), e.g. [0, 2, 3, 5, 9, 6]
                #Returns: mean over transitions of 1 - (strict percentile rank of the travelled distance)/100

                # Work on a float copy so recalled items can be blanked out with NaN.
                positions = copy(np.array(positions).astype(float))
                all_pcts = []
                all_possible_trans = list(itertools.combinations(range(len(positions)), 2))
                for ridx in np.arange(len(recalls)-1):  #Loops through each recall event, except last one
                    # All item pairs that involve the item recalled at this step.
                    possible_trans = [comb for comb in all_possible_trans if (recalls[ridx] in comb)]
                    dists = []
                    for c in possible_trans:
                        try:
                            dists.append(euclidean(positions[c[0]], positions[c[1]]))
                        except:
                            #If we did this transition, then it's a NaN, so append a NaN
                            # NOTE(review): bare except also hides unrelated errors from euclidean.
                            dists.append(np.nan)
                    dists = np.array(dists)
                    # Drop pairs involving already-recalled (NaN) items.
                    dists = dists[np.isfinite(dists)]

                    true_trans = euclidean(positions[recalls[ridx]], positions[recalls[ridx+1]])
                    # kind='strict': percentage of candidate distances strictly below the one travelled.
                    pctrank = 1.-percentileofscore(dists, true_trans, kind='strict')/100.
                    all_pcts.append(pctrank)

                    # The item just left can no longer be a transition target.
                    positions[recalls[ridx]] = np.nan

                return np.mean(all_pcts)

            def remove_repeats(recalls):
                #Takes array of serial positions and remove second instance of a repeated word
                #Returns (array without repeats, list of removed indices)
                items_to_keep = np.ones(len(recalls)).astype(bool)
                items_seen = []
                idx_removed = []
                for idx in range(len(recalls)):
                    if recalls[idx] in items_seen:
                        items_to_keep[idx] = False
                        idx_removed.append(idx)
                    items_seen.append(recalls[idx])

                final_vec = np.array(recalls)[items_to_keep]
                return final_vec, idx_removed

            # NOTE(review): get_session_model / get_session_PCs are only referenced by
            # commented-out lines below (session-level PCA alternative).
            def get_session_model(words, ndim):

                #Get PCA dims for *session-level* wordpool
                #Returns (dict word -> PC score vector, explained variance ratios)
                words = np.array(words)
                if 'AXE' in words:
                    words[words=='AXE']='AX'  #seems to not have this spelling of ax
                word_mat = np.array([model[w] for w in words])
                pca = PCA(n_components=ndim)
                pcs = pca.fit_transform(word_mat)
                exp_var = pca.explained_variance_ratio_
                new_model = {}
                for idx, w in enumerate(words):
                    new_model[w] = pcs[idx, :]

                return new_model, exp_var

            def get_session_PCs(word_evs, new_model, listnum):
                #Look up the session-level PC scores for the words of one list.

                list_dat = word_evs[word_evs['list']==listnum]
                list_words = np.array(list_dat['item_name'])
                if 'AXE' in list_words:
                    list_words[list_words=='AXE']='AX'  #seems to not have this spelling of ax

                #Get semantic positions from new_model
                pcs = np.array([new_model[w_] for w_ in list_words])

                return pcs

            # NOTE(review): the feature dictionary and data index are reloaded for every ndim;
            # the file opened here is never explicitly closed.
            model = pk.load(open('/home1/esolo/notebooks/Semantic_dimensions/wordpool_feats.pk', 'rb'))
            df = get_data_index("r1")

            try:

                #Load subjects information
                s = arg[0]
                exp = arg[1]
                sess = arg[2]
                loc = int(df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]['localization'])
                mont = int(df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]['montage'])

                # NOTE(review): `sessions` is not used afterwards.
                sessions = df[np.logical_and(df["subject"] == s, df['experiment']==exp)]['session'].unique()

                #Get task events
                reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
                evs = reader.load("events")
                word_evs = evs[evs['type']=='WORD']

                #new_model, exp_var = get_session_model(np.array(word_evs['item_name']), ndim)

                all_trans = []
                all_temp = []
                all_sem = []

                # i == 0 scores the observed recall order; i = 1..250 score shuffled orders.
                for i in range(251):
                    list_sem_sc = []
                    list_temp_sc = []
                    words_done = []

                    all_expVar = []
                    for listnum in word_evs['list'].unique():
                        try:
                            #Get info from one list
                            list_dat = word_evs[word_evs['list']==listnum]
                            words = np.array(list_dat['item_name'])
                            if 'AXE' in words:
                                words[words=='AXE']='AX'  #seems to not have this spelling of ax

                            #Project semantic features for this list onto ndim components
                            # NOTE(review): this fit is repeated for every i although its inputs do not depend on i.
                            feats = np.array([model[w] for w in words])  #construct feature matrix from one list
                            pca = PCA(n_components=ndim)
                            pcs = pca.fit_transform(feats)

                            #pcs = get_session_PCs(word_evs, new_model, listnum)

                            #Get recall events and their semantic values 
                            rec_evs = evs[(evs['type']=='REC_WORD') & (evs['list']==listnum) & (evs['intrusion']==0)]
                            if len(rec_evs)<4: #don't use lists with fewer than N recalls
                                continue
                            # Zero-indexed serial position of every correctly recalled word.
                            serial_pos = [int(list_dat[list_dat['item_name']==w]['serialpos'])-1 for w in rec_evs['item_name']]
                            serial_pos, repeats_removed = remove_repeats(serial_pos)

                            #Get temporal and semantic clustering scores

                            #Semantic clustering; null = the same recalled items in shuffled order
                            sem_sc = []
                            #foo = np.arange(12)
                            foo = copy(serial_pos)
                            if i == 0: 
                                list_sem_sc.append(get_recall_clustering(pcs, serial_pos))
                            else:
                                np.random.shuffle(foo)
                                #tmp = foo[:len(serial_pos)]
                                tmp = foo
                                list_sem_sc.append(get_recall_clustering(pcs, tmp))

                            #Temporal clustering, shuffle the actual recall order
#                             temp_sc = []
#                             foo = copy(serial_pos)
#                             if i == 0:
#                                 list_temp_sc.append(get_recall_clustering(np.arange(len(pcs)), serial_pos))
#                             else:
#                                 np.random.shuffle(foo)
#                                 tmp = foo
#                                 list_temp_sc.append(get_recall_clustering(np.arange(len(pcs)), tmp))

                        except:
                            # NOTE(review): any error for this list is silently skipped.
                            continue

                    #all_temp.append(list_temp_sc)
                    all_sem.append(list_sem_sc)
                    print(i)

                #Create new directories if needed
                try:
                    os.mkdir('/scratch/esolo/Semantic_dimensions/'+s+'/')
                except:
                    pass
                try:
                    os.mkdir('/scratch/esolo/Semantic_dimensions/'+s+'/'+str(sess)+'/')
                except:
                    pass

                #Save full output: one row per iteration i, one entry per scored list
                #np.save('/scratch/esolo/Semantic_dimensions/'+s+'/'+str(sess)+'/temporal_clustering.npy', np.array(all_temp))
                np.save('/scratch/esolo/Semantic_dimensions/'+s+'/'+str(sess)+'/semantic_clustering_'+str(ndim)+'dims_min4recalls_list_altZscore.npy', np.array(all_sem)) 
            except:
                # NOTE(review): any failure (including a failed save) ends this ndim silently.
                return

        # Called with ndim left at its default, i.e. the current d_.
        get_clustering_scores(arg)
    

# Map dims_par_clustering over every entry of `args` through an SGE cluster view
# (queue RAM.q, num_jobs=300, one core per job). Results are written to disk by the jobs.
import cluster_helper.cluster
with cluster_helper.cluster.cluster_view(scheduler="sge", queue="RAM.q", num_jobs=300, cores_per_job=1) as view:
    view.map(dims_par_clustering, args)

173 Engines running
Sending a shutdown signal to the controller and engines.


OSError: [Errno 16] Device or resource busy: '.nfs00000036019998220000001c'

In [10]:
def dims_par_clustering_matrix(arg):
    """WordNet-based semantic clustering for one session, with a permutation null.

    Parameters
    ----------
    arg : sequence
        ``[subject, experiment, session]``.

    For every list with at least 4 correct recalls, iteration 0 scores the
    observed recall order and iterations 1-250 score a random draw of the same
    number of list positions, all against a word-by-word Wu-Palmer matrix. One
    mean score per list and iteration is saved to
    ``/scratch/esolo/Semantic_dimensions/<subject>/<session>/semantic_clustering_min4recalls_wordNet.npy``.
    Returns None; failures are reported with ``print`` instead of raising.
    """

    def get_clustering_scores(arg):

        import numpy as np
        import pickle as pk
        from copy import copy
        from cmlreaders import CMLReader, get_data_index
        import itertools
        from scipy.stats import zscore
        import os

        def get_recall_clustering_matrix(positions, recalls):
            from scipy.stats import percentileofscore
            import itertools

            #Get semantic clustering scores using a pre-defined matrix of distances (e.g. WordNet)

            #Positions: matrix of word-word distances (NxN). Should already be symmetrized.
            #Recalls: array of indices for true recall sequence (zero indexed), e.g. [0, 2, 3, 5, 9, 6]
            #Returns: mean over transitions of 1 - (strict percentile rank of the transition value)/100

            # Private float copy: rows/cols are NaN-ed below and the caller's matrix
            # is reused across permutations, so it must not be modified.
            positions = np.array(positions, dtype=float)

            all_pcts = []
            all_possible_trans = list(itertools.combinations(range(positions.shape[0]), 2))
            for ridx in np.arange(len(recalls)-1):  #Loops through each recall event, except last one
                possible_trans = [comb for comb in all_possible_trans if (recalls[ridx] in comb)]
                #could be collecting NaNs (already-recalled words), but they are filtered out next
                dists = np.array([positions[c[0], c[1]] for c in possible_trans])
                dists = dists[np.isfinite(dists)]

                true_trans = positions[recalls[ridx], recalls[ridx+1]]
                pctrank = 1.-percentileofscore(dists, true_trans, kind='strict')/100.
                all_pcts.append(pctrank)

                #NaN out cols/rows for recalled word
                positions[recalls[ridx], :] = np.nan
                positions[:, recalls[ridx]] = np.nan

            return np.mean(all_pcts)

        def remove_repeats(recalls):
            #Takes array of serial positions and remove second instance of a repeated word
            #Returns (array without repeats, list of removed indices)
            items_to_keep = np.ones(len(recalls)).astype(bool)
            items_seen = []
            idx_removed = []
            for idx in range(len(recalls)):
                if recalls[idx] in items_seen:
                    items_to_keep[idx] = False
                    idx_removed.append(idx)
                items_seen.append(recalls[idx])

            final_vec = np.array(recalls)[items_to_keep]
            return final_vec, idx_removed

        def create_position_matrix(word_list):
            #Word-by-word matrix of Wu-Palmer values between the first noun synset of each word.
            # NOTE(review): wup_similarity is a *similarity* (larger = more related), while
            # get_recall_clustering_matrix ranks its input as a distance -- confirm whether
            # the matrix should be converted (e.g. 1 - sim) before scoring.
            from nltk.corpus import wordnet as wn

            # Look every synset up once instead of once per matrix cell.
            synsets = [wn.synset(w.lower()+'.n.01') for w in word_list]

            mat = np.empty([len(word_list), len(word_list)])
            mat[:] = np.nan

            for idx1, word1 in enumerate(synsets):
                for idx2, word2 in enumerate(synsets):
                    mat[idx1, idx2] = word1.wup_similarity(word2)

            return mat

        df = get_data_index("r1")

        try:

            #Load subjects information
            s = arg[0]
            exp = arg[1]
            sess = arg[2]
            session_row = df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]
            loc = int(session_row['localization'])
            mont = int(session_row['montage'])

            #Get task events
            reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
            evs = reader.load("events")
            word_evs = evs[evs['type']=='WORD']

            all_sem = []

            # The WordNet matrix depends only on the list's words, so build it once per
            # list rather than on each of the 251 iterations (safe because the scoring
            # helper works on its own copy).
            pmat_by_list = {}

            for i in range(251):  # i == 0: observed recall order; i >= 1: null draws
                list_sem_sc = []

                for listnum in word_evs['list'].unique():
                    try:
                        #Get info from one list
                        list_dat = word_evs[word_evs['list']==listnum]
                        words = np.array(list_dat['item_name'])
                        if 'AXE' in words:
                            words[words=='AXE']='AX'  #seems to not have this spelling of ax

                        if listnum not in pmat_by_list:
                            pmat_by_list[listnum] = create_position_matrix(words)
                        pmat = pmat_by_list[listnum]

                        #Get recall events and their serial positions
                        rec_evs = evs[(evs['type']=='REC_WORD') & (evs['list']==listnum) & (evs['intrusion']==0)]
                        if len(rec_evs)<4: #don't use lists with fewer than N recalls
                            continue
                        serial_pos = [int(list_dat[list_dat['item_name']==w]['serialpos'])-1 for w in rec_evs['item_name']]
                        serial_pos, repeats_removed = remove_repeats(serial_pos)

                        #Semantic clustering, randomly draw same number of recalls
                        if i == 0: 
                            list_sem_sc.append(get_recall_clustering_matrix(pmat, serial_pos))
                        else:
                            foo = np.arange(len(words))  # all list positions (was hard-coded to 12)
                            np.random.shuffle(foo)
                            tmp = foo[:len(serial_pos)]
                            list_sem_sc.append(get_recall_clustering_matrix(pmat, tmp))

                        #Temporal clustering (shuffling the actual recall order) is disabled in this variant.

                    except Exception as err:
                        # Best effort: skip lists that cannot be scored (e.g. a word without a
                        # '.n.01' synset), but report the reason once rather than 251 times.
                        if i == 0:
                            print('Skipping list '+str(listnum)+': '+repr(err))
                        continue

                all_sem.append(list_sem_sc)
                print(i)

            #Create the directory that is actually written to. Previously directories were
            #made under /scratch/esolo/grids/ while the file was saved under
            #/scratch/esolo/Semantic_dimensions/, so the save could fail silently.
            out_dir = '/scratch/esolo/Semantic_dimensions/'+s+'/'+str(sess)+'/'
            os.makedirs(out_dir, exist_ok=True)

            #Save full output
            np.save(out_dir+'semantic_clustering_min4recalls_wordNet.npy', np.array(all_sem)) 
        except Exception as err:
            # Keep the original "never raise" contract, but do not hide the reason.
            print('dims_par_clustering_matrix failed for '+str(arg)+': '+repr(err))
            return

    get_clustering_scores(arg)
    

# Map dims_par_clustering_matrix over every entry of `args` through an SGE cluster view
# (queue RAM.q, num_jobs=300, one core per job). Results are written to disk by the jobs.
import cluster_helper.cluster
with cluster_helper.cluster.cluster_view(scheduler="sge", queue="RAM.q", num_jobs=300, cores_per_job=1) as view:
    view.map(dims_par_clustering_matrix, args)

300 Engines running
Sending a shutdown signal to the controller and engines.


OSError: [Errno 16] Device or resource busy: '.nfs0000001a00dc700a0000001b'

In [66]:
def dims_par_clustering_session(arg):
    """Save semantic clustering scores for one session at several PCA sizes.

    For every ndim in range(14, 25), the word vectors of all words presented
    in the session are projected onto `ndim` principal components (PCA fitted
    on that session's words only).  Each list with at least 3 non-intrusion
    REC_WORD events then gets one clustering score per iteration:

    * iteration 0 uses the true recall order (repeats removed);
    * iterations 1-250 use a random draw of the same number of serial
      positions from the list, giving a null distribution.

    Parameters
    ----------
    arg : sequence
        (subject, experiment, session), e.g. ['R1001P', 'FR1', 1].

    Returns
    -------
    None
        One array per ndim (rows = iterations, columns = scored lists) is
        written to
        /scratch/esolo/Semantic_dimensions/<subject>/<session>/
        semantic_clustering_<ndim>dims_min3recalls_session.npy

    Notes
    -----
    * Best-effort by design: a session that cannot be loaded, an ndim that
      fails, or a list that cannot be scored is skipped (with a printed
      message for the first two) instead of raising.  Failing to read the
      word-vector pickle or the data index still raises.
    * The 3-recall threshold is applied before repeated recalls are removed,
      as in the original code, so a list can end up with fewer unique recalls.
    * Shuffles use the global np.random state, which is not seeded here.
    * Temporal clustering is not computed; the earlier commented-out variant
      scored get_recall_clustering(np.arange(len(pcs)), serial_pos) against
      shuffles of the actual recall order.
    """
    # Imports stay inside the function, as in the original -- presumably so
    # the function is self-contained when view.map ships it to remote engines.
    import itertools
    import os
    import pickle as pk

    import numpy as np
    from cmlreaders import CMLReader, get_data_index
    from scipy.spatial.distance import euclidean
    from scipy.stats import percentileofscore
    from sklearn.decomposition import PCA

    ndims_to_run = range(14, 25)   # PCA dimensionalities to evaluate
    n_iterations = 251             # iteration 0 = true recalls, 1..250 = null draws
    min_recalls = 3                # matches "min3recalls" in the output file name
    model_path = '/home1/esolo/notebooks/Semantic_dimensions/wordpool_feats.pk'
    out_root = '/scratch/esolo/Semantic_dimensions/'

    def get_recall_clustering(positions, recalls):
        """Mean percentile-rank clustering score for one recall sequence.

        positions: array of semantic/temporal values, one row per list item.
        recalls:   zero-indexed item indices in recall order,
                   e.g. [0, 2, 3, 5, 9, 6].

        For each transition, the distance actually travelled is ranked among
        the distances of all pairs that involve the just-recalled item and
        whose items have not been recalled yet.  The score of a transition is
        1 - percentile/100, so shorter-than-typical transitions score higher.
        """
        # np.array(..., dtype=float) copies, so the caller's array is untouched
        # when recalled items are blanked out below.
        positions = np.array(positions, dtype=float)
        all_pcts = []
        all_possible_trans = list(itertools.combinations(range(len(positions)), 2))
        for ridx in range(len(recalls) - 1):  # every recall except the last
            current = recalls[ridx]
            dists = []
            for first, second in all_possible_trans:
                if current != first and current != second:
                    continue
                try:
                    dists.append(euclidean(positions[first], positions[second]))
                except Exception:
                    # Already-recalled items are NaN; treat the pair as unavailable.
                    dists.append(np.nan)
            dists = np.array(dists)
            dists = dists[np.isfinite(dists)]

            true_trans = euclidean(positions[current], positions[recalls[ridx + 1]])
            all_pcts.append(1. - percentileofscore(dists, true_trans) / 100.)

            # Remove the item we just left from all later comparisons.
            positions[current] = np.nan

        return np.mean(all_pcts)

    def remove_repeats(recalls):
        """Drop repeated serial positions, keeping each first occurrence.

        Returns (de-duplicated array, list of removed indices).
        """
        items_to_keep = np.ones(len(recalls), dtype=bool)
        items_seen = set()
        idx_removed = []
        for idx, item in enumerate(recalls):
            if item in items_seen:
                items_to_keep[idx] = False
                idx_removed.append(idx)
            items_seen.add(item)
        return np.array(recalls)[items_to_keep], idx_removed

    # NOTE: pickle can execute arbitrary code on load; only use a trusted file.
    with open(model_path, 'rb') as fh:
        model = pk.load(fh)
    df = get_data_index("r1")

    # ---- Load everything that does not depend on ndim, once per session ----
    try:
        s, exp, sess = arg[0], arg[1], arg[2]
        sess_rows = df[(df['subject']==s) & (df['session']==sess) & (df['experiment']==exp)]
        if len(sess_rows) != 1:
            raise ValueError('expected exactly one data-index row, found ' + str(len(sess_rows)))
        loc = int(sess_rows['localization'].iloc[0])
        mont = int(sess_rows['montage'].iloc[0])

        # Get task events
        reader = CMLReader(s, exp, sess, montage=mont, localization=loc)
        evs = reader.load("events")
        word_evs = evs[evs['type']=='WORD']

        # Session-level wordpool; the word-vector dictionary spells AXE as 'AX'.
        words = np.array(word_evs['item_name'])
        if 'AXE' in words:
            words[words=='AXE'] = 'AX'
        word_mat = np.array([model[w] for w in words])
    except Exception as err:
        print('dims_par_clustering_session: could not load ' + str(arg) + ': ' + repr(err))
        return

    # Per-list inputs are identical for every ndim and every iteration, so
    # build them once: (normalised list words, de-duplicated recall indices).
    list_recalls = []
    for listnum in word_evs['list'].unique():
        try:
            list_dat = word_evs[word_evs['list']==listnum]
            list_words = np.array(list_dat['item_name'])
            if 'AXE' in list_words:
                list_words[list_words=='AXE'] = 'AX'

            rec_evs = evs[(evs['type']=='REC_WORD') & (evs['list']==listnum) & (evs['intrusion']==0)]
            if len(rec_evs) < min_recalls:  # don't use lists with fewer than 3 recalls
                continue
            serial_pos = [
                int(list_dat[list_dat['item_name']==w]['serialpos'].iloc[0]) - 1
                for w in rec_evs['item_name']
            ]
            serial_pos, _ = remove_repeats(serial_pos)
            list_recalls.append((list_words, serial_pos))
        except Exception:
            # Best-effort: a list whose recalls cannot be resolved is skipped.
            continue

    def get_clustering_scores(ndim):
        """Score every usable list at one PCA dimensionality and save the array."""
        pcs = PCA(n_components=ndim).fit_transform(word_mat)

        # Key by the *normalised* words so lists containing AXE/'AX' resolve;
        # keying by the raw item names made those lists fail with a KeyError
        # that was silently swallowed.
        new_model = {w: pcs[idx, :] for idx, w in enumerate(words)}
        list_positions = [
            np.array([new_model[w_] for w_ in list_words])
            for list_words, _ in list_recalls
        ]

        all_sem = []
        for i in range(n_iterations):
            list_sem_sc = []
            for positions, (list_words, serial_pos) in zip(list_positions, list_recalls):
                try:
                    if i == 0:
                        recalls = serial_pos
                    else:
                        # Null draw: same number of recalls, random items from
                        # this list (sized by the list, not a hard-coded 12).
                        draw = np.arange(len(list_words))
                        np.random.shuffle(draw)
                        recalls = draw[:len(serial_pos)]
                    list_sem_sc.append(get_recall_clustering(positions, recalls))
                except Exception:
                    continue
            all_sem.append(list_sem_sc)
            print(i)

        # Create the output directory (and any missing parents) if needed
        out_dir = out_root + s + '/' + str(sess) + '/'
        os.makedirs(out_dir, exist_ok=True)

        # Save full output
        np.save(out_dir + 'semantic_clustering_' + str(ndim) + 'dims_min3recalls_session.npy',
                np.array(all_sem))

    for d_ in ndims_to_run:
        try:
            get_clustering_scores(d_)
        except Exception as err:
            # One failing dimensionality must not stop the remaining ones.
            print('dims_par_clustering_session: ndim=' + str(d_) + ' failed for '
                  + str(arg) + ': ' + repr(err))
    

import cluster_helper.cluster

# Run dims_par_clustering_session once per entry of `args` (defined elsewhere
# in the notebook) through a cluster_helper view: SGE scheduler, RAM.q queue,
# 300 jobs with one core each.
with cluster_helper.cluster.cluster_view(
    scheduler="sge",
    queue="RAM.q",
    num_jobs=300,
    cores_per_job=1,
) as view:
    view.map(dims_par_clustering_session, args)

37 Engines running
Sending a shutdown signal to the controller and engines.


OSError: [Errno 16] Device or resource busy: '.nfs0000000b018a401900000077'