### Check for correlations in wordpools

In [1]:
# imports
import itertools
import json
import warnings; warnings.filterwarnings("ignore")

import cmlreaders as cml
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import scipy.stats
import xarray as xr
from tqdm.notebook import tqdm

from analyses import analyses_behavioral

[nltk_data] Downloading package cmudict to
[nltk_data]     /home1/hherrema/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
# pronunciation lookup passed to the phonetic similarity functions in wordpool_psim_ssim
# (explicit encoding so the JSON is read the same way regardless of platform default)
with open('pronouncing_dictionaries/pronouncing_cml.json', 'r', encoding='utf-8') as f:
    pronouncing_cml = json.load(f)

# word2vec semantic similarities
sem_sim_cml = pd.read_csv('semantic_similarity/semantic_cml.csv')

In [2]:
# index of available sessions, as returned by cmlreaders
df = cml.get_data_index()

# sessions used for the intracranial wordpool
df_intrac = df.loc[df['experiment'].isin(['FR1', 'pyFR', 'IFR1'])]

# sessions used for the scalp wordpool: ltpFR2, leaving out session 23
df_scalp = df.loc[lambda d: (d['experiment'] == 'ltpFR2') & (d['session'] != 23)]

In [10]:
def find_wordpool(df_sel):
    """
    Collect the unique words presented across the sessions listed in df_sel.

    Parameters
    ----------
    df_sel : pd.DataFrame
        One row per session; the columns subject, experiment, session,
        localization and montage are read from each row.

    Returns
    -------
    np.ndarray
        Sorted unique item names taken from the 'WORD' events of every
        session that was not excluded and whose events could be loaded.

    Notes
    -----
    Sessions whose events fail to load (or lack the expected columns) are
    skipped.  Only ``Exception`` is caught, so KeyboardInterrupt / SystemExit
    still stop the loop.
    """
    wordpool = []
    spanish_subs = ['R1039M', 'R1070T', 'R1094T', 'R1134T', 'R1331T', 'R1461T', 'R1499T']
    pyfr_prefixes = ['UP', 'TJ', 'CP', 'BW', 'CH']

    # total lets the progress bar show completion instead of a bare counter
    for _, row in tqdm(df_sel.iterrows(), total=len(df_sel)):
        # only english subjects
        if row.subject in spanish_subs or (row.experiment == 'pyFR' and row.subject[:2] not in pyfr_prefixes):
            continue

        # skip duplicate sessions
        elif row.subject == 'R1100D' and row.experiment == 'FR1' and row.session == 0:
            continue
        elif row.subject == 'R1275D' and row.experiment == 'FR1' and row.session == 3:
            continue

        # R1030J only subject in multiple experiments
        elif row.subject == 'R1030J' and row.experiment == 'IFR1':
            continue

        reader = cml.CMLReader(row.subject, row.experiment, row.session, row.localization, row.montage)

        try:
            evs = reader.load('events')
            word_evs = evs[evs['type'] == 'WORD']

            # pyFR events store the word under 'item'; the other experiments use 'item_name'
            if row.experiment == 'pyFR':
                words = word_evs['item'].unique()
            else:
                words = word_evs['item_name'].unique()

            wordpool.extend(list(words))

        except Exception:
            # best effort: a session that cannot be read contributes no words
            continue

    return np.unique(wordpool)

In [32]:
def wordpool_psim_ssim(wordpool, sem_sim_cml):
    """
    Build a table of phonetic and semantic similarity for every pair of words in a wordpool.

    Parameters
    ----------
    wordpool : sequence of str
        Words to pair up; pairs are generated with itertools.combinations, in wordpool order.
    sem_sim_cml : pd.DataFrame
        Long-format similarities with columns word_i, word_j and cosine_similarity.

    Returns
    -------
    pd.DataFrame
        One row per pair with columns word_i, word_j, psim_H, psim_J, ssim.

    Reads the notebook-level ``pronouncing_cml`` dictionary.
    """
    # keep only similarities between wordpool members, then pivot to a
    # word_i x word_j matrix wrapped in an xarray for fast label lookups
    both_in_pool = sem_sim_cml.word_i.isin(wordpool) & (sem_sim_cml.word_j.isin(wordpool))
    pool_sims = sem_sim_cml[both_in_pool]
    sim_matrix = xr.DataArray(pool_sims.pivot(index='word_i', columns='word_j', values='cosine_similarity'))

    def _pair_record(w1, w2):
        # H: binary measure, 1 when either phonetic_sim_H variant is truthy
        # (last argument presumably toggles start-sound vs. rhyme -- confirm in analyses_behavioral)
        shares_start = analyses_behavioral.phonetic_sim_H(w1, w2, pronouncing_cml, False)
        rhymes = analyses_behavioral.phonetic_sim_H(w1, w2, pronouncing_cml, True)
        psim_H = int(bool(shares_start or rhymes))

        # J: value returned by phonetic_sim_J, stored as-is
        psim_J = analyses_behavioral.phonetic_sim_J(w1, w2, pronouncing_cml)

        ssim = float(sim_matrix.loc[w1, w2].values)
        return (w1, w2, psim_H, psim_J, ssim)

    records = [_pair_record(w1, w2) for w1, w2 in tqdm(itertools.combinations(wordpool, 2))]

    return pd.DataFrame(records, columns=['word_i', 'word_j', 'psim_H', 'psim_J', 'ssim'])

#### Intracranial

In [30]:
# unique words found across the selected intracranial sessions, and how many there are
wp_intrac = find_wordpool(df_intrac)
len(wp_intrac)

0it [00:00, ?it/s]

466

In [33]:
# pairwise phonetic (psim_H, psim_J) and semantic (ssim) similarity for the intracranial wordpool
wp_intrac_data = wordpool_psim_ssim(wp_intrac, sem_sim_cml)
wp_intrac_data

0it [00:00, ?it/s]

Unnamed: 0,word_i,word_j,psim_H,psim_J,ssim
0,AISLE,ANT,0,0.000000,0.298682
1,AISLE,APE,0,0.000000,0.178440
2,AISLE,ARCH,0,0.000000,0.294049
3,AISLE,ARK,0,0.000000,0.382428
4,AISLE,ARM,0,0.000000,0.123792
...,...,...,...,...,...
108340,YARD,YOLK,1,0.166667,0.028830
108341,YARD,ZOO,0,0.000000,0.400028
108342,YARN,YOLK,1,0.166667,0.059118
108343,YARN,ZOO,0,0.000000,0.353103


In [36]:
import scipy.stats

In [38]:
# Pearson correlation of each phonetic measure (H, then J) with semantic similarity
tuple(
    scipy.stats.pearsonr(wp_intrac_data[psim_col], wp_intrac_data.ssim)
    for psim_col in ('psim_H', 'psim_J')
)

(PearsonRResult(statistic=0.0215093014803013, pvalue=1.4343180987776833e-12),
 PearsonRResult(statistic=0.025192978886517836, pvalue=1.0978695277173092e-16))

#### Scalp

In [34]:
# unique words found across the selected scalp (ltpFR2) sessions, and how many there are
wp_scalp = find_wordpool(df_scalp)
len(wp_scalp)

0it [00:00, ?it/s]

576

In [35]:
# pairwise phonetic (psim_H, psim_J) and semantic (ssim) similarity for the scalp wordpool
wp_scalp_data = wordpool_psim_ssim(wp_scalp, sem_sim_cml)
wp_scalp_data

0it [00:00, ?it/s]

Unnamed: 0,word_i,word_j,psim_H,psim_J,ssim
0,ACTOR,ACTRESS,1,0.428571,0.672933
1,ACTOR,AGENT,0,0.125000,0.455292
2,ACTOR,AIRPLANE,0,0.000000,0.515949
3,ACTOR,AIRPORT,0,0.125000,0.337698
4,ACTOR,ANKLE,1,0.285714,0.459130
...,...,...,...,...,...
165595,YARN,ZEBRA,0,0.125000,0.497243
165596,YARN,ZIPPER,0,0.000000,0.075563
165597,YOLK,ZEBRA,0,0.000000,0.071246
165598,YOLK,ZIPPER,0,0.000000,0.216152


In [39]:
# Pearson correlation of each phonetic measure (H, then J) with semantic similarity
tuple(
    scipy.stats.pearsonr(wp_scalp_data[psim_col], wp_scalp_data.ssim)
    for psim_col in ('psim_H', 'psim_J')
)

(PearsonRResult(statistic=0.02634976499048617, pvalue=7.806693406927987e-27),
 PearsonRResult(statistic=0.0004620005081179693, pvalue=0.8508727435545649))

#### pyFR re-implants

Update session values for pyFR re-implants so they are unique.

In [5]:
# pyFR rows only, keeping the columns that identify a session
df_select = df.loc[
    df['experiment'] == 'pyFR',
    ['subject', 'experiment', 'session', 'localization', 'montage'],
]

In [6]:
# preview the pyFR session rows
df_select

Unnamed: 0,subject,experiment,session,localization,montage
0,FR240,pyFR,0,0,0
1,FR240,pyFR,2,0,0
2,CP001,pyFR,1,0,0
3,CP001,pyFR,2,0,0
4,CP001,pyFR,3,0,0
...,...,...,...,...,...
373,UP045,pyFR,2,0,0
374,UP045,pyFR,3,0,0
375,UP046,pyFR,0,0,0
376,UP046,pyFR,1,0,0


In [13]:
# match what is in re_implants.csv
# list every subject that has the same session number on more than one row
for subject, sessions in df_select.groupby('subject')['session']:
    if sessions.duplicated().any():
        print(subject)

TJ005
TJ018
TJ038
TJ040
TJ041
TJ064
TJ069
TJ078
UP044


In [9]:
# sessions recorded with a montage other than 0
df_select[df_select['montage'] != 0]

Unnamed: 0,subject,experiment,session,localization,montage
116,TJ005,pyFR,0,0,1
117,TJ005,pyFR,1,0,1
118,TJ005,pyFR,2,0,1
119,TJ005,pyFR,3,0,1
120,TJ005,pyFR,4,0,1
121,TJ005,pyFR,5,0,1
122,TJ005,pyFR,6,0,1
141,TJ018,pyFR,0,0,1
142,TJ018,pyFR,2,0,2
187,TJ035,pyFR,0,0,1


In [10]:
# all pyFR rows for TJ005: session numbers repeat across montage values
df_select[df_select['subject'] == 'TJ005']

Unnamed: 0,subject,experiment,session,localization,montage
116,TJ005,pyFR,0,0,1
117,TJ005,pyFR,1,0,1
118,TJ005,pyFR,2,0,1
119,TJ005,pyFR,3,0,1
120,TJ005,pyFR,4,0,1
121,TJ005,pyFR,5,0,1
122,TJ005,pyFR,6,0,1
123,TJ005,pyFR,0,0,0
124,TJ005,pyFR,1,0,0
125,TJ005,pyFR,2,0,0
