In [7]:
import pandas as pd
import numpy as np

In [8]:
# Select which dataset variant this run produces; the value controls the
# session-exclusion criteria and the name of the exported CSV file.
data_set = 0 # 0 = General, 1 = Recall

In [9]:
# Load the raw trial table and discard the columns that are not used afterwards.
unused_columns = [
    'Unnamed: 0',
    'test_list',
    'study_list',
    'item',
    'itemno',
    'finalrecalled',
]
data = pd.read_csv('raw_data.csv').drop(columns=unused_columns)

In [10]:
# Recode the trial type into 'old' / 'new' labels, then expose the column
# under the name `ground_truth`.
type_labels = {'RECOG_TARGET': 'old', 'RECOG_LURE': 'new'}
data['type'] = data['type'].replace(type_labels)
data = data.rename(columns={'type': 'ground_truth'})

In [11]:
# Keep only trials that have a confidence rating. Series.notna() is used
# instead of np.isnan() because np.isnan() raises TypeError when the column
# is not a float dtype (e.g. object dtype), which the explicit float cast
# below suggests is possible.
data = data[data['recog_conf'].notna()]
# Cast via float first so values such as 5.0 / "5.0" become the integer 5.
data['recog_conf'] = data['recog_conf'].astype('float').astype('int')
data['recog_rt'] = data['recog_rt'].astype('float')

In [12]:
# Keep confidence ratings strictly between 0 and 6 ...
valid_conf = data['recog_conf'].gt(0) & data['recog_conf'].lt(6)
data = data.loc[valid_conf]
# ... and then response times within the closed interval [300, 3000].
valid_rt = data['recog_rt'].between(300, 3000)
data = data.loc[valid_rt]

In [13]:
# Build a per-session key by appending the session number to the participant id
# (e.g. participant 'LTP268', session 1 -> 'LTP2681').
session_as_text = data['session'].astype(str)
data['participant_session'] = data['participant'] + session_as_text

In [14]:
# Preview the first ten rows of the cleaned table.
data.head(10)

Unnamed: 0,participant,session,ground_truth,recog_resp,recog_conf,recog_rt,recalled,participant_session
0,LTP268,1,old,old,5,1395.0,yes,LTP2681
1,LTP268,1,old,new,2,1788.0,no,LTP2681
2,LTP268,1,old,new,3,2308.0,no,LTP2681
3,LTP268,1,old,old,5,1320.0,yes,LTP2681
4,LTP268,1,old,old,5,1025.0,yes,LTP2681
5,LTP268,1,new,new,4,1108.0,lure,LTP2681
6,LTP268,1,old,old,5,1207.0,no,LTP2681
7,LTP268,1,new,new,5,1125.0,lure,LTP2681
8,LTP268,1,old,new,3,1635.0,yes,LTP2681
9,LTP268,1,old,new,2,2430.0,yes,LTP2681


In [15]:
from ipywidgets import FloatProgress
from IPython.display import display


def _sessions_with_both_responses(trials):
    """Return the set of participant_session labels in `trials` that contain
    at least one 'old' and at least one 'new' recognition response."""
    answered_old = set(trials.loc[trials['recog_resp'] == 'old', 'participant_session'])
    answered_new = set(trials.loc[trials['recog_resp'] == 'new', 'participant_session'])
    return answered_old & answered_new


# A session is kept only if, within EVERY trial subset listed below, the
# participant gave both 'old' and 'new' responses at least once.
# For both datasets: old items and new items.
required_subsets = [
    data[data['ground_truth'] == 'old'],
    data[data['ground_truth'] == 'new'],
]
if data_set == 1:
    # Only for recall dataset: recalled and not-recalled items as well.
    required_subsets += [
        data[data['recalled'] == 'yes'],
        data[data['recalled'] == 'no'],
    ]

# Computed with a few passes over the whole frame, instead of re-filtering
# the full frame several times for each individual session.
valid_sessions = set.intersection(
    *(_sessions_with_both_responses(subset) for subset in required_subsets)
)

# The progress bar and counter are kept so the cell's visible output and the
# names it defines (f, i, sessions_to_delete) stay the same as before.
f = FloatProgress(min=0, max=data['participant_session'].nunique())
display(f)
i = 0
sessions_to_delete = list()
# unique() yields sessions in order of first appearance, so the resulting
# list has the same order as the original per-session loop produced.
for session in data['participant_session'].unique():
    i += 1
    f.value = i
    if session not in valid_sessions:
        sessions_to_delete.append(session)

In [16]:
# Total number of distinct participant/session keys before any exclusion.
print(data['participant_session'].nunique(dropna=False))

3241


In [17]:
# Number of sessions flagged for removal by the screening step.
n_sessions_to_delete = len(sessions_to_delete)
print(n_sessions_to_delete)

121


In [18]:
# Row count before the flagged sessions are removed
data.shape[0]

883262

In [19]:
# Drop every trial that belongs to a flagged session. A single isin() mask
# filters the frame once, instead of re-filtering (and copying) the whole
# frame once per flagged session.
data = data[~data['participant_session'].isin(sessions_to_delete)]

In [20]:
# Row count after the flagged sessions have been removed
data.shape[0]

853624

In [21]:
# Show the first remaining row as a quick sanity check of the columns.
data.head(1)

Unnamed: 0,participant,session,ground_truth,recog_resp,recog_conf,recog_rt,recalled,participant_session
0,LTP268,1,old,old,5,1395.0,yes,LTP2681


In [22]:
# Optional checkpoint (not required for the final export): uncomment to save the data with unwanted sessions removed
#data.to_csv("unwanted_sessions_removed.csv", sep=',')

In [23]:
# These two columns were only needed for session screening; remove them
# before labelling and export.
data = data.drop(columns=['session', 'recalled'])

In [24]:
# Confirm the remaining columns by showing the first row.
data.head(1)

Unnamed: 0,participant,ground_truth,recog_resp,recog_conf,recog_rt,participant_session
0,LTP268,old,old,5,1395.0,LTP2681


In [25]:
# Label each trial by comparing the true item status with the response.
# Rules are checked in order; any trial matching none of them gets 'fn'.
outcome_rules = {
    'tp': (data["ground_truth"] == "old") & (data["recog_resp"] == "old"),
    'tn': (data["ground_truth"] == "new") & (data["recog_resp"] == "new"),
    'fp': (data["ground_truth"] == "new") & (data["recog_resp"] == "old"),
}

data['hmm'] = np.select(
    list(outcome_rules.values()),
    list(outcome_rules.keys()),
    default='fn',
)

In [26]:
# Inspect the first twenty rows, including the new outcome label column.
data.head(20)

Unnamed: 0,participant,ground_truth,recog_resp,recog_conf,recog_rt,participant_session,hmm
0,LTP268,old,old,5,1395.0,LTP2681,tp
1,LTP268,old,new,2,1788.0,LTP2681,fn
2,LTP268,old,new,3,2308.0,LTP2681,fn
3,LTP268,old,old,5,1320.0,LTP2681,tp
4,LTP268,old,old,5,1025.0,LTP2681,tp
5,LTP268,new,new,4,1108.0,LTP2681,tn
6,LTP268,old,old,5,1207.0,LTP2681,tp
7,LTP268,new,new,5,1125.0,LTP2681,tn
8,LTP268,old,new,3,1635.0,LTP2681,fn
9,LTP268,old,new,2,2430.0,LTP2681,fn


In [27]:
# Choose the output file from the dataset type (0 = general, anything else =
# recall), then write the table once without the index column.
output_file = "final_general_data.csv" if data_set == 0 else "final_recall_data.csv"
data.to_csv(output_file, sep=',', index=False)

In [28]:
# Final cell: printing this message signals that every cell above has run.
print("All done! :)")

All done! :)
