In [1]:
import pandas as pd
import numpy as np

# Suppress every warning for the rest of the session so that library notices
# do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action='ignore')

# Path of the raw CSV export, plus a 100-row sample of it. Only the sample's
# column names are needed for the column lookup performed further down.
csv_file = '../ukb52305.csv'
data = pd.read_csv(filepath_or_buffer=csv_file, nrows=100)

In [2]:
# Field ids related to the pair-matching test:
#   20244 - pair matching completion status
#   20134 - when the pair matching test was completed
# Per-round fields (each holds an array of length 3, one element per round):
#   20129 - number of columns displayed in round
#   20130 - number of rows displayed in round
#   20131 - number of correct matches in round
#   20132 - number of incorrect matches in round
#   20133 - time to complete round
#
# Empty placeholder; the name is bound to the final result later on.
pair_matching = list()

In [3]:
# Pair-matching fields loaded from the export (one column per round, with
# names such as '20131-0.0', '20131-0.1', '20131-0.2'):
#   20131 = number of correct matches in round
#   20132 = number of incorrect matches in round
#   20133 = time to complete round

cols = ['20131', '20132', '20133']

# Select a column only when the text before its first '-' is exactly one of
# the wanted field ids. A plain substring test would also pick up columns of
# an unrelated field whose id merely contains one of these ids (for example
# '120131-0.0'). Columns stay grouped by field, in the order given by `cols`.
to_retrieve = ['eid'] + [
    data_col
    for col in cols
    for data_col in data.columns
    if data_col.split('-', 1)[0] == col
]

print(f"Found {len(to_retrieve)} columns to be retrieved: {to_retrieve}")

# Read the whole file again, this time loading only the selected columns.
pair_matching_df = pd.read_csv(csv_file, usecols=to_retrieve)

Found 10 columns to be retrieved: ['eid', '20131-0.0', '20131-0.1', '20131-0.2', '20132-0.0', '20132-0.1', '20132-0.2', '20133-0.0', '20133-0.1', '20133-0.2']


In [4]:
# Columns added by this cell (one row-wise average per field):
#   mean_20131 = mean correct matches
#   mean_20132 = mean incorrect matches
#   mean_20133 = mean completion time

# Round-level source columns belonging to each field id.
cols_map = {
    '20131': ['20131-0.0', '20131-0.1', '20131-0.2'],
    '20132': ['20132-0.0', '20132-0.1', '20132-0.2'],
    '20133': ['20133-0.0', '20133-0.1', '20133-0.2']
}

# Keep only the rows holding at least 4 non-missing values; every column of
# the frame (including 'eid') counts toward that threshold.
pair_matching_df = pair_matching_df.dropna(thresh=4)

# Average the rounds of each field per row. Missing rounds are skipped by
# DataFrame.mean, so a row is averaged over the rounds it actually has.
round_means = {
    f'mean_{field_id}': pair_matching_df[cols_map[field_id]].mean(axis=1)
    for field_id in cols
}
pair_matching_df = pair_matching_df.assign(**round_means)

# Outlier Data

In [5]:
# Show the rows whose mean completion time equals -1.
pair_matching_df.loc[pair_matching_df['mean_20133'].eq(-1.0)]

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
132,1001326,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
272,1002722,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
303,1003033,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
480,1004803,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
685,1006856,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501470,6015536,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
501799,6018827,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502121,6022048,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502125,6022089,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0


In [6]:
# Remove the rows whose mean completion time equals -1, then display what is
# left.
# NOTE(review): only the mean is checked, so a row with -1 in some rounds but
# not all of them would be kept - confirm whether such rows occur.
keep_mask = pair_matching_df['mean_20133'].ne(-1.0)
pair_matching_df = pair_matching_df.loc[keep_mask]
pair_matching_df

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
9,1000085,3.0,6.0,,0.0,2.0,,19017.0,26659.0,,4.500000,1.000000,22838.0
15,1000158,3.0,6.0,,1.0,9.0,,18517.0,52529.0,,4.500000,5.000000,35523.0
19,1000193,3.0,6.0,8.0,0.0,1.0,11.0,9078.0,32000.0,97603.0,5.666667,4.000000,46227.0
25,1000251,3.0,6.0,,1.0,7.0,,6838.0,22344.0,,4.500000,4.000000,14591.0
27,1000279,3.0,6.0,8.0,1.0,1.0,8.0,11990.0,15646.0,38343.0,5.666667,3.333333,21993.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
502385,6024683,3.0,6.0,,0.0,5.0,,7113.0,35157.0,,4.500000,2.500000,21135.0
502393,6024769,3.0,6.0,,0.0,10.0,,7126.0,59251.0,,4.500000,5.000000,33188.5
502395,6024781,3.0,6.0,,0.0,3.0,,6147.0,22757.0,,4.500000,1.500000,14452.0
502399,6024822,3.0,6.0,,0.0,6.0,,19373.0,27589.0,,4.500000,3.000000,23481.0


In [7]:
# Reduce the frame to the participant id and the three per-field means.
cols_of_interest = [
    'eid',
    'mean_20131',
    'mean_20132',
    'mean_20133',
]
pair_matching = pair_matching_df.loc[:, cols_of_interest]

In [8]:
# Bare expression: renders the selected columns as this cell's output.
pair_matching

Unnamed: 0,eid,mean_20131,mean_20132,mean_20133
9,1000085,4.500000,1.000000,22838.0
15,1000158,4.500000,5.000000,35523.0
19,1000193,5.666667,4.000000,46227.0
25,1000251,4.500000,4.000000,14591.0
27,1000279,5.666667,3.333333,21993.0
...,...,...,...,...
502385,6024683,4.500000,2.500000,21135.0
502393,6024769,4.500000,5.000000,33188.5
502395,6024781,4.500000,1.500000,14452.0
502399,6024822,4.500000,3.000000,23481.0


In [9]:
# Write the result to disk; index=False leaves the row index out of the file.
pair_matching.to_csv(path_or_buf='../data/pair_matching.csv', index=False)