In [4]:
import pandas as pd
import numpy as np

# Suppress every Python warning for the remainder of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Path to the source CSV, relative to the notebook's working directory.
csv_file = '../ukb52305.csv'
# Read only the first 100 rows; a later cell uses this sample's column
# headers to decide which columns to load from the full file.
data = pd.read_csv(filepath_or_buffer=csv_file, nrows=100)

In [5]:
# Field IDs related to the pair-matching test:
#   20244 - pair matching completion status
#   20134 - when the pair matching test was completed
#   20129 - number of columns displayed in round (array of length 3, one element per round)
#   20130 - number of rows displayed in round (array of length 3)
#   20131 - number of correct matches in round (array of length 3)
#   20132 - number of incorrect matches in round (array of length 3)
#   20133 - time to complete round (array of length 3)
# Empty placeholder list; the name is rebound to a DataFrame in a later cell.
pair_matching = list()

In [6]:
# 20131 = number of correct matches in round
# 20132 = number of incorrect matches in round
# 20133 = time to complete round
#
# Column headers have the form '<field>-<instance>.<array>' (e.g. '20131-0.0'),
# so the field ID is compared exactly against the text before the first '-'.
# A plain substring test would also select unrelated fields whose header merely
# contains these digits (for example a header such as '120131-0.0').

cols = ['20131', '20132', '20133']
to_retrieve = ['eid']
for col in cols:
    to_retrieve.extend(
        data_col for data_col in data.columns
        if data_col.split('-', 1)[0] == col
    )

print(f"Found {len(to_retrieve)} columns to be retrieved: {to_retrieve}")

# Second pass over the full file, loading only the selected columns.
pair_matching_df = pd.read_csv(csv_file, usecols=to_retrieve)

Found 10 columns to be retrieved: ['eid', '20131-0.0', '20131-0.1', '20131-0.2', '20132-0.0', '20132-0.1', '20132-0.2', '20133-0.0', '20133-0.1', '20133-0.2']


In [7]:
# Per-row averages across the round columns of each field:
#   mean_20131 = mean correct matches
#   mean_20132 = mean incorrect matches
#   mean_20133 = mean completion time

# Round columns belonging to each field ID.
cols_map = {
    field_id: [f'{field_id}-0.{round_idx}' for round_idx in range(3)]
    for field_id in ('20131', '20132', '20133')
}

# Keep rows with at least 4 non-NA values across all columns (the eid column
# is included in that count), then append one row-wise mean per field.
# DataFrame.mean skips NaN by default, so rounds without a value are ignored.
pair_matching_df = pair_matching_df.dropna(thresh=4).assign(**{
    f'mean_{col}': (lambda frame, names=cols_map[col]: frame[names].mean(axis=1))
    for col in cols
})

# Outlier Data

In [8]:
# Rows whose mean completion time is exactly -1.
pair_matching_df[pair_matching_df['mean_20133'] == -1.0]

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
132,1001326,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
272,1002722,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
303,1003033,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
480,1004803,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
685,1006856,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501470,6015536,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
501799,6018827,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502121,6022048,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502125,6022089,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0


In [9]:
# Keep the rows with a -1 mean completion time under their own name so the
# following cells can inspect their match counts.
missing_completion_time = pair_matching_df[pair_matching_df['mean_20133'] == -1.0]
missing_completion_time

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
132,1001326,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
272,1002722,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
303,1003033,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
480,1004803,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
685,1006856,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501470,6015536,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
501799,6018827,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502121,6022048,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0
502125,6022089,0.0,0.0,,0.0,0.0,,-1.0,-1.0,,0.0,0.0,-1.0


In [10]:
# Distribution of mean correct matches among rows with a -1 mean completion time.
missing_completion_time.mean_20131.value_counts()

0.0    2942
4.5      39
1.5      10
2.5       2
0.5       1
Name: mean_20131, dtype: int64

In [11]:
# Distribution of mean incorrect matches among rows with a -1 mean completion time.
missing_completion_time.mean_20132.value_counts()

0.0     2942
5.0        5
6.0        5
8.5        4
7.5        3
6.5        3
4.0        3
9.5        3
7.0        2
8.0        2
16.5       2
22.5       2
0.5        2
1.5        2
11.0       2
3.0        1
10.5       1
11.5       1
14.0       1
12.0       1
4.5        1
23.5       1
5.5        1
2.5        1
15.0       1
24.5       1
25.0       1
Name: mean_20132, dtype: int64

In [12]:
# Drop every row that has a -1 completion time in ANY round, not only the rows
# whose mean is exactly -1. A duration cannot be negative, so -1 looks like a
# "not recorded" marker (TODO: confirm against the field's data coding). A row
# mixing -1 with genuine times would otherwise survive the filter and keep a
# mean_20133 that is silently biased low.
time_cols = [name for name in pair_matching_df.columns if name.startswith('20133-')]
has_sentinel_time = pair_matching_df[time_cols].eq(-1.0).any(axis=1)
pair_matching_df = pair_matching_df[~has_sentinel_time]
pair_matching_df

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
9,1000085,3.0,6.0,,0.0,2.0,,19017.0,26659.0,,4.500000,1.000000,22838.0
15,1000158,3.0,6.0,,1.0,9.0,,18517.0,52529.0,,4.500000,5.000000,35523.0
19,1000193,3.0,6.0,8.0,0.0,1.0,11.0,9078.0,32000.0,97603.0,5.666667,4.000000,46227.0
25,1000251,3.0,6.0,,1.0,7.0,,6838.0,22344.0,,4.500000,4.000000,14591.0
27,1000279,3.0,6.0,8.0,1.0,1.0,8.0,11990.0,15646.0,38343.0,5.666667,3.333333,21993.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
502385,6024683,3.0,6.0,,0.0,5.0,,7113.0,35157.0,,4.500000,2.500000,21135.0
502393,6024769,3.0,6.0,,0.0,10.0,,7126.0,59251.0,,4.500000,5.000000,33188.5
502395,6024781,3.0,6.0,,0.0,3.0,,6147.0,22757.0,,4.500000,1.500000,14452.0
502399,6024822,3.0,6.0,,0.0,6.0,,19373.0,27589.0,,4.500000,3.000000,23481.0


In [13]:
# The 20 rows with the largest mean completion time.
pair_matching_df.sort_values(by='mean_20133', ascending=False).iloc[:20]

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
438545,5386189,3.0,6.0,,0.0,9.0,,90246.0,99722.0,,4.5,4.5,94984.0
487826,5879067,3.0,6.0,,5.0,15.0,,98906.0,90693.0,,4.5,10.0,94799.5
383320,4833827,3.0,6.0,,12.0,14.0,,98871.0,90162.0,,4.5,13.0,94516.5
185629,2856528,3.0,6.0,,4.0,6.0,,94405.0,89674.0,,4.5,5.0,92039.5
387105,4871679,3.0,6.0,,3.0,12.0,,96658.0,86909.0,,4.5,7.5,91783.5
229490,3295196,3.0,6.0,,8.0,7.0,,90039.0,92173.0,,4.5,7.5,91106.0
315740,4157902,3.0,6.0,,5.0,8.0,,85411.0,96034.0,,4.5,6.5,90722.5
446450,5465240,3.0,6.0,,2.0,6.0,,95425.0,85793.0,,4.5,4.0,90609.0
31050,1310533,3.0,6.0,,3.0,15.0,,83590.0,97203.0,,4.5,9.0,90396.5
438818,5388911,3.0,6.0,,3.0,3.0,,80241.0,99807.0,,4.5,3.0,90024.0


In [14]:
# Reduce the frame to the participant ID plus the three per-field means.
cols_of_interest = ['eid'] + [f'mean_{field_id}' for field_id in ('20131', '20132', '20133')]
pair_matching = pair_matching_df.loc[:, cols_of_interest]

In [15]:
# Display the reduced table (eid plus the three mean columns).
pair_matching

Unnamed: 0,eid,mean_20131,mean_20132,mean_20133
9,1000085,4.500000,1.000000,22838.0
15,1000158,4.500000,5.000000,35523.0
19,1000193,5.666667,4.000000,46227.0
25,1000251,4.500000,4.000000,14591.0
27,1000279,5.666667,3.333333,21993.0
...,...,...,...,...
502385,6024683,4.500000,2.500000,21135.0
502393,6024769,4.500000,5.000000,33188.5
502395,6024781,4.500000,1.500000,14452.0
502399,6024822,4.500000,3.000000,23481.0


In [16]:
# Export of the reduced table is currently disabled; uncomment the line below
# to write it to CSV without the index column.
# pair_matching.to_csv('../data/pair_matching.csv', index=False)

In [17]:
# Rows that have a value recorded for the third round of field 20131.
pair_matching_df[pair_matching_df['20131-0.2'].notna()]

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
19,1000193,3.0,6.0,8.0,0.0,1.0,11.0,9078.0,32000.0,97603.0,5.666667,4.000000,46227.000000
27,1000279,3.0,6.0,8.0,1.0,1.0,8.0,11990.0,15646.0,38343.0,5.666667,3.333333,21993.000000
50,1000509,3.0,6.0,8.0,0.0,1.0,7.0,4297.0,11477.0,33312.0,5.666667,2.666667,16362.000000
69,1000696,3.0,6.0,8.0,0.0,1.0,9.0,24261.0,24324.0,66614.0,5.666667,3.333333,38399.666667
89,1000898,3.0,6.0,8.0,2.0,1.0,7.0,29452.0,25194.0,54608.0,5.666667,3.333333,36418.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
502304,6023871,3.0,6.0,8.0,1.0,1.0,9.0,7683.0,16153.0,35849.0,5.666667,3.666667,19895.000000
502305,6023882,3.0,6.0,8.0,1.0,1.0,10.0,13748.0,20054.0,57744.0,5.666667,4.000000,30515.333333
502308,6023910,3.0,6.0,8.0,0.0,1.0,12.0,4057.0,21635.0,60896.0,5.666667,4.333333,28862.666667
502338,6024211,3.0,6.0,8.0,2.0,1.0,8.0,12479.0,27096.0,48581.0,5.666667,3.666667,29385.333333


In [21]:
# Spot-check two specific participants by eid.
pair_matching_df[pair_matching_df['eid'].isin([1000193, 1000158])]

Unnamed: 0,eid,20131-0.0,20131-0.1,20131-0.2,20132-0.0,20132-0.1,20132-0.2,20133-0.0,20133-0.1,20133-0.2,mean_20131,mean_20132,mean_20133
15,1000158,3.0,6.0,,1.0,9.0,,18517.0,52529.0,,4.5,5.0,35523.0
19,1000193,3.0,6.0,8.0,0.0,1.0,11.0,9078.0,32000.0,97603.0,5.666667,4.0,46227.0
