In [1]:
import os

import numpy as np
import pandas as pd

# Define File Paths

In [2]:
# Root of the shared project folder. The default is the original hard-coded
# location; set the CPS_GAZE_DIR environment variable to run the notebook
# against a copy of the data stored somewhere else.
PROJECT_DIR = os.environ.get(
    "CPS_GAZE_DIR",
    "/Users/angelina/Dropbox (Emotive Computing)/CPS_Gaze_Fixations",
).rstrip("/") + "/"

# Every path below is a plain string, and directories end in "/", because later
# cells build file names by string concatenation (e.g. TIMESERIES_DIR + <name>).
RQA_DIR = PROJECT_DIR + "RQA/"
MASTERFILE = RQA_DIR + "CPS2_Gaze_All_Participants_Masterfile_TimeSeries.csv"
ALL_PP = PROJECT_DIR + "utils/All_PP.csv"
RAW_OUTPUTDIR = RQA_DIR + "MdRQA_Time_Series/RAW/"
FIX_OUTPUTDIR = RQA_DIR + "MdRQA_Time_Series/FIXATIONS/"
TIMESERIES_DIR = RQA_DIR + "Participant_Block_225ms_Time_Series/"

# Import Masterfile

In [3]:
# Read in the All_PP file which is our source of truth for the complete teams and their PIDs
# NOTE(review): this comment used to describe complete teams as "2 participants", while the
#               MdRQA cell further down expects teams of size 3 - confirm which is right.
dfAll_PP = pd.read_csv(ALL_PP)

# Read in masterfile which contains one row per participant + block and filepaths to each file we need
dfMaster = pd.read_csv(MASTERFILE)

# Participant + blocks that are forced out of the analysis regardless of what the masterfile says
EXCLUDED_PARTICIPANT_BLOCKS = [
    "CPS2-CU-T60-PB-Transfer",
    "CPS2-CU-T29-PC-Transfer",
    "CPS2-CU-T27-PA-Transfer",
]
is_excluded = dfMaster['ParticipantBlock'].isin(EXCLUDED_PARTICIPANT_BLOCKS)
dfMaster.loc[is_excluded, 'Analyze'] = "no"

# Keep only rows flagged for analysis; chaining reset_index builds a fresh frame
# instead of modifying a filtered selection in place
dfMasterSelected = dfMaster.loc[dfMaster['Analyze'] == 'yes'].reset_index(drop=True)

# Informational
print("Number of unique team + blocks in All_PP: ", len(dfAll_PP.groupby(['GROUPID', 'block'])))
print("Number of unique team + blocks in Masterfile: ", len(dfMasterSelected['GROUPID_block'].unique()))
print("Number of participant + blocks in Masterfile: ", len(dfMasterSelected))


Number of unique team + blocks in All_PP:  274
Number of unique team + blocks in Masterfile:  582
Number of participant + blocks in Masterfile:  1516


  dfAll_PP = pd.read_csv(ALL_PP)


In [5]:
# One row per (team, block, participant) found in All_PP.
# GROUPID represents a unique team because it is specific to a school.
pid_key_cols = ['GROUPID', 'block', 'participant_id']
all_pp_team_block_pids = (
    dfAll_PP
    .groupby(pid_key_cols, as_index=False)
    .size()[pid_key_cols]
    .rename(columns={"block": "Block", "participant_id": "pid"})
)

# Inner-merge with the selected masterfile rows to filter teams we should not process
## NOTE: Team 1094 gets filtered from All_PP file because there is no gaze data (result is 271 team + blocks)
merge_keys = ['GROUPID', 'Block', 'pid']
team_blocks = all_pp_team_block_pids.merge(dfMasterSelected[merge_keys], on=merge_keys, how='inner')

In [8]:
# Print the number of distinct team + block groups, then the number of distinct teams
for key_cols in (['GROUPID', 'Block'], ['GROUPID']):
    print(len(team_blocks.groupby(key_cols)))

271
93


# Generate MdRQA Team Time Series

In [11]:
# Set to True to write the per-team MdRQA csv files. When False the cell only
# builds each team's time series in memory and reports the NaN statistics.
SAVE_MDRQA_FILES = False


def load_grid_loc_columns(filepaths):
    """Load one participant column per time-series file.

    Parameters
    ----------
    filepaths : iterable of str
        File names relative to TIMESERIES_DIR, one per participant.

    Returns
    -------
    list of pandas.Series
        The 'grid_loc' column of each file, renamed 'p1', 'p2', ... in input order.
    """
    return [
        pd.read_csv(TIMESERIES_DIR + filepath)['grid_loc'].rename('p' + str(position + 1))
        for position, filepath in enumerate(filepaths)
    ]


# Group by team, block, and participant (GROUPID represents a unique team because it is specific to a school).
# This is recomputed here so the cell can be re-run on its own: team_blocks gains columns further down.
all_pp_team_block_pids = dfAll_PP.groupby(['GROUPID', 'block', 'participant_id'],
                                          as_index=False).size()[['GROUPID', 'block', 'participant_id']]
all_pp_team_block_pids = all_pp_team_block_pids.rename(columns={"block": "Block", "participant_id": "pid"})

# Merge Master w/ All_PP to filter teams we should not process
## NOTE: Team 1094 gets filtered from All_PP file because there is no gaze data (result is 271 team + blocks)
team_blocks = pd.merge(all_pp_team_block_pids, dfMasterSelected[['GROUPID', 'Block', 'pid']],
                       on=['GROUPID', 'Block', 'pid'], how='inner')

# Get number of participants in each team
## NOTE: Before filtering, in the Masterfile we have teams of sizes 1, 2, 3, and 4.
##       Now, teams should only be size 3
team_sizes = team_blocks.groupby(['GROUPID', 'Block']).size().reset_index(name='num_participants')

# Add on number of participants
team_blocks = pd.merge(team_blocks, team_sizes, on=['GROUPID', 'Block'], how='inner')

# Add on time series file paths
team_blocks = pd.merge(team_blocks,
                       dfMasterSelected[['GROUPID', 'Block', 'pid',
                                         '225ms-time-series-RAW', '225ms-time-series-FIXATIONS']],
                       on=['GROUPID', 'Block', 'pid'], how='inner')

# Group by filtered team and block (GROUPID represents a unique team because it is specific to a school)
grp_team_block = team_blocks.groupby(['GROUPID', 'Block'])

num_nan = 0
num_total = 0
# Iterate through each team and block to create their MdRQA time series
team_block_cnt = 0
for (GROUPID, block), team_rows in grp_team_block:
    team_block_cnt += 1
    print("Processing Team %d: %s %s" % (team_block_cnt, str(GROUPID), block))

    # The group already holds this team + block's rows, in the same order as team_blocks,
    # so the RAW and FIXATIONS columns line up participant by participant
    raw_columns = load_grid_loc_columns(team_rows['225ms-time-series-RAW'])
    fix_columns = load_grid_loc_columns(team_rows['225ms-time-series-FIXATIONS'])

    # NaN statistics cover the RAW series only and are taken per participant, before
    # concatenation, so any NaN padding concat adds for unequal lengths is not counted
    num_nan += sum(column.isna().sum() for column in raw_columns)
    num_total += sum(len(column) for column in raw_columns)

    # One column per participant, built with a single concat per team
    MdRQA_raw = pd.concat(raw_columns, axis=1)
    MdRQA_fix = pd.concat(fix_columns, axis=1)

    # Save MdRQA DataFrames as csv files (only when explicitly enabled)
    filename = str(GROUPID) + "-" + block + "_"
    if SAVE_MDRQA_FILES:
        MdRQA_raw.to_csv(RAW_OUTPUTDIR + filename + "RAW.csv", index=False)
        MdRQA_fix.to_csv(FIX_OUTPUTDIR + filename + "FIXATIONS.csv", index=False)
        print("\tMdRQA files generated.")
    else:
        print("\tMdRQA time series built (file saving disabled).")


print("All files generated." if SAVE_MDRQA_FILES else "All teams processed (file saving disabled).")
print("Num NaN: ", num_nan)
print("Num total time steps: ", num_total)

Processing Team 1: 1010 ExpBlock1
	MdRQA files generated.
Processing Team 2: 1010 ExpBlock2
	MdRQA files generated.
Processing Team 3: 1020 ExpBlock1
	MdRQA files generated.
Processing Team 4: 1020 ExpBlock2
	MdRQA files generated.
Processing Team 5: 1020 Warmup
	MdRQA files generated.
Processing Team 6: 1024 ExpBlock1
	MdRQA files generated.
Processing Team 7: 1024 ExpBlock2
	MdRQA files generated.
Processing Team 8: 1024 Warmup
	MdRQA files generated.
Processing Team 9: 1025 ExpBlock1
	MdRQA files generated.
Processing Team 10: 1025 ExpBlock2
	MdRQA files generated.
Processing Team 11: 1025 Warmup
	MdRQA files generated.
Processing Team 12: 1032 ExpBlock1
	MdRQA files generated.
Processing Team 13: 1032 ExpBlock2
	MdRQA files generated.
Processing Team 14: 1032 Warmup
	MdRQA files generated.
Processing Team 15: 1034 ExpBlock1
	MdRQA files generated.
Processing Team 16: 1034 ExpBlock2
	MdRQA files generated.
Processing Team 17: 1034 Warmup
	MdRQA files generated.
Processing Team 18: 1

	MdRQA files generated.
Processing Team 142: 1096 Warmup
	MdRQA files generated.
Processing Team 143: 1097 ExpBlock1
	MdRQA files generated.
Processing Team 144: 1097 ExpBlock2
	MdRQA files generated.
Processing Team 145: 1097 Warmup
	MdRQA files generated.
Processing Team 146: 1098 ExpBlock1
	MdRQA files generated.
Processing Team 147: 1098 ExpBlock2
	MdRQA files generated.
Processing Team 148: 1098 Warmup
	MdRQA files generated.
Processing Team 149: 1099 ExpBlock1
	MdRQA files generated.
Processing Team 150: 1099 ExpBlock2
	MdRQA files generated.
Processing Team 151: 1099 Warmup
	MdRQA files generated.
Processing Team 152: 2014 ExpBlock1
	MdRQA files generated.
Processing Team 153: 2014 ExpBlock2
	MdRQA files generated.
Processing Team 154: 2014 Warmup
	MdRQA files generated.
Processing Team 155: 2015 ExpBlock1
	MdRQA files generated.
Processing Team 156: 2015 ExpBlock2
	MdRQA files generated.
Processing Team 157: 2015 Warmup
	MdRQA files generated.
Processing Team 158: 2016 ExpBlock

## File stats to consider

In [4]:
# Sizes of every team + block among the selected masterfile rows
# (GROUPID represents a unique team because it is specific to a school)
grp_team_block = dfMasterSelected.groupby(['GROUPID', 'Block'])
team_sizes = grp_team_block.size().to_frame('counts').reset_index()

# Number of team + blocks observed for each team size
size_distribution = team_sizes.groupby('counts').size().to_frame('num_teams').reset_index()

print("Group stats:\n")

display(team_sizes, size_distribution)

# Optional export, left disabled (OUTPUTDIR is not defined in the cells shown here):
# team_sizes.to_csv(OUTPUTDIR + "team_sizes.csv", index=False)


Group stats:



Unnamed: 0,GROUPID,Block,counts
0,1010,ExpBlock1,3
1,1010,ExpBlock2,3
2,1010,Transfer,3
3,1010,Warmup,3
4,1011,ExpBlock1,1
...,...,...,...
577,10104,Warmup,3
578,10105,ExpBlock1,2
579,10105,ExpBlock2,2
580,10105,Transfer,2


Unnamed: 0,counts,num_teams
0,1,64
1,2,155
2,3,310
3,4,53


In [None]:
# NOTE(review): this cell has no execution count in the saved notebook and selects teams
#               straight from the masterfile (no All_PP filtering) - confirm which selection
#               is meant to populate RAW_OUTPUTDIR / FIX_OUTPUTDIR before running it.

# Smallest team (number of participants in a block) that gets an MdRQA file
MIN_TEAM_SIZE = 3

# Group by team and block (GROUPID represents a unique team because it is specific to a school)
grp_team_block = dfMasterSelected.groupby(['GROUPID', 'Block'])

# Get number of participants in each team
## NOTE: We have teams of sizes 1, 2, 3, and 4
team_sizes = grp_team_block.size().reset_index(name='num_participants')

# Map each team with the team size and its participants
# (kept for inspection; the loop below does not read it)
cols = ['GROUPID', 'Block', 'pid', '225ms-time-series-RAW', '225ms-time-series-FIXATIONS']
teams_to_participants = pd.merge(team_sizes, dfMasterSelected[cols], on=['GROUPID', 'Block'])

# Iterate through each team to create their MdRQA time series
team_cnt = 0
for (GROUPID, block), team_rows in grp_team_block:
    team_cnt += 1
    print("Processing Team %d: %s %s" % (team_cnt, str(GROUPID), block))

    # The group's row count is its number of participants (the value team_sizes holds);
    # teams below the minimum are skipped without writing anything
    if len(team_rows) < MIN_TEAM_SIZE:
        continue

    # Read every participant's 'grid_loc' series and place them side by side as p1, p2, ...
    # The group rows keep dfMasterSelected's order, so RAW and FIXATIONS columns match up.
    MdRQA_raw = pd.concat(
        [pd.read_csv(TIMESERIES_DIR + filepath)['grid_loc'].rename('p' + str(i + 1))
         for i, filepath in enumerate(team_rows['225ms-time-series-RAW'])],
        axis=1,
    )
    MdRQA_fix = pd.concat(
        [pd.read_csv(TIMESERIES_DIR + filepath)['grid_loc'].rename('p' + str(j + 1))
         for j, filepath in enumerate(team_rows['225ms-time-series-FIXATIONS'])],
        axis=1,
    )

    # Save MdRQA DataFrames as csv files
    filename = str(GROUPID) + "-" + block + "_"
    MdRQA_raw.to_csv(RAW_OUTPUTDIR + filename + "RAW.csv", index=False)
    MdRQA_fix.to_csv(FIX_OUTPUTDIR + filename + "FIXATIONS.csv", index=False)
    print("\tMdRQA files generated.")

print("All files generated.")

