#### Creating data for sequential choice models
The series of models I plan to use in my thesis project, inspired by habitat-selection functions from movement ecology, use environmental covariates of available resources (e.g., distance from current location, turning angle from previous bearing angle, size/point value, etc.) to predict whether a resource will be chosen next. By specifying a series of models containing different parameters, I will fit different cognitive heuristic strategies to human sequential choice data and determine parameters of significance to the foraging strategies of participants. 

To start, I must take existing foraging data in the form of logs of collected objects during play and expand it out to include the covariates of interest of all available objects. This data needs to also reflect the removal and reintroduction of collected objects during the course of play. 

In [1]:
# import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
# read in simulation results
simul_results = pd.read_csv(
    '../../data/simulation/runs/pure_strats/simul_weighted_forages_10_10_25.csv',
    index_col=0
)
# clean up
# reset index
simul_results.index = [*range(len(simul_results))]

# read in obj locs
all_lvls_coco_locs = pd.read_csv(
    '../../data/level_arrangements/all_levels_arrangements.csv'
)

##### Adding covariates and all available objects
The two covariates of interest needed to be added to this expanded data set are distance from current location and turning angle 

In [5]:
# distance matrices store distances between coconuts
def create_distance_matrices(obj_locs=pd.DataFrame):

    # create list for levels
    dist_matrix_list = []

    # for each level, create matrix of all distances
    for level in range(0, 10):
        # filter for this level
        level_locs = obj_locs[obj_locs.level == level+1]

        # reset index for consistent indexing
        level_locs.index = [*range(0, len(level_locs))]

        # create empty matrix
        level_m = np.zeros((len(level_locs), len(level_locs)), dtype=float)

        # fill this level's matrix
        for i in range(0, len(level_locs)):
            for j in range(0, len(level_locs)):
                # check if same location
                if i == j:
                    level_m[i, j] = 0  # set to zero, save computation
                else:
                    # calc distance and store in matrix
                    level_m[i, j] = np.sqrt(
                        np.pow(level_locs.x[i]-level_locs.x[j], 2) +
                        np.pow(level_locs.y[i]-level_locs.y[j], 2)
                    )

        # add to list
        dist_matrix_list.append(level_m)

    return dist_matrix_list

In [6]:
d = create_distance_matrices(all_lvls_coco_locs)

In [21]:
# calculate turning angles
def calculate_turning_angles(
    previous_head_angle=float, curr_coco_id=int, lvl_coco_locs=pd.DataFrame
):

    # empty list for angles
    ta_list = []

    # loop through df and calculate each ta
    for coco_id in lvl_coco_locs.obj_ID:
        if coco_id == curr_coco_id:
            # add NA for curr obj
            ta_list.append(pd.NA)
        else:
            ta = np.atan2(
                lvl_coco_locs.iloc[coco_id-1].y -
                lvl_coco_locs.iloc[curr_coco_id-1].y,
                lvl_coco_locs.iloc[coco_id-1].x -
                lvl_coco_locs.iloc[curr_coco_id-1].x
            ) - previous_head_angle

            ta_list.append(ta)

    return ta_list

In [None]:
ta = calculate_turning_angles(
    0, 1, all_lvls_coco_locs[all_lvls_coco_locs.level == 1])

len(ta)

68

In [32]:
# neighbors distance
def neighborhood_value(
    curr_coco_id=int, lvl_dist_matrix=list, num_neighbors=int, avail_indices=list
):
    # filter the df for currently available obj and their distances from focal obj
    active_obj_dists = lvl_dist_matrix[curr_coco_id-1][avail_indices]

    return np.sum(np.reciprocal(sorted(active_obj_dists)[1:num_neighbors+1]))

In [37]:
act_obj_dists = d[0][2]

print(np.sum(np.reciprocal(sorted(act_obj_dists)[1:3])))

0.482842712474619


In [None]:
# combine together to create function to expand df
def expand_df_with_covariates(
    orig_df=pd.DataFrame,
    all_coco_locs_df=pd.DataFrame
):

    # create all distance matrices
    lvl_dist_matrices = create_distance_matrices(all_coco_locs_df)

    # loop through df and expand
    for i in tqdm(range(0, len(orig_df)-1)):

        # skip last collection in level/subject to avoid predicting collection on
        # following level or subject
        if (orig_df.iloc[i].level != orig_df.iloc[i+1].level) | (
                orig_df.iloc[i].forager != orig_df.iloc[i+1].forager):

            # complete df and write to file
            expanded_df.to_csv(
                '../../data/simulation/expansion_chunks/exp_for' +
                str(orig_df.iloc[i].forager) +
                '_lvl_'+str(orig_df.iloc[i].level)+'.csv',
                index=False
            )

            # remove from memory
            del expanded_df

            # do not add data and continue on to next level or subject
            continue

        # set level location distances and arrangement or reset on new level
        if (i == 0) | (orig_df.iloc[i].level != orig_df.iloc[i-1].level):

            # reset collection_num
            collect_num = 0

            # use level list to determine locations to consider
            curr_matrix = lvl_dist_matrices[int(orig_df.iloc[i].level-1)]

            # and level arrangement from the locations df
            curr_level_locs = all_coco_locs_df[
                all_coco_locs_df.level == int(orig_df.iloc[i].level)
            ]

            # set available obj list
            avail_obj_indices = [i for i in range(len(curr_level_locs))]

            # index for location df is same
            curr_level_locs.index = avail_obj_indices

            # set heading angle from 0,0 to current location
            heading_angle = np.atan2(orig_df.iloc[i].y, orig_df.iloc[i].x)

            # grab obj_ID
            curr_obj_ID = int(orig_df.iloc[i].obj_ID)

            ## Covariate calculation ###################################

            # get distances for available objects
            dist_from_curr = curr_matrix[curr_obj_ID-1][avail_obj_indices]

            # neighborhood influenced distance values of available objects
            nhood_val_from_curr = [
                neighborhood_value(obj_ID, curr_matrix, 3, avail_obj_indices) for obj_ID in avail_obj_indices
            ]

            # turning angles from current object
            tas_from_curr = calculate_turning_angles(
                heading_angle, curr_obj_ID, curr_level_locs
            )

            ## Collection ##############################################
            collect_df = pd.DataFrame({
                'obj_ID': curr_obj_ID,
                'time_to_respawn': orig_df.iloc[i].time+5
            }, index=[0])

            ## Used/Unused criterion ###################################

            # set length of expansion
            rep_len = len(avail_obj_indices)

            # create used/unused criterion list
            used_list = [0]*rep_len

            # set indexed value to one
            avail_obj_used_index = avail_obj_indices.index(
                int(orig_df.iloc[i+1].obj_ID-1)
            )

            used_list[avail_obj_used_index] = 1

            ## Add expanded row ########################################

            # create expanded df
            expanded_df = pd.DataFrame({
                'strategy': [orig_df.iloc[i].strategy]*rep_len,
                'forager': [orig_df.iloc[i].forager]*rep_len,
                'level': [orig_df.iloc[i].level]*rep_len,
                'collection_num': [collect_num]*rep_len,
                'obj_ID': [i+1 for i in avail_obj_indices],
                'point_value': [orig_df.iloc[i].point_value]*rep_len,
                'distance': dist_from_curr,
                'turning_angle': tas_from_curr,
                'neighbor_value': nhood_val_from_curr,
                'used': used_list,
                'time': [orig_df.iloc[i].time]*rep_len
            })

        else:

            collect_num += 1

            # grab previously calculated turning angle from df
            # heading_angle = expanded_df[(expanded_df.collection_num == (
            #     collect_num-1)) & (expanded_df.used == 1)].turning_angle.iloc[0]

            heading_angle = np.atan2(
                orig_df.iloc[i].y-orig_df.iloc[i-1].y,
                orig_df.iloc[i].x-orig_df.iloc[i-1].x
            )

            # grab obj_ID
            curr_obj_ID = int(orig_df.iloc[i].obj_ID)

            ## Respawning ##############################################
            if len(collect_df) > 0:

                # see if any obj need to respawn
                obj_to_respawn = collect_df[
                    collect_df.time_to_respawn < orig_df.iloc[i+1].time
                ].obj_ID

                # filter out if there is an obj to respawn
                if len(obj_to_respawn) > 0:
                    collect_df = collect_df[
                        collect_df.time_to_respawn > orig_df.iloc[i+1].time
                    ]

                # reset available indices
                avail_obj_indices = [
                    x for x in [i for i in range(len(curr_matrix))] if x not in collect_df.obj_ID.values-1
                ]

            ## Covariate calculation ###################################

            # get distances for available objects
            dist_from_curr = curr_matrix[curr_obj_ID-1][avail_obj_indices]

            # neighborhood influenced distance values of available objects
            nhood_val_from_curr = [
                neighborhood_value(obj_ID, curr_matrix, 2, avail_obj_indices) for obj_ID in avail_obj_indices
            ]

            # turning angles from current object
            tas_from_curr = calculate_turning_angles(
                heading_angle, curr_obj_ID, curr_level_locs
            )

            # filter for only available obj
            tas_from_curr = [
                tas_from_curr[i] for i in avail_obj_indices
            ]

            ## Collection ##############################################

            # add new row
            collect_df.loc[len(collect_df)] = [
                curr_obj_ID, orig_df.iloc[i].time+5]

            ## Used/Unused criterion ###################################

            # set length of expansion
            rep_len = len(avail_obj_indices)

            used_list = [0]*rep_len

            # set indexed value to one
            avail_obj_used_index = avail_obj_indices.index(
                int(orig_df.iloc[i+1].obj_ID-1))
            used_list[avail_obj_used_index] = 1

            ## Add expanded row ########################################
            row_expansion = pd.DataFrame({
                'strategy': [orig_df.iloc[i].strategy]*rep_len,
                'forager': [orig_df.iloc[i].forager]*rep_len,
                'level': [orig_df.iloc[i].level]*rep_len,
                'collection_num': [collect_num]*rep_len,
                'obj_ID': [i+1 for i in avail_obj_indices],
                'point_value': [orig_df.iloc[i].point_value]*rep_len,
                'distance': dist_from_curr,
                'turning_angle': tas_from_curr,
                'neighbor_value': nhood_val_from_curr,
                'used': used_list,
                'time': [orig_df.iloc[i].time]*rep_len
            })

            # add to the new expanded df
            expanded_df = pd.concat(
                [expanded_df, row_expansion], ignore_index=True
            )

    # finish and return
    return expanded_df

In [12]:
# grab 3 random runs from each strat
nn_runs = simul_results[
    (simul_results.strategy == 'nn') & (
        simul_results.forager.isin(np.random.randint(0, 100, 1)))
]
ta_runs = simul_results[
    (simul_results.strategy == 'ta') & (
        simul_results.forager.isin(np.random.randint(0, 100, 3)))
]
clst_runs = simul_results[
    (simul_results.strategy == 'clst') & (
        simul_results.forager.isin(np.random.randint(0, 100, 3)))
]

# concat
all_runs = pd.concat(
    [nn_runs, ta_runs, clst_runs]
)

# reset index
all_runs.index = [i for i in range(0, len(all_runs))]

In [14]:
all_runs = all_runs[all_runs.strategy == 'nn']

In [15]:
all_runs

Unnamed: 0,nn_weight,ta_weight,clst_weight,pv_weight,obj_ID,x,y,point_value,time,dist,points,strategy,forager,level
0,1.665741,0.0,0.0,1.0,48,-10.0,10.0,2.0,1.229751,14.142136,2.0,nn,11,1
1,1.665741,0.0,0.0,1.0,46,-12.5,12.5,2.0,1.537189,3.535534,4.0,nn,11,1
2,1.665741,0.0,0.0,1.0,49,-15.0,10.0,2.0,1.844626,3.535534,6.0,nn,11,1
3,1.665741,0.0,0.0,1.0,51,-15.0,6.5,2.0,2.148974,3.500000,8.0,nn,11,1
4,1.665741,0.0,0.0,1.0,50,-20.0,10.0,2.0,2.679694,6.103278,10.0,nn,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3269,1.665741,0.0,0.0,1.0,47,12.5,-12.5,2.0,126.774169,3.535534,692.0,nn,11,10
3270,1.665741,0.0,0.0,1.0,44,15.0,-10.0,2.0,127.081607,3.535534,694.0,nn,11,10
3271,1.665741,0.0,0.0,1.0,45,18.5,-10.0,2.0,127.385954,3.500000,696.0,nn,11,10
3272,1.665741,0.0,0.0,1.0,38,15.0,-5.0,2.0,127.916674,6.103278,698.0,nn,11,10


In [27]:
expanded_df = expand_df_with_covariates(
    orig_df=all_runs, all_coco_locs_df=all_lvls_coco_locs
)

100%|██████████| 3273/3273 [00:20<00:00, 159.36it/s]


In [48]:
expanded_df

Unnamed: 0,strategy,forager,level,collection_num,obj_ID,point_value,distance,turning_angle,neighbor_value,used,time
0,nn,17,1,0,1,2.0,33.871079,2.208034,0.285714,0,1.229751
1,nn,17,1,0,2,2.0,30.413813,2.191046,0.285714,0,1.229751
2,nn,17,1,0,3,2.0,30.000000,2.356194,0.285714,0,1.229751
3,nn,17,1,0,4,2.0,31.622777,2.034444,0.282843,0,1.229751
4,nn,17,1,0,5,2.0,27.613403,2.265535,0.282843,0,1.229751
...,...,...,...,...,...,...,...,...,...,...,...
37214,nn,17,3,3,64,4.0,46.097722,-7.930754,0.282843,0,2.152064
37215,nn,17,3,3,65,4.0,38.890873,-7.853982,0.282843,0,2.152064
37216,nn,17,3,3,66,4.0,42.573466,-7.937123,0.282843,0,2.152064
37217,nn,17,3,3,67,4.0,46.502688,-8.006631,0.285714,0,2.152064


In [44]:
expanded_df[expanded_df.used == 1].obj_ID

28       29
91       25
156      24
217      20
283      22
         ..
36950    66
36964    12
37028     9
37091     6
37154     4
Name: obj_ID, Length: 654, dtype: object

In [45]:
expanded_df.groupby(['level', 'collection_num']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,strategy,forager,obj_ID,point_value,distance,turning_angle,neighbor_value,used,time
level,collection_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,68,68,68,68,68,67,68,68,68
1,1,67,67,67,67,67,66,67,67,67
1,2,66,66,66,66,66,65,66,66,66
1,3,65,65,65,65,65,64,65,65,65
1,4,64,64,64,64,64,63,64,64,64
...,...,...,...,...,...,...,...,...,...,...
2,300,57,57,57,57,57,56,57,57,57
3,0,68,68,68,68,68,67,68,68,68
3,1,67,67,67,67,67,66,67,67,67
3,2,66,66,66,66,66,65,66,66,66


In [None]:
expanded_df.to_csv(
    '../../data/simulation/expanded_simul_w_cov_10_16_25.csv', index=False)