In [1]:
import pandas as pd
from collections import defaultdict
from itertools import combinations
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

## Define core functions

In [2]:
# Thanks to https://stackoverflow.com/a/312464
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]


def get_answers(entry, prompt_labels=None):
    """
    Just get the plain answers without anything else.

    Parameters
    ----------
    entry : mapping
        One survey response, indexable by prompt label.
    prompt_labels : iterable, optional
        Labels to look up in ``entry``. When omitted, the module-level
        ``prompts`` list is used, so existing ``get_answers(entry)`` calls
        behave exactly as before. Passing the labels explicitly removes the
        dependency on a global that is only created once the data is loaded.

    Returns
    -------
    list
        The truthy answers, in the order of the labels. Falsy values (for
        example empty strings) are skipped.

    Raises
    ------
    KeyError
        If a label is missing from ``entry``.
    """
    if prompt_labels is None:
        # Fall back to the notebook-level list of prompt columns.
        prompt_labels = prompts
    return [entry[label] for label in prompt_labels if entry[label]]


def get_answer_dict(entry, prompt_labels=None):
    """
    Get the answered prompts with answers.

    Parameters
    ----------
    entry : mapping
        One survey response, indexable by prompt label.
    prompt_labels : iterable, optional
        Labels to look up in ``entry``. When omitted, the module-level
        ``prompts`` list is used, so existing ``get_answer_dict(entry)`` calls
        behave exactly as before. Passing the labels explicitly removes the
        dependency on a global that is only created once the data is loaded.

    Returns
    -------
    dict
        Prompt label -> answer, for every label whose answer is truthy.
        Falsy values (for example empty strings) are left out.

    Raises
    ------
    KeyError
        If a label is missing from ``entry``.
    """
    if prompt_labels is None:
        # Fall back to the notebook-level list of prompt columns.
        prompt_labels = prompts
    return {label: entry[label] for label in prompt_labels if entry[label]}


# Numeric coding of the five-point agreement scale, from 0 (strongest
# disagreement) up to 4 (strongest agreement).
likert_map = {
    label: score
    for score, label in enumerate([
        'Strongly disagree',
        'Somewhat disagree',
        'Neither agree nor disagree',
        'Somewhat agree',
        'Strongly agree',
    ])
}


def map_to_numbers(values):
    """
    Translate each response label in ``values`` into its numeric code.

    Labels are looked up in ``likert_map``; an unknown label raises KeyError.
    """
    numbers = []
    for value in values:
        numbers.append(likert_map[value])
    return numbers


def answers_to_numbers(d):
    """
    Turn a dict of answers into a list of numeric scores.

    Values are taken in sorted key order, so dicts that share the same keys
    yield lists whose positions line up.
    """
    ordered_answers = []
    for key in sorted(d):
        ordered_answers.append(d[key])
    return map_to_numbers(ordered_answers)
               

def average_lists(list_of_lists):
    """
    Compute the position-wise mean across several lists.

    Positions are paired with ``zip``, so the result is as long as the
    shortest input list (and empty when no lists are given).
    """
    averages = []
    for column in zip(*list_of_lists):
        averages.append(sum(column) / len(column))
    return averages

## Load the data

Load the data and keep only the responses from participants who gave informed consent and answered all 40 questions.

In [3]:
import warnings

# Suppress warning from openpyxl that the workbook contains no default style.
# An explicit "ignore" filter states the intent directly, instead of using
# record=True to capture the warnings and then throw the list away.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df = pd.read_excel("Qualtrics-round2.xlsx", na_filter=False)

# Drop rows whose status marks them as survey previews.
df = df[df['Status'] != "Survey Preview"]

# Number of prompts per block; a complete response answers exactly this many.
QUESTIONS_PER_BLOCK = 40

# Prompt columns are the ones whose label consists only of digits.
prompts = [label for label in df.columns if label.isdigit()]
# Map each block's key (its sorted prompt labels, space-joined) to a 1-based
# block number, taking the prompt columns in consecutive chunks.
blocks = {' '.join(sorted(chunk)): i
          for i, chunk in enumerate(chunks(prompts, QUESTIONS_PER_BLOCK), start=1)}

with_consent = df[df['Q1002']=='Yes']
responses = with_consent.to_dict('records')
# Keep only the participants who answered a full block.
responses = [r for r in responses if len(get_answers(r)) == QUESTIONS_PER_BLOCK]

print("This should be 80 (our total number of participants):", len(responses))

This should be 80 (our total number of participants): 80


## Group participants

Group participants by the questions they answered. Recall that each participant only answered 40 questions, out of 1000. In the next step we want to compare participants who answered the same questions.

In [4]:
# Bucket (participant id, answers) pairs under a key made of the sorted,
# space-joined prompt labels that the participant answered.
grouped = defaultdict(list)
for response in responses:
    answers = get_answer_dict(response)
    key = ' '.join(sorted(answers))
    record = (response['PROLIFIC_PID'], answers)
    grouped[key].append(record)

## Select updated blocks

First look at the updated blocks to see if correlations improved. We collected:
* Two more responses for block 5
* One more response for block 17
* Two more responses for block 23

In [5]:
# Show the prompt labels of each block that received extra responses.
{number: prompt_ids for prompt_ids, number in blocks.items() if number in (5, 17, 23)}

{5: '11 113 154 168 197 199 23 233 242 25 276 289 299 336 362 393 417 444 481 543 570 594 628 655 67 673 697 698 709 726 730 734 762 77 811 814 89 913 916 988',
 17: '136 138 160 170 173 218 240 274 288 333 335 346 347 348 376 41 411 439 545 547 560 576 626 66 740 748 778 783 809 86 890 898 908 922 940 959 968 970 98 986',
 23: '110 167 177 194 248 290 310 35 375 383 402 413 424 449 491 511 512 517 519 520 561 601 603 679 680 686 690 719 728 755 761 794 797 816 849 866 873 88 903 952'}

## Compare participants

We again use a leave-one-out strategy to compare each participant with the average scores of the other participants who answered the same questions.

In [6]:
def leave_one_out_correlations(grouped, blocks, selected_blocks):
    """
    Correlate each participant with the mean answers of the rest of their group.

    Parameters
    ----------
    grouped : dict
        Maps a block key (sorted prompt labels, space-joined) to a list of
        ``(participant_id, answers)`` tuples, where ``answers`` maps prompt
        label to response label.
    blocks : dict
        Maps the same block keys to block numbers.
    selected_blocks : collection
        Block numbers to analyse; groups belonging to other blocks are skipped.

    Returns
    -------
    defaultdict
        Block number -> list of ``(correlation, participant_id)`` tuples, one
        per participant in the selected blocks.

    Raises
    ------
    KeyError
        If a key of ``grouped`` is not present in ``blocks``.
    """
    corr_per_block = defaultdict(list)

    for key, group in grouped.items():
        # All members of a group answered the same prompts, so the block is
        # looked up once per group rather than once per participant.
        block = blocks[key]

        # only run code for selected blocks:
        if block not in selected_blocks:
            continue

        # Convert every participant's answers to numbers a single time.
        scores = [(participant_id, answers_to_numbers(answers))
                  for participant_id, answers in group]

        # This for-loop looks at each participant in turn.
        for i, (selected_id, selected_values) in enumerate(scores):
            # Others are the complement of the selected participant.
            others = [values for j, (_, values) in enumerate(scores) if j != i]
            avg_others = average_lists(others)

            # Compute Spearman correlation between the selected participant
            # and the others; the p-value is not used.
            corr, _pvalue = spearmanr(selected_values, avg_others)

            # Store results
            corr_per_block[block].append((corr, selected_id))

    return corr_per_block


corr_per_block = leave_one_out_correlations(grouped, blocks, {5, 17, 23})

for block, results in corr_per_block.items():
    print(f"Results for block {block}:")
    for corr, participant in sorted(results):
        print(f"Score for {participant} is: {corr}")

Results for block 17:
Score for 5e9d59e26627630009d1f013 is: 0.348955064468428
Score for 62a8375ef5d3cb6960e4ed2e is: 0.6017170744243113
Score for 5f5a292573b827085d758463 is: 0.6150521180597369
Score for 5e8e0592039e71067ee874a5 is: 0.7066257113774909
Results for block 23:
Score for 60d2295bcb7135a034690583 is: -0.11146199789072132
Score for 5b5eda691900510001e970b3 is: 0.3151651477365368
Score for 60eec4cb6baff7af36bc4c45 is: 0.49385179175410276
Score for 62926f5a375553f13d68bc92 is: 0.5112887731612005
Score for 6100af50f41049317edf4099 is: 0.6398311136865183
Results for block 5:
Score for 614e31ad5927c89cb387915d is: 0.13529577644610655
Score for 5f60a29da41c6f05389423b5 is: 0.6272686329531223
Score for 5c4b987538878c0001c7883b is: 0.6745976324154203
Score for 5f8bb4f68cb05a2f5c15130a is: 0.6764845576375023
Score for 5c982494dd325800146f870e is: 0.7512766297970671


**Observation:** two participants really stand out:
* `60d2295bcb7135a034690583` with a negative correlation.
* `614e31ad5927c89cb387915d` with a very low correlation of 0.135.

What are their answers like?

In [7]:
# Participants whose leave-one-out correlation falls below this value are
# inspected more closely.
OUTLIER_THRESHOLD = 0.15

outliers = set()
for block, results in corr_per_block.items():
    for corr, participant in results:
        # spearmanr returns NaN when one of its inputs is constant (for example
        # a participant who gave the same answer to every prompt). NaN compares
        # False against any threshold, so it is checked for explicitly;
        # otherwise such a participant would never be flagged.
        if np.isnan(corr) or corr < OUTLIER_THRESHOLD:
            outliers.add(participant)

# Show how each flagged participant's answers are distributed over the scale.
for response in responses:
    identifier = response['PROLIFIC_PID']
    if identifier in outliers:
        answers = get_answer_dict(response)
        print(f"Answers for participant {identifier}:")
        print(Counter(answers.values()))
        print()

Answers for participant 614e31ad5927c89cb387915d:
Counter({'Neither agree nor disagree': 14, 'Somewhat disagree': 13, 'Strongly disagree': 7, 'Somewhat agree': 4, 'Strongly agree': 2})

Answers for participant 60d2295bcb7135a034690583:
Counter({'Neither agree nor disagree': 39, 'Somewhat agree': 1})



In [8]:
# Let's define outliers again, so that we select the three best responses:
N_BEST = 3

outliers = []
for block, results in corr_per_block.items():
    # Rank by correlation, highest first (ties broken by participant id).
    # A NaN correlation (constant answers) compares False against everything,
    # which makes a plain tuple sort unreliable, so NaN is ranked below every
    # real correlation and can never end up among the best responses.
    sorted_scores = sorted(
        results,
        key=lambda result: (-np.inf if np.isnan(result[0]) else result[0], result[1]),
        reverse=True,
    )
    leftover = [participant for corr, participant in sorted_scores[N_BEST:]]
    outliers.extend(leftover)

print("Excluded outliers:")
for outlier in outliers:
    print(outlier)

# Keep everyone who was not excluded and write the selection to disk.
responses_for_analysis = [r for r in responses if r["PROLIFIC_PID"] not in outliers]
df = pd.DataFrame(responses_for_analysis)
df.to_excel("Selected_for_analysis.xlsx", index=False)

Excluded outliers:
5e9d59e26627630009d1f013
5b5eda691900510001e970b3
60d2295bcb7135a034690583
5f60a29da41c6f05389423b5
614e31ad5927c89cb387915d


## Comparing, again

Let's compare the participants again, without the outliers.

In [9]:
# Rebuild the groups, this time leaving out every excluded participant.
grouped = defaultdict(list)
for response in responses:
    identifier = response['PROLIFIC_PID']
    answers = get_answer_dict(response)
    key = ' '.join(sorted(answers))
    if identifier in outliers:
        continue
    grouped[key].append((identifier, answers))

In [10]:
corr_per_block = defaultdict(list)

for group in grouped.values():
    # Visit every participant of the group in turn.
    for position, (selected_id, selected_answers) in enumerate(group):
        block_key = ' '.join(sorted(selected_answers.keys()))
        block = blocks[block_key]

        # Only the selected blocks are analysed; skip everything else.
        if block not in {5, 17, 23}:
            continue

        selected_values = answers_to_numbers(selected_answers)

        # Leave-one-out: everyone in the group except the selected participant.
        others = [
            answers_to_numbers(other_answers)
            for other_position, (_, other_answers) in enumerate(group)
            if other_position != position
        ]
        avg_others = average_lists(others)

        # Spearman correlation between the selected participant and the
        # average of the others.
        corr, pvalue = spearmanr(selected_values, avg_others)

        # Keep the score together with the participant it belongs to.
        corr_per_block[block].append((corr, selected_id))

# Report the scores per block, lowest correlation first.
for block in corr_per_block:
    print(f"Results for block {block}:")
    for corr, participant in sorted(corr_per_block[block]):
        print(f"Score for {participant} is: {corr}")

Results for block 23:
Score for 62926f5a375553f13d68bc92 is: 0.4827382302572014
Score for 60eec4cb6baff7af36bc4c45 is: 0.5647026851639074
Score for 6100af50f41049317edf4099 is: 0.607219512214692
Results for block 5:
Score for 5c4b987538878c0001c7883b is: 0.6872742703094431
Score for 5c982494dd325800146f870e is: 0.6939449149164243
Score for 5f8bb4f68cb05a2f5c15130a is: 0.7447242954260107
Results for block 17:
Score for 5f5a292573b827085d758463 is: 0.6557561485739504
Score for 5e8e0592039e71067ee874a5 is: 0.6686957367267742
Score for 62a8375ef5d3cb6960e4ed2e is: 0.6849337909794981
