In [1]:
import os
import json
import random

# Every cell gets its own deterministic random number generator, so that
# re-running a single cell always reproduces the same random results.
seed = 123


def get_rng(task_index, cell_index):
    """Return a `random.Random` seeded for one (task, cell) combination.

    The generator's seed is the notebook-wide `seed` plus `task_index`
    plus 1000 times `cell_index`.
    """
    cell_offset = 1000 * cell_index
    return random.Random(seed + task_index + cell_offset)

# Overall study design

In [2]:
rng = get_rng(-1, 0)

# Only a subset of the tasks fits into a 60-minute session
tasks = [
    'data-analysis',
    'natural-language-processing',
    # 'physics-simulation',
    # 'web-programming',
]
# The tasks are split evenly over two stages, so there must be an even number
assert len(tasks) % 2 == 0
rng.shuffle(tasks)

# First half of the shuffled tasks goes to stage A, the rest to stage B
half = len(tasks) // 2
stage_tasks = {"A": tasks[:half], "B": tasks[half:]}

# Which model each group gets in stage A and stage B respectively
model_order = {
    "control-first": ("control", "tom"),
    "tom-first": ("tom", "control"),
}

design = {
    "stages": {
        stage: {"tasks": assigned} for stage, assigned in stage_tasks.items()
    },
    "groups": {
        group: {"models": {"A": model_a, "B": model_b}}
        for group, (model_a, model_b) in model_order.items()
    },
    "example": "string-anagram",
}
with open("design.json", "w") as file:
    json.dump(design, file, indent=4)

# Task questions

In [3]:
def randomise_multiple_choice(answer, distractors, rng):
    """Build a multiple-choice question part with shuffled options.

    The answer is appended to the list of distractors, the combined list is
    shuffled in place with `rng`, and the options are labelled 'a', 'b', ...
    in their shuffled order.

    Returns a dict with the keys "options" (label -> option text), "type"
    (always "multiple_choice") and "answer" (the label of the correct option).
    """
    shuffled = distractors + [answer]
    rng.shuffle(shuffled)

    labels = [chr(ord('a') + position) for position in range(len(shuffled))]
    correct_label = labels[shuffled.index(answer)]

    return {
        "options": dict(zip(labels, shuffled)),
        "type": "multiple_choice",
        "answer": correct_label
    }

# Candidate question types for the "program" dimension, keyed by
# "<dimension>-<level>".
_program_question_types = {
    "program-atom": [
        "Determine the value of an expression for given values of the involved variables",
        "Trace a sequence of expressions for given values of the involved variables",
    ],
    "program-block/relational": [
        "Determine the number of iterations of a loop construct for a given initial state",
        "Identify the block implementing some specific program pattern",
        "Identify the variable playing a specific role",
    ],
    "program-macro": [
        "Select from given options the program that is computationally equivalent to a reference one",
        "Indicate whether two given programs are computationally equivalent",
    ],
}

# Candidate question types for the "function" dimension.
_function_question_types = {
    "function-atom": [
        "Identify the purpose of an expression in connection with the problem domain",
        "Identify the expression with a given function, described in problem-domain terms",
    ],
    "function-block/relational": [
        "Identify the block with a given function, described in problem-domain terms",
        "Identify the purpose of a block, in connection with the problem domain",
    ],
    "function-macro": [
        "Choose an appropriate name for a program",
        "Select the sentence, from a few options, which most accurately summarises the program’s purpose",
    ],
}

# Program categories first, then function categories: the per-task cells
# iterate over this dict while drawing from a seeded RNG, so order matters.
question_type_choices = {**_program_question_types, **_function_question_types}

## String anagram (example)

In [4]:
# Questions for the example task. They are written by hand (no RNG is
# involved here), so the option order below is the order that is saved.
questions = [
    {
        "dimension": "program",
        "level": "macro",
        "question": "Does the following code snippet return the same result as lines 15-19 of the original program?",
        "snippet": '''for i in range(len(second_str)):
    count[first_str[i]] -= 1
    count[second_str[i]] += 1

return all(_count == 0 for _count in count.values())''',
        "type": "yes_no",
        "answer": True
    },
    {
        "dimension": "function",
        "level": "macro",
        "question": "Which sentence best describes the purpose of the program?",
        "options": {
            "a": "Make sure two strings are palindromes.",
            "b": "See if one string contains the other.",
            "c": "Check whether two strings are anagrams.",
            "d": "Count which string contains more vowels.",
        },
        "type": "multiple_choice",
        "answer": "c",
    }
]

# Create the output directory on demand: without this, running the notebook
# in a clean working directory fails with FileNotFoundError on open().
os.makedirs("questions", exist_ok=True)
with open(os.path.join("questions", "string-anagram.json"), "w") as file:
    json.dump(questions, file, indent=4)

## Data analysis

In [5]:
# Task index 0, cell index 0: draw one question type per category.
rng = get_rng(0, 0)
# zip() pulls one key and one drawn value at a time, so rng.choice is called
# once per category in the dict's insertion order.
question_types = dict(zip(
    question_type_choices.keys(),
    map(rng.choice, question_type_choices.values()),
))
question_types

{'program-atom': 'Determine the value of an expression for given values of the involved variables',
 'program-block/relational': 'Identify the block implementing some specific program pattern',
 'program-macro': 'Select from given options the program that is computationally equivalent to a reference one',
 'function-atom': 'Identify the expression with a given function, described in problem-domain terms',
 'function-block/relational': 'Identify the purpose of a block, in connection with the problem domain',
 'function-macro': 'Choose an appropriate name for a program'}

In [6]:
# Task index 0, cell index 1: draw the code location each question targets.
rng = get_rng(0, 1)

# Candidate parameters per question category. There is no "function-macro"
# entry, so that category ends up with empty parameters further down.
parameter_choices = {
    "program-atom": [
        {"lines": (17, 17)},
        {"lines": (21, 24)},
        {"lines": (36, 37)},
    ],
    "program-block/relational": [
        {
            "lines": [(44, 44), (45, 45), (44, 45), (44, 47), (45, 47)],
            "pattern": "traversal",
        },
        {
            "lines": [(19, 19), (21, 34), (26, 34)],
            "pattern": "filter input",
        },
        {
            "lines": [(17, 17), (37, 37)],
            "pattern": "table lookup based conversion",
        },
    ],
    "program-macro": [
        {"lines": (19, 19)},
        {"lines": (21, 24)},
        {"lines": (26, 34)},
        {"lines": (43, 47)},
    ],
    "function-atom": [
        {"line": 37},
        {"line": 47},
        {"line": 51},
    ],
    "function-block/relational": [
        {"lines": (17, 17)},
        {"lines": (21, 34)},
        {"lines": (52, 54)},
    ],
}

# One draw per category, in the insertion order of parameter_choices.
parameters = dict(
    (category, rng.choice(candidates))
    for category, candidates in parameter_choices.items()
)

# Show the chosen question type next to its drawn parameters.
{
    category: {
        "question_type": chosen_type,
        "parameters": parameters.get(category, {}),
    }
    for category, chosen_type in question_types.items()
}

{'program-atom': {'question_type': 'Determine the value of an expression for given values of the involved variables',
  'parameters': {'lines': (21, 24)}},
 'program-block/relational': {'question_type': 'Identify the block implementing some specific program pattern',
  'parameters': {'lines': [(19, 19), (21, 34), (26, 34)],
   'pattern': 'filter input'}},
 'program-macro': {'question_type': 'Select from given options the program that is computationally equivalent to a reference one',
  'parameters': {'lines': (19, 19)}},
 'function-atom': {'question_type': 'Identify the expression with a given function, described in problem-domain terms',
  'parameters': {'line': 51}},
 'function-block/relational': {'question_type': 'Identify the purpose of a block, in connection with the problem domain',
  'parameters': {'lines': (17, 17)}},
 'function-macro': {'question_type': 'Choose an appropriate name for a program',
  'parameters': {}}}

In [7]:
# Questions for the data-analysis task (task index 0, cell index 2).
# NOTE: the three randomise_multiple_choice(...) calls below all shuffle with
# the same `rng`, so their order in this list determines the resulting option
# order of every multiple-choice question. Do not reorder them casually.
rng = get_rng(0, 2)
questions = [
    # program / atom: fill in the resulting `iqr` table.
    {
        "dimension": "program",
        "level": "atom",
        "question": "Given the following values for `_median`, `_quantile75`, and `_quantile25`, indicate the value `iqr` (line 24) will take on.",
        "description": {
            "_median": {
                "table": [
                    ["vehicleType", ""],
                    ["bus", 3],
                    ["cabrio", 1],
                ]
            },
            "_quantile75": {
                "table": [
                    ["vehicleType", ""],
                    ["bus", 6],
                    ["cabrio", 4],
                ]
            },
            "_quantile25": {
                "table": [
                    ["vehicleType", ""],
                    ["bus", 2],
                    ["cabrio", 1],
                ]
            },
            # presumably "a".."d" are placeholders for the four cells the
            # participant fills in — TODO confirm against the front end
            "iqr": {
                "table": [
                    ["vehicleType", ""],
                    ["a", "b"],
                    ["c", "d"],
                ]
            },
        },
        "type": "text",
        "multiple": True,
        # One list of accepted spellings per expected entry (numbers are also
        # accepted as strings, with either "." or "," as decimal separator).
        # NOTE(review): 9 and 5.5 equal median + 1.5 * (q75 - q25) for the
        # tables above; the analysed program is not visible here — confirm.
        "answer": [
            ["bus"],
            [9, "9", "9.0", "9,0"],
            ["cabrio"],
            [5.5, "5.5", "5,5"],
        ]
    },
    # program / block: every [start, end] pair listed is an accepted answer.
    {
        "dimension": "program",
        "level": "block/relational",
        "question": "Indicate which block of code is responsible for filtering data. A block may consist of one or more sequential lines. Give the starting and ending line numbers of this block. Note that there may be multiple correct answers.",
        "type": "block",
        "answer": [
            [19, 19],
            [19, 34],
            [21, 34],
            [25, 34],
            [26, 34],
            [19, 35],
            [21, 35],
            [25, 35],
            [26, 35],
        ]
    },
    # program / macro: first argument is the correct snippet, the list holds
    # the distractors; the helper shuffles them and supplies "options",
    # "type" and "answer".
    {
        "dimension": "program",
        "level": "macro",
        "question": "Which of the following code snippets is equivalent to (e.g., gives the same result as) line 19 of the program?",
        **randomise_multiple_choice(
            '''df = df[df["yearOfRegistration"] >= 1890]
df = df[df["yearOfRegistration"] <= 2016]''',
            [
                '''df = df[(df["yearOfRegistration"] >= 2016) & (df["yearOfRegistration"] <= 1890)]''',
                '''if (df["yearOfRegistration"] >= 1890) & (df["yearOfRegistration"] <= 2016):
    df = df''',
                '''df = df[(df["yearOfRegistration"] >= 1890) | (df["yearOfRegistration"] <= 2016)]''',
            ],
            rng
        )
    },
    # function / atom: the answer is a list of accepted line numbers.
    {
        "dimension": "function",
        "level": "atom",
        "question": "Indicate which line of code performs the following function: 'Restructure the table so that columns represent vehicle types, rows represent brands, and the values represent average prices'. Give the line number of this line.",
        "type": "line",
        "answer": [
            51
        ],
    },
    # function / block: multiple choice, correct answer passed first.
    {
        "dimension": "function",
        "level": "block/relational",
        "question": "Indicate the purpose of the code on line 17.",
        **randomise_multiple_choice(
            "Translate the offer type column from German to English.",
            [
                "Remove all values from the offer type column that are not either 'Gesuch', 'Request', 'Angebot', or 'Offer'.",
                "Remove duplicate values from the offer type column.",
                "Create a new column to indicate if the offer is in German or English.",
            ],
            rng
        )
    },
    # function / macro: multiple choice, correct answer passed first.
    {
        "dimension": "function",
        "level": "macro",
        "question": "What is the most appropriate name for the entire program?",
        **randomise_multiple_choice(
            "vehicle_price_analysis",
            [
                "price_trend_plot",
                "car_brand_visualization",
                "vehicle_data_cleanup"
            ],
            rng
        )
    }
]
# Shuffle the question order with this cell's seeded RNG (after it has been
# used for the multiple-choice options), then save the questions as JSON.
rng.shuffle(questions)
# Create the output directory on demand: without this, running the notebook
# in a clean working directory fails with FileNotFoundError on open().
os.makedirs("questions", exist_ok=True)
with open(os.path.join("questions", "data-analysis.json"), "w") as file:
    json.dump(questions, file, indent=4)

## Natural language processing

In [8]:
# Task index 1, cell index 0: draw one question type per category.
rng = get_rng(1, 0)
# zip() pulls one key and one drawn value at a time, so rng.choice is called
# once per category in the dict's insertion order.
question_types = dict(zip(
    question_type_choices.keys(),
    map(rng.choice, question_type_choices.values()),
))
question_types

{'program-atom': 'Trace a sequence of expressions for given values of the involved variables',
 'program-block/relational': 'Identify the variable playing a specific role',
 'program-macro': 'Select from given options the program that is computationally equivalent to a reference one',
 'function-atom': 'Identify the purpose of an expression in connection with the problem domain',
 'function-block/relational': 'Identify the block with a given function, described in problem-domain terms',
 'function-macro': 'Select the sentence, from a few options, which most accurately summarises the program’s purpose'}

In [9]:
# Task index 1, cell index 1: draw the code location each question targets.
rng = get_rng(1, 1)

# Candidate parameters per question category. There is no "function-macro"
# entry, so that category ends up with empty parameters further down.
parameter_choices = {
    "program-atom": [
        {"lines": (22, 25)},
        {"lines": (28, 31)},
    ],
    "program-block/relational": [
        {
            "lines": [(9, 12), (12, 12)],
            "pattern": "input processing",
        },
        {
            "lines": [(31, 32), (32, 32)],
            "pattern": "results accumulation",
        },
        {
            "lines": [(9, 10), (14, 15)],
            "pattern": "utility function",
        },
    ],
    "program-macro": [
        {"lines": (5, 7)},
        {"lines": (22, 25)},
        {"lines": (28, 31)},
    ],
    "function-atom": [
        {"lines": (12, 12)},
        {"lines": (17, 17)},
        {"lines": (28, 28)},
    ],
    "function-block/relational": [
        {"lines": (5, 7)},
        {"lines": (22, 25)},
        {"lines": (28, 30)},
    ],
}

# One draw per category, in the insertion order of parameter_choices.
parameters = dict(
    (category, rng.choice(candidates))
    for category, candidates in parameter_choices.items()
)

# Show the chosen question type next to its drawn parameters.
{
    category: {
        "question_type": chosen_type,
        "parameters": parameters.get(category, {}),
    }
    for category, chosen_type in question_types.items()
}

{'program-atom': {'question_type': 'Trace a sequence of expressions for given values of the involved variables',
  'parameters': {'lines': (22, 25)}},
 'program-block/relational': {'question_type': 'Identify the variable playing a specific role',
  'parameters': {'lines': [(31, 32), (32, 32)],
   'pattern': 'results accumulation'}},
 'program-macro': {'question_type': 'Select from given options the program that is computationally equivalent to a reference one',
  'parameters': {'lines': (28, 31)}},
 'function-atom': {'question_type': 'Identify the purpose of an expression in connection with the problem domain',
  'parameters': {'lines': (17, 17)}},
 'function-block/relational': {'question_type': 'Identify the block with a given function, described in problem-domain terms',
  'parameters': {'lines': (5, 7)}},
 'function-macro': {'question_type': 'Select the sentence, from a few options, which most accurately summarises the program’s purpose',
  'parameters': {}}}

In [10]:
# Questions for the natural-language-processing task (task index 1, cell
# index 2).
# NOTE: the three randomise_multiple_choice(...) calls below all shuffle with
# the same `rng`, so their order in this list determines the resulting option
# order of every multiple-choice question. Do not reorder them casually.
rng = get_rng(1, 2)
questions = [
    # program / atom: trace three lines for the given input values.
    {
        "dimension": "program",
        "level": "atom",
        "question": "Given the following value for `s_vectors` and `student_a`, determine the results of each of the operations below.",
        "input": {
            "s_vectors": '''[("alice.txt", [1, 2, 3]), ("bert.txt", [4, 5, 6])]''',
            "student_a": '''"bert.txt"''',
        },
        # presumably "a", "b", "c" are placeholders for the values the
        # participant fills in — TODO confirm against the front end
        "operations": [
            {
                "line": 22,
                "output": {
                    "new_vectors": "a",
                },
            },
            {
                "line": 24,
                "output": {
                    "current_index": "b",
                },
            },
            {
                "line": 25,
                "output": {
                    "new_vectors": "c",
                },
            },
        ],
        "type": "regex",
        "multiple": True,
        # One pattern per operation above. The .replace() calls loosen the
        # hand-written pattern: every literal space becomes `\s*` (any amount
        # of whitespace, including none) and every double quote becomes a
        # character class accepting either quote style. The outer brackets
        # and the leading "s_vectors: " prefix are optional.
        # NOTE(review): the optional prefix is "s_vectors: " although the
        # output fields are named "new_vectors" — confirm this is intended.
        "answer": [
            r'^(s_vectors: )?\[?\("alice\.txt", \[1, 2, 3\]\), \("bert\.txt", \[4, 5, 6\]\)\]?$'.replace(' ', r'\s*').replace('"', r"""["']"""),
            r'^1$',
            r'^(s_vectors: )?\[?\("alice\.txt", \[1, 2, 3\]\)\]?$'.replace(' ', r'\s*').replace('"', r"""["']"""),
        ]
    },
    # program / block: free-text answer with a single expected variable name.
    {
        "dimension": "program",
        "level": "block/relational",
        "question": "An accumulator is a variable that is updated several times with partial information, so that it can later be used to present a composite result. Name the variable that acts as an accumulator in this program.",
        "type": "text",
        "answer": "comparisons"
    },
    # program / macro: first argument is the correct snippet, the list holds
    # the distractors; the helper shuffles them and supplies "options",
    # "type" and "answer".
    {
        "dimension": "program",
        "level": "macro",
        "question": "Which of the following code snippets is equivalent to (e.g., gives the same result as) lines 28-31 of the program?",
        **randomise_multiple_choice(
            '''comparison = (*sorted((student_a, student_b)), similarity(text_vector_a, text_vector_b)[0][1], similarity(text_vector_a, text_vector_b)[0][1] > threshold)''',
            [
                '''comparison = (*sorted((student_a, student_b)), similarity(text_vector_a, text_vector_b)[0][1], sim_score > threshold)''',
                '''comparison = (student_a, student_b, similarity(text_vector_a, text_vector_b)[0][1], similarity(text_vector_a, text_vector_b)[0][1] > threshold)''',
                '''comparison = (student_a, student_b, similarity([text_vector_a, text_vector_b])[0][1], sim_score > threshold)'''
            ],
            rng
        )
    },
    # function / atom: multiple choice, correct answer passed first.
    {
        "dimension": "function",
        "level": "atom",
        "question": "Indicate the purpose of the code on line 17.",
        **randomise_multiple_choice(
            "Pair up the names of the student's documents with the corresponding vectors.",
            [
                "Sort the student filenames and their corresponding vectors.",
                "Determine the similarity score for each student's document.",
                "Remove the student files for which the vector is empty."
            ],
            rng
        )
    },
    # function / block: every [start, end] pair listed is an accepted answer.
    {
        "dimension": "function",
        "level": "block/relational",
        "question": "Give the starting and ending line numbers of the code that performs the following function: 'Get the names of each student's file and their contents'.",
        "type": "block",
        "answer": [
            [4, 7],
            [5, 7],
            [4, 8],
            [5, 8],
        ]
    },
    # function / macro: multiple choice, correct answer passed first.
    {
        "dimension": "function",
        "level": "macro",
        "question": "Which sentence best describes the purpose of the program?",
        **randomise_multiple_choice(
            "Check whether students have plagiarized each other.",
            [
                "Determine if students have submitted a correct solution.",
                "Sort student's assignments based on their similarity.",
                "Analyze texts submitted by students.",
            ],
            rng
        )
    }
]
# Shuffle the question order with this cell's seeded RNG (after it has been
# used for the multiple-choice options), then save the questions as JSON.
rng.shuffle(questions)
# Create the output directory on demand: without this, running the notebook
# in a clean working directory fails with FileNotFoundError on open().
os.makedirs("questions", exist_ok=True)
with open(os.path.join("questions", "natural-language-processing.json"), "w") as file:
    json.dump(questions, file, indent=4)