# Generate the data for the statistical in-context learning experiments

In [1]:
import numpy as np
import pandas as pd

In [2]:
def linear_classification_task(dim, num_examples, seed=None):
    """Sample a synthetic binary classification task with a linear boundary.

    Features are independent standard-normal draws, rounded to two decimals.
    A standard-normal weight vector is then drawn from the same generator,
    and an example is labelled 1 when the dot product of its (rounded)
    features with the weights is non-negative, otherwise 0.

    Args:
        dim: number of features per example.
        num_examples: number of examples to draw.
        seed: seed handed to ``np.random.default_rng``; with ``None`` every
            call produces a different dataset.

    Returns:
        A tuple ``(X_data, y_data)`` of arrays with shapes
        ``(num_examples, dim)`` and ``(num_examples, 1)``; ``y_data`` holds
        integer 0/1 labels.
    """
    generator = np.random.default_rng(seed)

    # Draw order matters for reproducibility: features first, then weights.
    features = generator.normal(loc=0, scale=1, size=(num_examples, dim)).round(2)
    weights = generator.normal(loc=0, scale=1, size=(dim, 1))

    # Labels are computed from the rounded features so they match what is returned.
    scores = np.matmul(features, weights)
    labels = (scores >= 0).astype(int)
    return features, labels

# quick sanity check: 3 examples with 3 features each; no seed is passed,
# so the values shown in the output change from run to run
linear_classification_task(3, 3)

(array([[-0.14, -0.03,  0.03],
        [ 1.13,  0.23,  0.57],
        [-1.44, -1.35,  0.87]]),
 array([[0],
        [1],
        [0]]))

In [3]:
# Generate many independent replications of the task. Every replication uses
# its own seed and therefore gets its own features and decision boundary.
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing the first file.
os.makedirs("../synthetic", exist_ok=True)

for idx in range(1000):
    for dim in [8]:
        # 1000*idx + dim is distinct for every (idx, dim) pair as long as dim < 1000
        X_data, y_data = linear_classification_task(dim, 20000, seed=1000 * idx + dim)

        # put into a pandas data frame with column names X1, X2, ... and target name Y
        df = pd.DataFrame(X_data, columns=[f"X{i+1}" for i in range(dim)])
        # the labels come back with shape (num_examples, 1); store them as a flat column
        df["Y"] = y_data.ravel()

        # save as csv
        df.to_csv(f"../synthetic/linear-classification-d={dim}-replication={idx}.csv", index=False)

# preview the last generated dataset; this must sit outside the loops so that
# it is the final expression of the cell and actually gets displayed
df.head()