In [None]:
# optional setup; use if the notebook is not running inside the rtfm conda environment
!git clone https://github.com/mlfoundations/rtfm.git
%cd rtfm

# Ensure pip is up to date
!pip install --upgrade pip

# Install Python 3.8 using pip
!pip install python==3.8

# Install pip dependencies from requirements.txt
!pip install -r requirements.txt

# Install additional dependencies
!pip install git+https://github.com/jpgard/llama-recipes.git
!pip install -e .
!pip install --no-deps git+https://github.com/mlfoundations/tableshift.git

# Inference with TabuLa-8B

This notebook shows some example workflows of how to perform inference with TabuLa-8B. 

For best performance, this notebook should be run with access to a GPU.

TabuLa-8B supports inference on zero- and few-shot tabular data (with the number of shots only limited by the context window of the model) and both categorical and continuous inputs. Below, we show examples of both. 

TabuLa's inference uses pandas DataFrames to construct examples for downstream inference. We directly construct pandas DataFrames below, but you can also read DataFrames from CSV files or any other source that can be converted to a DataFrame.

**Note about evaluation with labeled data**: If you only want to perform efficient evaluation on data that is already labeled (i.e. to assess the accuracy of TabuLa on your own dataset), we provide separate code to do this which is likely to be more performant than the code in this notebook (which is optimized for simplicity/usability, not performance). Please see the README in the main repo for instructions on how to prepare your data for evaluation with our eval pipeline. Note that this eval pipeline (not the code in this notebook) is also what was used to evaluate TabuLa-8B in our [paper](https://arxiv.org/abs/2406.12031).

# Model loading and setup

First, load the model and tokenizer. It is important to use the TabuLa tokenizer (not the base Llama 3 tokenizer) due to the special tokens used for serialization.

In [None]:
import pandas as pd
import torch
import numpy as np
import statsmodels.formula.api as sm
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig
from statsmodels.stats.proportion import proportion_confint
import random
from rtfm.configs import TrainConfig, TokenizerConfig
from rtfm.inference_utils import InferenceModel
from rtfm.serialization.serializers import get_serializer
from rtfm.tokenization.text import prepare_tokenizer

train_config = TrainConfig(model_name="mlfoundations/tabula-8b", context_length=8192)

# If using a base llama model (not fine-tuned TabuLa),
# make sure to set add_serializer_tokens=False
# (because we do not want to use special tokens for
# the base model which is not trained on them).
tokenizer_config = TokenizerConfig()

# Load the configuration
config = AutoConfig.from_pretrained(train_config.model_name)

# Set the torch_dtype to bfloat16 which matches TabuLa train/eval setup
config.torch_dtype = 'bfloat16'

# Kept for any downstream code that wants to know where tensors should live.
device = "cuda" if torch.cuda.is_available() else "cpu"

# device_map="auto" already decides where every weight is placed (possibly across
# several devices). Do not chain `.to(device)` onto the result: moving a model
# that was dispatched this way conflicts with that placement.
model = LlamaForCausalLM.from_pretrained(
    train_config.model_name, device_map="auto", config=config)

tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)
serializer = get_serializer(train_config.serializer_cls)

# prepare_tokenizer returns both the tokenizer and the model, so rebind both.
# Serializer special tokens are only passed when the tokenizer config asks for them.
tokenizer, model = prepare_tokenizer(
    model,
    tokenizer=tokenizer,
    pretrained_model_name_or_path=train_config.model_name,
    model_max_length=train_config.context_length,
    use_fast_tokenizer=tokenizer_config.use_fast_tokenizer,
    serializer_tokens_embed_fn=tokenizer_config.serializer_tokens_embed_fn,
    serializer_tokens=serializer.special_tokens
    if tokenizer_config.add_serializer_tokens
    else None,
)

inference_model = InferenceModel(model=model, tokenizer=tokenizer, serializer=serializer)

# Creating your own data for inference

If you just want to explore the model, or would like to construct your own data for inference, you can construct DataFrames to represent the labeled examples ("shots"), if any are used, and the target example that you want to predict on.

Below is an example.


In [None]:
def get_embeddings(model, dataframe, target_colname, target_choices):
    """
    Embed every row of `dataframe` and stack the per-row results.

    Each row is handed to ``model.predict`` on its own as a one-row frame, with
    ``embed=True`` and no labeled examples. The row position is printed every
    50 rows as a progress indicator. The individual outputs are concatenated
    along dimension 0 and returned.
    """
    def _embed_row(position):
        # Progress marker first, then the (potentially slow) model call.
        if position % 50 == 0:
            print(position)
        return model.predict(
            target_example=dataframe.iloc[[position]],
            target_colname=target_colname,
            target_choices=target_choices,
            labeled_examples=None,
            embed=True
        )

    row_count = dataframe.shape[0]
    per_row = [_embed_row(position) for position in range(row_count)]
    return torch.concat(per_row, axis=0)

def select_examples(k, example_embeddings, target_embedding):
    """
    Return the positions of the `k` examples most similar to the target.

    Both inputs are normalized with ``torch.nn.functional.normalize``, so the
    matrix product below is the cosine similarity between each example row and
    the target. The result is a plain Python list of row positions into
    `example_embeddings`, ordered as ``torch.topk`` returns them.
    """
    unit_examples = torch.nn.functional.normalize(example_embeddings)
    unit_target = torch.nn.functional.normalize(target_embedding)
    similarity = torch.matmul(unit_examples, unit_target.T).flatten()
    _, best_positions = torch.topk(similarity, k)
    return best_positions.tolist()

# compute leave-one-out accuracy with random or RICES-selected examples
def get_acc(df, embeddings, num_shots, use_rices, target_col, target_choices):
    """
    Leave-one-out few-shot accuracy of the global `inference_model` on `df`.

    For every row i, `num_shots` other rows are chosen as labeled examples and
    the model's prediction for row i is compared with the value in `target_col`.

    Args:
        df: DataFrame of examples; rows are addressed by position.
        embeddings: per-row embeddings aligned with `df` by position (only
            used when `use_rices` is true).
        num_shots: number of labeled examples to supply per prediction.
        use_rices: if true, pick the shots most similar to the target via
            `select_examples`; otherwise draw them at random.
        target_col: name of the column to predict.
        target_choices: candidate labels passed to the model.

    Returns:
        Fraction of rows for which the prediction equals the true label.
    """
    n = df.shape[0]
    num_correct = 0
    for i in range(n):
        if i % 10 == 0:
            print(i)
        # Every row position except the target itself.
        available_ixs = [ix for ix in range(n) if ix != i]

        if not use_rices:
            # random.choices samples WITH replacement, so a shot may repeat.
            selected = random.choices(available_ixs, k=num_shots)
        else:
            # select_examples returns positions inside the leave-one-out subset.
            # Map them back to positions in `df`; using them directly would be
            # off by one for every position >= i and could select the target
            # row itself as a labeled example.
            nearest = select_examples(num_shots, embeddings[available_ixs], embeddings[[i]])
            selected = [available_ixs[j] for j in nearest]

        output = inference_model.predict(
            target_example=df.iloc[[i]],
            target_colname=target_col,
            max_new_tokens=10,
            target_choices=target_choices,
            labeled_examples=df.iloc[selected],
        )

        true_label = df.iloc[i][target_col]
        print(output, true_label)
        correct = int(output == true_label)
        num_correct += correct

    return num_correct / n

In [None]:
# Synthetic linear-regression data: n rows, d standard-normal features, a
# random +/-1 coefficient per feature, and Gaussian noise with scale 0.5.
n, d = 1000, 2
X = np.random.randn(n, d)
beta = np.sign(np.random.randn(d))
y = X @ beta + .5 * np.random.randn(n)

# Features followed by the target as the last column.
data_matrix = np.column_stack((X, y))
columns = [f'x{j}' for j in range(d)] + ['y']
regression_df = pd.DataFrame(data_matrix, columns=columns)

In [None]:
# Sanity check: fit ordinary least squares of y on x0 and x1 using the
# synthetic regression_df, and print the fit summary.
result = sm.ols(formula="y ~ x0 + x1", data=regression_df).fit()
print(result.summary())
# Display the coefficients that generated the data, for comparison with the fit.
beta

In [None]:
from datasets import load_dataset
# Load the house_sales CSV from the inria-soda/tabular-benchmark dataset.
dataset = load_dataset("inria-soda/tabular-benchmark", data_files="reg_cat/house_sales.csv")
# Convert the train split to pandas, shuffle the rows and keep the first 1000.
# sample(frac=1) is called without random_state, so the selection differs per run.
dataset = dataset['train'].to_pandas().sample(frac=1).iloc[:1000]

In [None]:
def _format_bucket_labels(thresholds):
    """Build the ordered bucket labels for a sequence of thresholds.

    For thresholds t0 < t1 < ... < tk the labels are 'less than t0',
    'between t0 and t1', ..., 'greater than tk', i.e. len(thresholds) + 1 labels.
    This is the single source of the label strings so that the discretized
    column values and the choices offered to the model can never drift apart.

    Raises:
        ValueError: if `thresholds` is empty.
    """
    if len(thresholds) == 0:
        raise ValueError("thresholds must contain at least one value")

    labels = []
    for i, threshold in enumerate(thresholds):
        if i == 0:
            labels.append(f"less than {threshold}")
        else:
            labels.append(f"between {thresholds[i-1]} and {threshold}")
    labels.append(f"greater than {thresholds[-1]}")
    return labels


def discretize_continuous_column(column: pd.Series, thresholds) -> pd.Series:
    """Take a continuous-valued column and discretize it using `thresholds`.

    The formatting of the outputs is of the form 'less than 0', 'between 0.5 and 0.9', or 'greater than 1.99'
    etc. depending on the value of the observation and the thresholds used.

    This is the same format used in training TabuLa-8B and should be used for inference and
    evaluation of that model.

    Args:
        column: numeric Series to discretize.
        thresholds: non-empty, indexable sequence of bucket boundaries, expected
            in ascending order. A value goes into the first bucket whose upper
            threshold is strictly greater than it, so a value equal to a
            threshold lands in the bucket above that threshold.

    Returns:
        Series of bucket labels, one per entry of `column`.

    Raises:
        AssertionError: if `column` is not numeric.
        ValueError: if `thresholds` is empty.
    """
    assert pd.api.types.is_numeric_dtype(column)

    # Format the labels once instead of once per row.
    labels = _format_bucket_labels(thresholds)

    def categorize_value(x):
        for i, threshold in enumerate(thresholds):
            if x < threshold:
                return labels[i]
        # Not below any threshold. NOTE: NaN also ends up here, because every
        # comparison with NaN is false.
        return labels[-1]

    # Apply the categorization function to the column
    return column.apply(categorize_value)


def get_regression_bucket_choices(thresholds):
    """Return the list of bucket labels matching `discretize_continuous_column`.

    Args:
        thresholds: non-empty, indexable sequence of bucket boundaries.

    Returns:
        List of len(thresholds) + 1 label strings, ordered from lowest bucket
        to highest.

    Raises:
        ValueError: if `thresholds` is empty.
    """
    return _format_bucket_labels(thresholds)

In [None]:
# Example: three thresholds produce four bucket labels.
get_regression_bucket_choices([.333, .66, .888])

In [None]:
def regress_example(model, df, target_col, target_ix, shots_ixs, lower_bound, upper_bound, num_buckets, tol=.01):
    """Predict a continuous target by repeatedly narrowing an interval.

    On each round the current interval [lower_bound, upper_bound] is split into
    `num_buckets` equal-width buckets, the target column is discretized into
    those buckets, and the model is asked which bucket the target row falls in.
    The chosen bucket becomes the new interval. The search stops once the
    interval is no wider than `tol`, or once a round fails to change the
    bounds (which happens when the 4-decimal rounding of the thresholds can no
    longer shrink the interval).

    Example of the bucketing: with bounds [0, 1] and 3 buckets the choices are
    'less than .333', 'between .333 and .666' and 'greater than .666', so there
    are num_buckets - 1 thresholds, all strictly between the bounds.

    Args:
        model: object exposing `predict(target_example, target_colname,
            target_choices, max_new_tokens, labeled_examples)`.
        df: DataFrame holding the target row and the shots; it is copied each
            round and never modified.
        target_col: name of the numeric column to predict.
        target_ix: row position of the example to predict.
        shots_ixs: row positions of the labeled examples.
        lower_bound, upper_bound: initial search interval.
        num_buckets: number of buckets per round; must be at least 2.
        tol: stop once upper_bound - lower_bound <= tol.

    Returns:
        Midpoint of the final interval.

    Raises:
        ValueError: if `num_buckets` < 2, or if the model output is not exactly
            one of the offered bucket labels.
    """
    if num_buckets < 2:
        raise ValueError("num_buckets must be at least 2")

    while upper_bound - lower_bound > tol:
        # Interior split points only: drop both end points of the linspace.
        thresholds = np.linspace(lower_bound, upper_bound, num_buckets + 1)[1:-1].round(4)

        curr_df = df.copy()

        curr_df[target_col] = discretize_continuous_column(curr_df[target_col], thresholds)
        target_choices = get_regression_bucket_choices(thresholds)

        output = model.predict(
            target_example = curr_df.iloc[[target_ix]],
            target_colname = target_col,
            target_choices = target_choices,
            max_new_tokens = 50,
            labeled_examples = curr_df.iloc[shots_ixs],
        )
        print(lower_bound, upper_bound)
        print(thresholds)
        print(target_choices)
        print(output)
        print("------- \n")

        last_bucket = len(target_choices) - 1

        # Index of the chosen bucket; ValueError if the output is not a known label.
        i = target_choices.index(output)

        if i == 0:
            new_lower, new_upper = lower_bound, thresholds[0]
        elif i == last_bucket:
            new_lower, new_upper = thresholds[-1], upper_bound
        else:
            new_lower, new_upper = thresholds[i-1], thresholds[i]

        # Rounded thresholds can coincide with the current bounds; without this
        # check the loop would repeat the same round forever.
        if new_lower == lower_bound and new_upper == upper_bound:
            break

        lower_bound, upper_bound = new_lower, new_upper

    return (upper_bound + lower_bound) / 2





In [None]:
# Regression experiment configuration.
# Round values to 4 decimals before bucketing the target.
df = dataset.round(4)
TARGET_COL = df.columns[-1]  # the last column is treated as the regression target
N = 50
num_shots = 32
# Initial search interval for the bucketed regression: the observed target range.
init_a = df[TARGET_COL].min()
init_b = df[TARGET_COL].max()
num_buckets = 3
range_shots = [4,8]
# A bare expression is only displayed when it is the last statement of the
# cell, so the summary tuple goes at the end.
TARGET_COL, init_a, init_b, num_buckets

In [None]:
# Bucketed regression on the first two rows, once per shot count in range_shots.
# Each result tuple is (row position, true label, prediction, number of shots).
res = []
for i in range(2):
    # Candidate shots: every row position except the target row.
    available_ixs = [ix for ix in range(df.shape[0]) if ix != i]
    for num_shots in range_shots:
        shot_ixs = random.choices(available_ixs, k=num_shots)

        pred = regress_example(
            inference_model, df, TARGET_COL, i, shot_ixs,
            init_a, init_b, num_buckets, tol=.01,
        )

        print(f"true label {df.iloc[i][TARGET_COL]}, prediction {pred}")
        res.append((i, df.iloc[i][TARGET_COL], pred, num_shots))

In [None]:
# Collect the (ix, true_label, prediction, num_shots) tuples into a DataFrame
# and write it to test.csv in the current working directory, without the index.
res_df = pd.DataFrame(res, columns=['ix', 'true_label', 'prediction', 'num_shots'])
res_df.to_csv('test.csv', index=False)

In [None]:
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [None]:
# Number of regression predictions collected so far. `preds` is never defined
# in this notebook, so count the rows of res_df, which holds the predictions.
len(res_df)

In [None]:
# Predicted vs. true target. `preds` is never defined in this notebook; res_df
# stores each prediction next to the true label of the row it was made for.
plt.scatter(res_df['prediction'], res_df['true_label'])
plt.xlabel('prediction')
plt.ylabel('true_label');

In [None]:
# Number of regression predictions. `preds` is never defined in this notebook,
# so count the rows of res_df, which holds the predictions.
len(res_df)

In [None]:
# R^2 of the bucketed-regression predictions. `preds` is never defined in this
# notebook; res_df pairs each prediction with the true label of its own row.
r2_score(res_df['true_label'], res_df['prediction'])

In [None]:
# Predict a single example: row 102 is the target and row 0 is the only labeled shot.
# NOTE(review): TARGET_CHOICES is first assigned in a later cell, so on a fresh
# kernel run from top to bottom this cell raises NameError — confirm the intended
# cell order / choices for this DataFrame.


target_example = df.iloc[[102]]
output = inference_model.predict(
    target_example= target_example,
    target_colname=TARGET_COL,
    target_choices=TARGET_CHOICES,
    max_new_tokens=10,
    labeled_examples= df.iloc[[0]],
)
print(f"Prediction for sample \n {target_example} \n is: {output}")

In [None]:
from datasets import load_dataset
# Numeric-feature classification datasets from inria-soda/tabular-benchmark.
df_names = ['MagicTelescope', 'covertype', 'house_16H', 'Diabetes130US', 'Higgs']

# For each dataset: load the CSV, cast every column to str, shuffle the rows
# and keep at most 1000 of them. df_list is ordered like df_names.
df_list = []
for name in df_names:
    dataset = load_dataset("inria-soda/tabular-benchmark", data_files=f"clf_num/{name}.csv")
    df = dataset['train'].to_pandas().astype(str).sample(frac=1).head(1000)
    df_list.append(df)

In [None]:
# df = pd.read_csv("../multiclass_logistic.csv").astype(str)
# TARGET_COL = 'y'
# TARGET_CHOICES = list(df['y'].unique())

In [None]:
# One-shot smoke test per dataset: predict row 102 using row 0 as the only
# labeled example. The last column is the target and its distinct values are
# the candidate labels.
for df in df_list:
    TARGET_COL = df.columns[-1]
    TARGET_CHOICES = list(df[TARGET_COL].unique())
    target_example = df.iloc[[102]]
    output = inference_model.predict(
        target_example=target_example,
        labeled_examples=df.iloc[[0]],
        target_colname=TARGET_COL,
        target_choices=TARGET_CHOICES,
        max_new_tokens=10,
    )
    print(f"Prediction for sample \n {target_example} \n is: {output}")

In [None]:
# For every dataset: embed all rows, then measure leave-one-out accuracy for each
# combination of shot count and shot selection (RICES vs. random), and write one
# CSV of results per dataset.
for i, df in enumerate(df_list):

    TARGET_COL = df.columns[-1]
    TARGET_CHOICES = list(df[TARGET_COL].unique())

    df_embeddings = get_embeddings(inference_model, df, TARGET_COL, TARGET_CHOICES)
    N = 1000
    NUM_SHOTS = [2, 8]

    # Evaluate on at most N rows. The confidence interval must use the number
    # of rows actually evaluated, which is smaller than N for short frames.
    eval_df = df.iloc[:N]
    n_test = eval_df.shape[0]

    results = []
    for num_shots in NUM_SHOTS:
        for use_rices in [True, False]:
            acc = get_acc(eval_df, df_embeddings[:N], num_shots, use_rices, TARGET_COL, TARGET_CHOICES)
            # acc is num_correct / n_test; round() recovers the integer count,
            # whereas int() would truncate a product such as 56.99999999999999.
            num_correct = round(acc * n_test)
            ci = proportion_confint(num_correct, n_test, alpha=0.05, method='beta')
            res = {"RICES": use_rices, "num_shots": num_shots, "acc": acc, 'n_test': n_test, 'ci': ci}
            print(res)
            results.append(res)

    results_df = pd.DataFrame(results)
    results_df.to_csv('{}_rices_{}.csv'.format(df_names[i], N), index=False)

In [None]:
# Display the results table. results_df is reassigned on every pass of the
# dataset loop, so this shows the last dataset only.
results_df

In [None]:
# Standalone example: 95% confidence interval (alpha=0.05, method='beta') for
# 80 successes out of 100 trials. NOTE: this rebinds the global `n`.
n=100
proportion_confint(int(.8 * n), n, alpha=0.05, method='beta')