In [None]:
# optional setup; use if the notebook is not running inside the rtfm conda environment
!git clone https://github.com/mlfoundations/rtfm.git
%cd rtfm

# Ensure pip is up to date
!pip install --upgrade pip

# Install Python 3.8 using pip
!pip install python==3.8

# Install pip dependencies from requirements.txt
!pip install -r requirements.txt

# Install additional dependencies
!pip install git+https://github.com/jpgard/llama-recipes.git
!pip install -e .
!pip install --no-deps git+https://github.com/mlfoundations/tableshift.git

# Inference with TabuLa-8B

This notebook shows example workflows for performing inference with TabuLa-8B.

For best performance, this notebook should be run with access to a GPU.

TabuLa-8B supports inference on zero- and few-shot tabular data (with the number of shots only limited by the context window of the model) and both categorical and continuous inputs. Below, we show examples of both. 

TabuLa's inference uses pandas DataFrames to construct examples for downstream inference. We construct the DataFrames directly below, but you can also read them from CSV files or from any other source that can be converted to a DataFrame.

**Note about evaluation with labeled data**: If you only want to perform efficient evaluation on data that is already labeled (i.e. to assess the accuracy of TabuLa on your own dataset), we provide separate code for this, which is likely to be more performant than the code in this notebook (which is optimized for simplicity/usability, not performance). Please see the README in the main repo for instructions on how to prepare your data for evaluation with our eval pipeline. Note that this eval pipeline (not the code in this notebook) is also what was used to evaluate TabuLa-8B in our [paper](https://arxiv.org/abs/2406.12031).

# Model loading and setup

First, load the model and tokenizer. It is important to use the TabuLa tokenizer (not the base Llama 3 tokenizer) due to the special tokens used for serialization.

In [1]:
import pandas as pd
import torch
import numpy as np
import statsmodels.formula.api as sm
from transformers import AutoTokenizer, LlamaForCausalLM, AutoConfig
from statsmodels.stats.proportion import proportion_confint
import random
from rtfm.configs import TrainConfig, TokenizerConfig
from rtfm.inference_utils import InferenceModel
from rtfm.serialization.serializers import get_serializer
from rtfm.tokenization.text import prepare_tokenizer

train_config = TrainConfig(model_name="mlfoundations/tabula-8b", context_length=8192)

# If using a base llama model (not fine-tuned TabuLa),
# make sure to set add_serializer_tokens=False
# (because we do not want to use special tokens for 
# the base model which is not trained on them).
tokenizer_config = TokenizerConfig()

# Load the configuration
config = AutoConfig.from_pretrained(train_config.model_name)

# bfloat16 matches the TabuLa train/eval setup. Store a real torch.dtype on the
# config (not the string 'bfloat16') so the config stays well-formed.
config.torch_dtype = torch.bfloat16

# Kept for any cell that wants to know where tensors should live; the model
# itself is placed by device_map below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pass torch_dtype explicitly: assigning config.torch_dtype alone does not
# decide the dtype the weights are loaded in, so without this argument the
# checkpoint is loaded in the default dtype instead of bfloat16.
# device_map="auto" already dispatches the weights to the available device(s);
# calling .to(device) on a dispatched model is discouraged by accelerate and
# breaks models that were spread over several devices, so it is not used here.
model = LlamaForCausalLM.from_pretrained(
    train_config.model_name,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)
serializer = get_serializer(train_config.serializer_cls)

# prepare_tokenizer returns the (tokenizer, model) pair to use from here on;
# the serializer's special tokens are only passed when the tokenizer config
# asks for them.
tokenizer, model = prepare_tokenizer(
    model,
    tokenizer=tokenizer,
    pretrained_model_name_or_path=train_config.model_name,
    model_max_length=train_config.context_length,
    use_fast_tokenizer=tokenizer_config.use_fast_tokenizer,
    serializer_tokens_embed_fn=tokenizer_config.serializer_tokens_embed_fn,
    serializer_tokens=serializer.special_tokens
    if tokenizer_config.add_serializer_tokens
    else None,
)

inference_model = InferenceModel(model=model, tokenizer=tokenizer, serializer=serializer)

  from .autonotebook import tqdm as notebook_tqdm
2024-07-18 20:14:51,054	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-07-18 20:14:51,229	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
read user yaml files: 0it [00:00, ?it/s]
Loading checkpoint shards: 100%|██████████| 7/7 [00:05<00:00,  1.26it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Creating your own data for inference

If you just want to explore the model, or would like to construct your own data for inference, you can build DataFrames to represent the labeled examples ("shots"), if any are used, and the target example that you want to predict on.

Below is an example.


In [2]:
def get_embeddings(model, dataframe, target_colname, target_choices):
    """
    Embed every row of ``dataframe`` and stack the results.

    Each row is handed to ``model.predict`` on its own, as a one-row frame,
    with ``embed=True`` and no labeled examples. The row position is printed
    every 50 rows as a progress indicator. The per-row outputs are
    concatenated along dimension 0 and returned.

    NOTE(review): assumes ``model.predict(..., embed=True)`` returns a tensor
    with a leading batch dimension - confirm against InferenceModel.
    """
    num_rows = len(dataframe)
    per_row = []
    for row_idx in range(num_rows):
        if row_idx % 50 == 0:
            print(row_idx)
        # Double brackets keep the row as a one-row DataFrame, not a Series.
        single_row = dataframe.iloc[[row_idx]]
        per_row.append(
            model.predict(
                target_example=single_row,
                target_colname=target_colname,
                target_choices=target_choices,
                labeled_examples=None,
                embed=True,
            )
        )
    return torch.cat(per_row, dim=0)

def select_examples(k, example_embeddings, target_embedding):
    """
    Pick the examples most similar to the target by cosine similarity.

    Both inputs are L2-normalized (``torch.nn.functional.normalize`` with its
    default dimension), so the matrix product of the two is the cosine
    similarity between each example and the target.

    Args:
        k: number of examples to select. If ``k`` is larger than the number of
            candidates, all candidates are returned instead of raising.
        example_embeddings: candidate embeddings, one row per example
            (assumed 2-D, ``(num_examples, dim)``).
        target_embedding: embedding of the target (assumed 2-D, ``(1, dim)``).

    Returns:
        A list of row indices into ``example_embeddings``, ordered from most
        to least similar.
    """
    examples = torch.nn.functional.normalize(example_embeddings)
    target = torch.nn.functional.normalize(target_embedding)
    scores = torch.flatten(examples @ target.T)
    # topk raises if k exceeds the number of scores; clamp so that asking for
    # more shots than there are candidates returns every candidate.
    k = min(k, scores.numel())
    return torch.topk(scores, k).indices.tolist()

# compute leave-one-out few-shot accuracy with random or RICES-selected examples
def get_acc(df, embeddings, num_shots, use_rices, target_col, target_choices, model=None):
    """
    Leave-one-out few-shot accuracy over the rows of ``df``.

    Every row is used once as the target; its shots are drawn from the
    remaining rows, either at random or as the rows whose embeddings are most
    similar to the target's (RICES).

    Args:
        df: frame holding both the targets and the candidate shots.
        embeddings: one embedding per row of ``df``, in the same row order;
            only read when ``use_rices`` is true.
        num_shots: number of labeled examples given to the model per target.
        use_rices: if true, select shots by embedding similarity via
            ``select_examples``; otherwise sample them with ``random.choices``.
        target_col: name of the column to predict.
        target_choices: allowed values for the prediction.
        model: object exposing ``predict``; defaults to the notebook-level
            ``inference_model`` so existing calls keep working.

    Returns:
        Fraction of rows whose prediction equals the value in ``target_col``.
    """
    if model is None:
        model = inference_model

    n = df.shape[0]
    num_correct = 0
    for i in range(n):
        if i % 10 == 0:
            print(i)
        # Positions in df of every row except the current target.
        available_ixs = [j for j in range(n) if j != i]

        if not use_rices:
            # NOTE(review): random.choices samples with replacement, so the
            # same row can appear more than once among the shots.
            selected = random.choices(available_ixs, k=num_shots)
        else:
            nearest = select_examples(num_shots, embeddings[available_ixs], embeddings[[i]])
            # select_examples returns positions within the available_ixs
            # subset; map them back to positions in df. Using them directly
            # picks the wrong rows from position i onward and can select the
            # target row itself as a shot.
            selected = [available_ixs[j] for j in nearest]

        output = model.predict(
            target_example=df.iloc[[i]],
            target_colname=target_col,
            max_new_tokens=10,
            target_choices=target_choices,
            labeled_examples=df.iloc[selected],
        )

        true_label = df.iloc[i][target_col]
        print(output, true_label)
        num_correct += int(output == true_label)

    return num_correct / n

In [4]:
# Synthetic linear-regression data: y = X @ beta + Gaussian noise scaled by 0.5,
# where X has standard-normal entries and beta holds the signs of normal draws.
n, d = 1000, 2
X = np.random.randn(n, d)
beta = np.sign(np.random.randn(d))
y = X @ beta + 0.5 * np.random.randn(n)

# One column per feature, followed by the response as the last column.
columns = [f"x{j}" for j in range(d)] + ["y"]
data_matrix = np.column_stack((X, y))
regression_df = pd.DataFrame(data_matrix, columns=columns)

In [5]:
# Fit ordinary least squares on the synthetic data, then show the true
# coefficient vector so it can be compared with the fitted x0/x1 coefficients.
ols_model = sm.ols(formula="y ~ x0 + x1", data=regression_df)
result = ols_model.fit()
print(result.summary())
beta

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.892
Model:                            OLS   Adj. R-squared:                  0.892
Method:                 Least Squares   F-statistic:                     4123.
Date:                Thu, 18 Jul 2024   Prob (F-statistic):               0.00
Time:                        20:15:19   Log-Likelihood:                -695.99
No. Observations:                1000   AIC:                             1398.
Df Residuals:                     997   BIC:                             1413.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0026      0.015      0.171      0.8

array([-1.,  1.])

In [None]:
# Regression by bisection: for each target row, repeatedly ask the model the
# binary question "is y >= cutoff?" and halve the search bracket [a, b]
# according to the answer, until the bracket is narrower than `tol`.
df = regression_df
TARGET_COL = 'y'
tol = .01


def _binarize_target(frame, cutoff):
    """Return a string-typed copy of `frame` whose TARGET_COL is 1 if >= cutoff, else 0."""
    labels = (frame[TARGET_COL] >= cutoff).astype(int)
    return frame.assign(**{TARGET_COL: labels}).astype(str)


preds = []
for i in range(50):

    print('example {}'.format(i))
    # Initial search bracket for the target's value of y.
    a = -4
    b = 4

    target_ix = i
    # Candidate shots are all rows except the target. len(df) is used instead
    # of the notebook-level `n`, which another cell reassigns.
    available_ixs = [ix for ix in range(len(df)) if ix != target_ix]
    labeled_examples = random.choices(available_ixs, k=32)

    print('true label', df.iloc[target_ix]['y'])

    # Only the shots and the target row are ever passed to the model, so slice
    # them out once instead of copying and stringifying the whole frame on
    # every bisection step.
    shot_rows = df.iloc[labeled_examples]
    target_row = df.iloc[[target_ix]]

    while b - a > tol:

        print(a, b)

        cutoff = (a + b) / 2

        # NOTE(review): the target row carries its own binarized label here;
        # presumably predict() does not expose the target column to the
        # model - confirm against InferenceModel.
        output = inference_model.predict(
            target_example=_binarize_target(target_row, cutoff),
            target_colname=TARGET_COL,
            target_choices=['0', '1'],
            max_new_tokens=4,
            labeled_examples=_binarize_target(shot_rows, cutoff),
        )

        # '1' means the model believes y >= cutoff: move the lower bound up;
        # otherwise move the upper bound down.
        if output == '1':
            a = cutoff
        else:
            b = cutoff

    # The prediction is the last cutoff that was tested.
    print("prediction ", cutoff)
    preds.append(cutoff)

In [None]:
# Predict a single example: row 102 of `df` is the target and row 0 is the
# only labeled shot.
# NOTE(review): TARGET_CHOICES is not assigned by any cell above this one in
# the visible notebook (only by later cells), so on a fresh top-to-bottom run
# this cell appears to depend on state from a cell executed out of order -
# confirm which `df` / TARGET_CHOICES were intended here.
target_example = df.iloc[[102]]
single_shot = df.iloc[[0]]

output = inference_model.predict(
    target_example=target_example,
    labeled_examples=single_shot,
    target_colname=TARGET_COL,
    target_choices=TARGET_CHOICES,
    max_new_tokens=10,
)
print(f"Prediction for sample \n {target_example} \n is: {output}")

In [None]:
from datasets import load_dataset
# Numerical-feature classification datasets from the inria-soda tabular
# benchmark; each one is shuffled and truncated to at most 1000 rows.
df_names = ['MagicTelescope', 'covertype', 'house_16H', 'Diabetes130US', 'Higgs']
df_list = []
for name in df_names:
    dataset = load_dataset("inria-soda/tabular-benchmark", data_files="clf_num/{}.csv".format(name))
    # Shuffle and truncate BEFORE casting to str: the selected rows are the
    # same, but only the rows that are kept get converted instead of the
    # entire dataset.
    # NOTE(review): the shuffle is unseeded, so the selected rows differ
    # between runs.
    df = dataset['train'].to_pandas().sample(frac=1).iloc[:1000].astype(str)
    df_list.append(df)

In [None]:
# df = pd.read_csv("../multiclass_logistic.csv").astype(str)
# TARGET_COL = 'y'
# TARGET_CHOICES = list(df['y'].unique())

In [None]:
# One-shot smoke test on every loaded dataset: predict row 102 using row 0 as
# the single labeled example.
for df in df_list:
    # The last column is used as the prediction target; its distinct values
    # are the candidate answers.
    TARGET_COL = df.columns[-1]
    TARGET_CHOICES = list(df[TARGET_COL].unique())

    target_example = df.iloc[[102]]
    single_shot = df.iloc[[0]]

    output = inference_model.predict(
        target_example=target_example,
        labeled_examples=single_shot,
        target_colname=TARGET_COL,
        target_choices=TARGET_CHOICES,
        max_new_tokens=10,
    )
    print(f"Prediction for sample \n {target_example} \n is: {output}")

In [None]:
# Compare RICES (similarity-selected) shots against random shots on every
# dataset, for each shot count, and write one CSV of results per dataset.
N = 1000             # maximum number of rows evaluated per dataset
NUM_SHOTS = [2, 8]

for i, df in enumerate(df_list):

    # The last column is used as the prediction target.
    TARGET_COL = df.columns[-1]
    TARGET_CHOICES = list(df[TARGET_COL].unique())

    df_embeddings = get_embeddings(inference_model, df, TARGET_COL, TARGET_CHOICES)

    eval_df = df.iloc[:N]
    # A dataset may hold fewer than N rows; the confidence interval must use
    # the number of rows that were actually evaluated.
    n_test = len(eval_df)

    results = []
    for num_shots in NUM_SHOTS:
        for use_rices in [True, False]:
            acc = get_acc(eval_df, df_embeddings[:N], num_shots, use_rices, TARGET_COL, TARGET_CHOICES)
            # acc is num_correct / n_test; round() recovers the integer count,
            # whereas int() truncates when the product lands just below it
            # (e.g. 0.57 * 100 == 56.99999999999999).
            num_correct = round(acc * n_test)
            ci = proportion_confint(num_correct, n_test, alpha=0.05, method='beta')
            res = {"RICES": use_rices, "num_shots": num_shots, "acc": acc, 'n_test': n_test, 'ci': ci}
            print(res)
            results.append(res)

    results_df = pd.DataFrame(results)
    results_df.to_csv('{}_rices_{}.csv'.format(df_names[i], N), index=False)

In [None]:
# Display the results table. results_df is rebuilt on every iteration of the
# evaluation loop above, so this shows the results of the last dataset only.
results_df

In [None]:
# Sanity check of the interval used above: 95% confidence interval
# (method='beta', i.e. Clopper-Pearson) for 80 successes out of 100 trials.
# Dedicated names are used so the notebook-level `n`, which the bisection cell
# reads, is not overwritten by this scratch computation.
n_trials = 100
n_successes = round(.8 * n_trials)
proportion_confint(n_successes, n_trials, alpha=0.05, method='beta')