In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!nvidia-smi

Fri Jul  4 01:28:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             46W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/',force_remount=True)

path = "/content/drive/MyDrive/dev/"

Mounted at /content/drive/


# Setup

I struggled with the installation with GPU support, only able to fix from https://gemini.google.com/app/f3ec74b59f92f56f

In [4]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
CUDA version: 12.4


In [5]:
# Install the pre-built wheel for CUDA 12.4
!pip uninstall -y llama-cpp-python
!pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

[0mLooking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu124
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.10.tar.gz (79.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.0/79.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python

In [6]:
from llama_cpp import Llama

In [None]:
model_path = path+".models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
generator_q2_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [None]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q2_k.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [7]:
model_path = path+".models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
generator_q4_k_m = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [None]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q4_k_m.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [None]:
model_path = path+".models/tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
generator_q6_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [None]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q6_k.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [None]:
model_path = path+".models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
generator_q8_0 = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [None]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q8_0.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 2147483647 layers to the GPU


# Tests

In [None]:
import time
import numpy as np

In [None]:
tokens = [1, 2, 3, 4, 5, 6]
probabilities = [0.1, 0.2, 0.3, 0.2, 0.1, 0.1]
size = (50, 20)

In [None]:
def create_prompt(tokens, probabilities, size, seed=1293):
    x = []
    np.random.seed(seed)
    prompts = np.random.choice(tokens, size=size, p=probabilities)
    for prompt in prompts:
        x = x+[' '.join(str(s) for s in prompt)]
    return x

In [None]:
prompts = create_prompt(tokens=tokens, probabilities=probabilities, size=size)
for prompt in prompts:
    print(prompt)

1 3 2 5 3 5 3 3 6 1 2 2 2 1 3 3 2 3 2 2
4 3 1 3 3 5 3 4 6 1 2 5 6 2 3 4 1 4 4 3
3 1 4 4 2 6 1 3 2 3 2 3 3 5 6 5 5 5 1 3
4 5 1 3 5 2 3 1 3 3 1 3 1 6 4 1 3 4 3 1
3 4 3 2 4 6 4 2 4 3 4 6 4 4 3 4 1 3 2 2
5 3 3 3 4 1 3 3 5 2 1 2 1 6 3 2 2 2 3 5
6 6 3 5 2 1 1 4 2 3 4 6 3 4 6 2 4 3 1 4
3 3 4 3 1 4 6 3 4 2 2 2 3 4 2 3 3 3 2 3
1 5 3 3 3 2 2 2 5 1 3 6 3 3 3 2 4 2 6 3
4 4 2 3 4 4 1 1 4 1 4 1 4 5 5 4 1 3 4 2
1 4 3 3 3 3 3 6 6 2 4 3 2 4 6 6 1 2 3 3
6 4 5 3 4 4 6 2 2 6 2 6 6 2 1 2 1 3 4 3
1 3 3 2 1 4 2 3 1 3 5 4 3 3 4 4 4 3 3 6
3 3 5 3 5 4 2 5 3 2 3 2 5 3 3 4 1 4 3 4
2 3 1 1 4 4 5 4 3 2 3 5 4 3 4 4 1 6 3 5
3 3 3 6 4 2 5 2 5 4 6 3 2 2 4 2 2 3 1 6
2 2 3 5 6 6 3 6 6 1 3 6 3 3 6 5 4 5 3 1
4 2 2 3 4 2 3 3 4 3 1 2 2 3 1 3 2 4 2 2
2 3 3 4 2 3 1 4 1 3 5 4 5 4 4 6 4 1 2 3
3 2 2 2 3 3 3 6 4 3 2 3 5 3 3 2 6 4 2 2
3 1 6 3 1 1 4 4 5 3 2 2 3 5 4 1 1 6 6 3
1 4 2 3 5 4 3 3 3 3 4 4 3 3 4 4 6 4 2 3
2 2 3 1 4 4 4 6 4 3 4 1 3 3 3 5 5 4 3 5
6 3 1 3 5 4 3 5 4 3 3 3 2 6 2 4 4 3 3 3
6 3 2 4 3 1 2 2 6 1 3 5 6 6 4 5 3 2 6 1


In [None]:
generators = {
    'q2_k': {'func': generator_q2_k, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q4_k_m': {'func': generator_q4_k_m, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q6_k': {'func': generator_q6_k, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q8_0': {'func': generator_q8_0, 'max_tokens': 'max_tokens', 'stop': 'stop'}
}

In [None]:
def time_execution(generator, prompt, params):
    start_time = time.time()
    response = generator(prompt, **params)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {'response': response, 'elapsed_time': elapsed_time}

In [None]:
max_tokens = 100
stop = ["\n"]

In [None]:
results = {}
r = []
t = []

for key in generators:
    results[key] = {}
    params = {
        generators[key]['max_tokens']: max_tokens
    }
    if key=='q2_k' or key=='q4_k_m':
        params['stop'] = stop
    print('Running ',key)
    for prompt in prompts:
        result = time_execution(generators[key]['func'], prompt, params)
        r = r+[result['response']['choices'][0]['text']]
        t = t+[result['elapsed_time']]
    results[key]['response'] = r
    results[key]['elapsed_time'] = t

Running  q2_k
Running  q4_k_m
Running  q6_k


In [None]:
# Running only q8_0
results = {}
r = []
t = []

for key in ['q8_0']:
    results[key] = {}
    params = {
        generators[key]['max_tokens']: max_tokens
    }
    if key=='q2_k' or key=='q4_k_m':
        params['stop'] = stop
    print('Running ',key)
    for prompt in prompts:
        result = time_execution(generators[key]['func'], prompt, params)
        r = r+[result['response']['choices'][0]['text']]
        t = t+[result['elapsed_time']]
    results[key]['response'] = r
    results[key]['elapsed_time'] = t

Running  q8_0


In [None]:
# GPU running slower than in CPUs
for key in results:
    print(key,': ',np.mean(results[key]['elapsed_time']))

q8_0 :  7.859624714851379


In [None]:
# GPU running slower than in CPUs
results

{'q2_k': {'response': [' 5 3 5 3 3 5 3 5 3 3 4 3 3 3 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3',
   ' 1 4 3 3 5 0 5 1 7 6 3 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
   ' 1 0 1 0 1 1 2 3 3 3 3 3 6 5 6 7 7 6 6 6 7 7 7 7 7 6 7 7 7 6 6 6 6 7 7 6 7 7 6 7 7 6 6 7 7 7 7 7 7 7',
   ' 3 2 3 1 3 1 3 4 5 4 5 3 1 1 1 1 3 3 1 1 3 1 1 5 1 1 1 3 1 1 3 1 1 1 1 1 3 1 1 1 3 3 1 3 1 1 1 3 1 3',
   ' 4 3 4 6 4 4 1 3 2 2 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4',
   ' 3 6 3 3 3 3 4 4 3 5 3 5 3 3 6 3 4 4 4 3 4 5 2 4 5 4 4 4 4 5 2 4 5 3 5 5 6 5 4 5 5 5 4 5 4 4 5 5 3 5',
   ' 3 1 5 2 3 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
   ' 4 4 3 4 4 5 5 6 4 5 6 6 5 5 6 6 5 6 6 6 6 6 6 6 5 6 5 6 6 5 6 6 6 6 6 6 6 6 6 5 6 7 5 7 6 7 6 7 5 6',
   ' 7 3 5 2 3 5 3 2 6 3 4 3 3 3 2 6 3 3 6 2 3 3 2 2 5 3 3 3 3 3 3 2 6 3 3 3 3 3 2 3 2 4 3 3 3 4 3 4 3 3',
   ' 3 3 3 4 4 5 

In [None]:
# GPU running slower than in CPUs
for key in results:
    print(key,': ',np.mean(results[key]['elapsed_time']))

q2_k :  5.874327306747436
q4_k_m :  6.052161073684692
q6_k :  6.56054488658905


In [None]:
# Results on CPU
print(results)

Results in CPU:
{'q2_k': {'response': [' 1 1 1 2 2', '', ' 3 8 5 0', ' 5 7 1', ' 4 ', ' 5 0 3 7 3 3 3 4 3 6 3 3 3 6 3 3 5 3 5 3 8 3 3 3 5 2 1 2 3 4 3 3 4 1 2 5 0 0 3 7 5 3 2 1 1 1 6 1 0 ', ' 7 7 7 7', ' ', ' 2 3 0 0 0 0 0 0', ' 1 ', ' 1', ' 5 5 ', ' 5 2 4 2 1 0 7 8', ' ', '', '', ' 0 1 6 4 4 0', ' 3 ', ' 5 6 7 8 9 10 11 12 13 14 15 16 17 18', ' 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0', '', ' 5 4', ' 4 3 4', ' ', ' 1 ', '', ' 8 7 5 5 1 1 0', ' 2 2 2 0 4 8 9 7 2 0 3 4 3 3 5 3', ' 0', ' 3 2 3 3', ' 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4', ' ', ' 6 8 7', ' 5 1 ', ' 1 1 0 9 7 6 5 5 4 3 3 3 3 3 4 3 3 2 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 ', '', ' 4 ', ' 2', ' 4', ' 0 0', ' 7', ' 0 0 0', ' 2 2 4 5 6 7 8 9 ', ' 1 2 10 8 1', ' 5 1 0 9 7 6', ' ', ' 1 4 1 2 0 2 0 4 9 4', ' 2 4 3 6 3 2 5 2 ', '', ' 4 5 6 0 0 0 2'], 'elapsed_time': [1.0324163436889648, 0.46370482444763184, 0.9405839443206787, 0.8096699714660645, 0.6914944648742676, 7.5140745639801025, 

# Data

In [8]:
import pandas as pd
import numpy as np

In [9]:
data_path = path+"sapheneia/github/sapheneia/data/"
btc  = pd.read_csv(data_path+"btc.csv")
wm2 = pd.read_csv(data_path+"wm2.csv")

In [10]:
# Define the horizon for computation of changes in each time-series
# Given the use case of BTC and WM2 and that WM2 is given in weeks, the number of periods for horizon will be in weeks
horizon = 12

# Define the lookback period for the computation of which quantile the value on a certain date is in respect of the lookback period
# Given the use case of BTC and WM2 and that WM2 is given in weeks, the number of periods for horizon will be in weeks
lookback = 52

In [11]:
# Convert 'Date' column in btc to datetime objects
btc['Date'] = pd.to_datetime(btc['Date'])

# Convert 'observation_date' column in wm2 to datetime objects
wm2['observation_date'] = pd.to_datetime(wm2['observation_date'])

# Merge the two dataframes on the date columns
data = pd.merge(btc, wm2, left_on='Date', right_on='observation_date', how='inner')
data = data[['Date','Close','WM2NS']]
data = data.rename(columns={'Date': 'date','Close': 'btc','WM2NS': 'wm2'})

# Display the first few rows of the merged dataframe
display(data.head())

Unnamed: 0,date,btc,wm2
0,2014-04-21,495.16,11355.4
1,2014-04-28,441.92,11231.2
2,2014-05-05,430.87,11285.3
3,2014-05-12,438.21,11286.0
4,2014-05-19,443.9,11278.4


In [12]:
# prompt: Using dataframe data: Compute a column for each btc and wm2 with the changes from one period to another. Change here is defined as log(x_2,x_1) where x_2 is the value of the column in a period ahead of the value x_1, where the number of periods between x_2 and x_1 is defined by a parameter horizon. Start with the first x_1 as the first period in the dataframe.

# Calculate the log change for 'btc'
data['btc_change'] = np.log(data['btc'] / data['btc'].shift(horizon))

# Calculate the log change for 'wm2'
data['wm2_change'] = np.log(data['wm2'] / data['wm2'].shift(horizon))

# prompt: Using dataframe data: Now given btc_change and wm2_change, first clean up rows NaN.

# Clean up rows with NaN values in 'btc_change' or 'wm2_change'
data.dropna(subset=['btc_change', 'wm2_change'], inplace=True)

# Sort by date to ensure calculations are in chronological order
data.sort_values(by='date', inplace=True)

# Display the first few rows of the merged dataframe
display(data.head())

Unnamed: 0,date,btc,wm2,btc_change,wm2_change
12,2014-07-14,618.32,11423.3,0.222125,0.005962
13,2014-07-21,625.13,11393.9,0.346831,0.014383
14,2014-07-28,587.93,11364.3,0.310801,0.006976
15,2014-08-04,592.77,11461.6,0.302108,0.015439
16,2014-08-11,575.89,11431.4,0.260317,0.013475


In [13]:
# prompt: Using dataframe data: Now given btc_change and wm2_change, first clean up rows NaN. Then for every date in the dataframe compute what the quantile in each rwo is btc_change and wm2_change in relation to values in a look back number of periods given by the parameter 'lookback'. Please the results on two other btc_quantile and wm2_quantile. Start the computation at a row which is 'lookback' number of periods from the start date in the dataframe, with all values of btc_quantile and wm2_quantile before that marked as NaN.

# Initialize quantile columns with NaN
data['btc_quantile'] = np.nan
data['wm2_quantile'] = np.nan

# Compute quantiles for each row from the lookback period onwards
for i in range(lookback, len(data)):
    # Get the lookback window for btc_change and wm2_change
    lookback_window_btc = data['btc_change'].iloc[i-lookback:i]
    lookback_window_wm2 = data['wm2_change'].iloc[i-lookback:i]

    # Calculate the quantile of the current value within the lookback window
    data.loc[data.index[i], 'btc_quantile'] = (lookback_window_btc < data['btc_change'].iloc[i]).sum() / lookback
    data.loc[data.index[i], 'wm2_quantile'] = (lookback_window_wm2 < data['wm2_change'].iloc[i]).sum() / lookback

# Clean up rows with NaN values in 'btc_change' or 'wm2_change'
data.dropna(subset=['btc_quantile', 'wm2_quantile'], inplace=True)

# Display the first few rows of the merged dataframe
display(data)

Unnamed: 0,date,btc,wm2,btc_change,wm2_change,btc_quantile,wm2_quantile
64,2015-07-13,290.350,12045.8,0.258370,0.000166,0.923077,0.115385
65,2015-07-20,277.680,12010.7,0.192922,0.012416,0.903846,0.461538
66,2015-07-27,293.010,11960.7,0.203869,0.004449,0.923077,0.192308
67,2015-08-03,281.580,12045.2,0.152264,0.010918,0.884615,0.384615
68,2015-08-10,263.300,12073.7,0.123802,0.012091,0.884615,0.480769
...,...,...,...,...,...,...,...
572,2025-04-07,79143.063,22081.1,-0.177530,0.019075,0.057692,0.923077
573,2025-04-14,84575.750,22113.7,-0.188782,0.024194,0.038462,1.000000
574,2025-04-21,87515.570,21935.8,-0.153800,0.020691,0.134615,0.942308
575,2025-04-28,95035.063,21779.0,-0.065333,0.012029,0.307692,0.634615


In [14]:
# prompt: Now create two other columns btc_bins and wm2_bins which have strings indicating the bins in which btc_quantile and wm2_quantile fall in. These are bins of quantiles defined with the strings ["q_10", "q_25", "q_50", "q_75", "q_90", "q_100"] where, for instance, "q_10" represents a quantile that falls within 0% and 10% quantile, the "q_25" bin will have quantiles above 10% and lower or equal to 25%, and so on.

import pandas as pd
# Define the bin edges based on quantiles
bins = [0, 0.10, 0.25, 0.50, 0.75, 0.90, 1.00]
# Define the bin labels
labels = ["q_10", "q_25", "q_50", "q_75", "q_90", "q_100"]

# Create the 'btc_bins' column
data['btc_bins'] = pd.cut(data['btc_quantile'], bins=bins, labels=labels, include_lowest=True, right=True)

# Create the 'wm2_bins' column
data['wm2_bins'] = pd.cut(data['wm2_quantile'], bins=bins, labels=labels, include_lowest=True, right=True)

# Display the first few rows of the merged dataframe
display(data)

Unnamed: 0,date,btc,wm2,btc_change,wm2_change,btc_quantile,wm2_quantile,btc_bins,wm2_bins
64,2015-07-13,290.350,12045.8,0.258370,0.000166,0.923077,0.115385,q_100,q_25
65,2015-07-20,277.680,12010.7,0.192922,0.012416,0.903846,0.461538,q_100,q_50
66,2015-07-27,293.010,11960.7,0.203869,0.004449,0.923077,0.192308,q_100,q_25
67,2015-08-03,281.580,12045.2,0.152264,0.010918,0.884615,0.384615,q_90,q_50
68,2015-08-10,263.300,12073.7,0.123802,0.012091,0.884615,0.480769,q_90,q_50
...,...,...,...,...,...,...,...,...,...
572,2025-04-07,79143.063,22081.1,-0.177530,0.019075,0.057692,0.923077,q_10,q_100
573,2025-04-14,84575.750,22113.7,-0.188782,0.024194,0.038462,1.000000,q_10,q_100
574,2025-04-21,87515.570,21935.8,-0.153800,0.020691,0.134615,0.942308,q_25,q_100
575,2025-04-28,95035.063,21779.0,-0.065333,0.012029,0.307692,0.634615,q_50,q_75


# Logits

In [46]:
model = generator_q4_k_m

In [51]:
vocab_size = model.n_vocab()
vocab_size

32000

In [52]:
last_six_ids = range(vocab_size - 6, vocab_size)
last_six_ids

range(31994, 32000)

In [53]:
candidate_tokens = []

In [57]:
for token_id in last_six_ids:
        # Detokenize returns bytes, so we must decode it.
        # Using errors='ignore' handles potential malformed UTF-8 characters.
        token_str = model.detokenize([token_id]).decode('utf-8', errors='ignore')

        print(f"ID: {token_id}  ->  Token: '{token_str}'")
        candidate_tokens.append(token_str)

print("\nCandidate tokens list:")
print(candidate_tokens)

ID: 31994  ->  Token: '还'
ID: 31995  ->  Token: '黃'
ID: 31996  ->  Token: '왕'
ID: 31997  ->  Token: '收'
ID: 31998  ->  Token: '弘'
ID: 31999  ->  Token: '给'

Candidate tokens list:
['还', '黃', '왕', '收', '弘', '给']


In [58]:
import json

tokens_file = path+"sapheneia/github/sapheneia/data/tinyllama_hijacked_tokens.json"

try:
    with open(tokens_file, 'r') as json_file:
        bin_token_map = json.load(json_file)
    print(f"Dictionary loaded from {tokens_file}:")
    print(bin_token_map)
except FileNotFoundError:
    print(f"Error: The file {tokens_file} was not found.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {tokens_file}.")

Dictionary loaded from /content/drive/MyDrive/dev/sapheneia/github/sapheneia/data/tinyllama_hijacked_tokens.json:
{'q_10': '还', 'q_25': '黃', 'q_50': '왕', 'q_75': '收', 'q_90': '弘', 'q_100': '给'}


In [59]:
def map_bin_to_token(bin_sequence, bin_token_map):
    return [bin_token_map[label] for label in bin_sequence]

In [60]:
def create_lagged_sequence_comprehension(series_a, series_b, bin_token_map, lag):
    """Creates a lagged interleaved sequence using a list comprehension."""
    bin_sequence = [
        item
        for t in range(len(series_a))
        if t + lag < len(series_b)
        for item in (series_a[t], series_b[t + lag])
    ]
    return map_bin_to_token(bin_sequence=bin_sequence, bin_token_map=bin_token_map)
    #return bin_sequence

In [61]:
# Lag
lag = 12

In [62]:
token_sequence = create_lagged_sequence_comprehension(series_a=data['wm2_bins'].tolist(), series_b=data['btc_bins'].tolist(), bin_token_map=bin_token_map, lag=lag)

In [63]:
token_sequence[-10:]

['给', '还', '弘', '还', '왕', '黃', '왕', '왕', '黃', '왕']

In [47]:
token_ids = model.tokenize(" ".join(token_sequence).encode("utf-8"), add_bos=False)

In [65]:
final_token_ids = []
# We process one character at a time, avoiding any string joining issues.
for char in token_sequence:
    # This asks the model for the ID of '给', then for the ID of '还', etc.
    # It's a direct, unambiguous lookup for each character.
    final_token_ids.append(model.tokenize(char.encode("utf-8"), add_bos=False)[0])

In [66]:
final_token_ids

[29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,


In [69]:
token_ids = [model.tokenize(t.encode("utf-8"), add_bos=False)[0] for t in token_sequence]

In [70]:
token_ids

[29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,
 29871,


In [71]:
model.detokenize([31999])

b'\xe7\xbb\x99'

In [48]:
token_ids[-10:]

[29871, 31995, 29871, 31996, 29871, 31996, 29871, 31995, 29871, 31996]

In [23]:
valid_tokens = list(bin_token_map.values())

In [67]:
valid_ids = [model.tokenize(t.encode("utf-8"), add_bos=False)[0] for t in valid_tokens]

In [68]:
valid_tokens

['还', '黃', '왕', '收', '弘', '给']

In [29]:
valid_ids

[29871, 29871, 29871, 29871, 29871, 29871]

In [None]:
def calculate_perplexity(token_ids, model, valid_ids):
    """
    Calculates the perplexity of a sequence using a llama-cpp-python Llama object.
    """
    # Ensure the model is in a clean state before we begin
    model.reset()

    total_negative_log_likelihood = 0.0

    # We start predicting from the second token
    for i in range(1, len(token_ids)):
        # The context is all tokens up to the current one
        # The Llama object's eval method takes a list of integers
        context_ids = token_ids[:i]

        # The target is the ID of the current token
        target_id = token_ids[i]

        # 1. Evaluate the context to get the logits for the *next* token
        model.eval(context_ids)

        # 2. Retrieve the scores (logits) from the model object
        # It's a numpy array, so we convert it to a torch tensor
        logits = torch.from_numpy(model.scores).float().unsqueeze(0) # Shape -> [1, vocab_size]

        # 3. The rest of the logic is identical to before
        mask = torch.full_like(logits, -float('inf'))
        mask[:, valid_ids] = 0
        masked_logits = logits + mask

        log_probabilities = torch.log_softmax(masked_logits, dim=-1)

        target_log_prob = log_probabilities[0, target_id].item()

        total_negative_log_likelihood -= target_log_prob

    avg_nll = total_negative_log_likelihood / (len(token_ids) - 1)
    perplexity = math.exp(avg_nll)

    return perplexity

In [None]:
perplexity = calculate_perplexity(token_ids=token_ids, model=model, valid_ids=valid_ids)

In [None]:
print(f"Sequence: {' '.join(C_q_tokens_str)}")
print(f"Token IDs: {token_ids_for_perplexity}")
print(f"\nPerplexity for this sequence: {pp:.4f}")

In [None]:
import torch
import math
import numpy as np
from llama_cpp import Llama

# --- Setup ---
# This assumes you have the model file in your working directory or provide a full path.
# Set n_gpu_layers to a number > 0 to offload to GPU. Set to 0 for CPU only.
try:
    llm = Llama(
        model_path="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        n_ctx=2048, # Context window
        n_gpu_layers=32, # Example: offload 32 layers to GPU
        verbose=False # Set to True to see llama.cpp output
    )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure the model file is correctly placed.")
    llm = None

# Your mapping from the previous step
bin_to_token_map = {
    'q_10': '还', 'q_25': '黃', 'q_50': '왕',
    'q_75': '收', 'q_90': '弘', 'q_100': '给'
}
all_valid_bin_tokens = list(bin_to_token_map.values())

# --- Revised Perplexity Function ---

def calculate_perplexity_llamacpp(token_ids, model, all_valid_token_ids):
    """
    Calculates the perplexity of a sequence using a llama-cpp-python Llama object.
    """
    # Ensure the model is in a clean state before we begin
    model.reset()

    total_negative_log_likelihood = 0.0

    # We start predicting from the second token
    for i in range(1, len(token_ids)):
        # The context is all tokens up to the current one
        # The Llama object's eval method takes a list of integers
        context_ids = token_ids[:i]

        # The target is the ID of the current token
        target_id = token_ids[i]

        # 1. Evaluate the context to get the logits for the *next* token
        model.eval(context_ids)

        # 2. Retrieve the scores (logits) from the model object
        # It's a numpy array, so we convert it to a torch tensor
        logits = torch.from_numpy(model.scores).float().unsqueeze(0) # Shape -> [1, vocab_size]

        # 3. The rest of the logic is identical to before
        mask = torch.full_like(logits, -float('inf'))
        mask[:, all_valid_token_ids] = 0
        masked_logits = logits + mask

        log_probabilities = torch.log_softmax(masked_logits, dim=-1)

        target_log_prob = log_probabilities[0, target_id].item()

        total_negative_log_likelihood -= target_log_prob

    avg_nll = total_negative_log_likelihood / (len(token_ids) - 1)
    perplexity = math.exp(avg_nll)

    return perplexity

# --- Example Usage ---

if llm:
    # Your example interleaved sequence of human-readable labels
    C_q_labels = ['q_25', 'q_50', 'q_50', 'q_50', 'q_50', 'q_25', 'q_25', 'q_25', 'q_75', 'q_25', 'q_75', 'q_90', 'q_75', 'q_10']

    # 1. Convert your bin labels to the actual (ugly) token strings
    C_q_tokens_str = [bin_to_token_map[label] for label in C_q_labels]

    # 2. Use the Llama object's own tokenizer to get the final integer IDs
    # Note: We add a space between tokens to ensure they are tokenized individually.
    # The `add_bos_token=False` is important to not add a "beginning of sentence" token.
    token_ids_for_perplexity = llm.tokenize(" ".join(C_q_tokens_str).encode("utf-8"), add_bos=False)

    # We also need the integer IDs of our valid tokens for the mask
    valid_token_ids = [llm.tokenize(t.encode("utf-8"), add_bos=False)[0] for t in all_valid_bin_tokens]

    # 3. Calculate perplexity
    pp = calculate_perplexity_llamacpp(token_ids_for_perplexity, llm, valid_token_ids)

    print(f"Sequence: {' '.join(C_q_tokens_str)}")
    print(f"Token IDs: {token_ids_for_perplexity}")
    print(f"\nPerplexity for this sequence: {pp:.4f}")

In [None]:
def get_next_token_prob_quantized(token_sequence_str, next_token_str, valid_tokens, tokenizer, model):
    # Get token IDs for the input sequence
    input_ids = tokenizer.encode(token_sequence_str)

    # ctransformers gives logits directly
    logits = model.logits(input_ids) # This returns a numpy array
    next_token_logits = torch.from_numpy(logits) # Convert to a torch tensor

    # The rest of the masking logic is the same!
    mask = torch.full_like(next_token_logits, -float('inf'))
    valid_token_ids = tokenizer.convert_tokens_to_ids(valid_tokens)
    mask[:, valid_token_ids] = 0
    masked_logits = next_token_logits + mask

    probabilities = torch.softmax(masked_logits, dim=-1)
    target_token_id = tokenizer.convert_tokens_to_ids(next_token_str)
    target_probability = probabilities[0, target_token_id].item()

    return target_probability

In [None]:
# --- Example Usage ---
# Let's say our input sequence of bins is ['BIN_2', 'BIN_3']
# We first convert it to a sequence of hijacked tokens
input_bins = ['BIN_2', 'BIN_3']
input_hijacked = [bin_to_token_map[b] for b in input_bins]
input_sequence_str = " ".join(input_hijacked)

# And our target is 'BIN_4'
target_bin = 'BIN_4'
target_hijacked = bin_to_token_map[target_bin]

# Now we run the analysis
# prob = get_next_token_prob_quantized(
#     input_sequence_str,
#     target_hijacked,
#     all_valid_tokens,
#     tokenizer,
#     model
# )
# print(f"Probability of {target_hijacked}: {prob:.4f}")