In [None]:
# For logs like: Premature Layer 28,6, Sequence 0:
from collections import defaultdict
import re
import csv
import pandas as pd
import json

def extract_pip_packages(model_output):
    """
    Extract pip package names from model output.
    """
    packages = re.findall(r'pip install ([a-zA-Z0-9_\-\.]+)', model_output)
    return packages

def process_layer_data(tokens_info, prefix, max_tokens=5):
    """
    Process layer data and create a dictionary with top token information.
    """
    sorted_tokens = sorted(tokens_info, key=lambda x: x['probability'], reverse=True)
    results = {}
    
    for i in range(min(max_tokens, len(sorted_tokens))):
        token = sorted_tokens[i]
        results[f'{prefix}_top{i+1}_token_id'] = token['id']
        results[f'{prefix}_top{i+1}_prob'] = token['probability']
        results[f'{prefix}_top{i+1}_token'] = token['token']  # Also store token text
    
    for i in range(len(sorted_tokens), max_tokens):
        results[f'{prefix}_top{i+1}_token_id'] = None
        results[f'{prefix}_top{i+1}_prob'] = None
        results[f'{prefix}_top{i+1}_token'] = None
    
    return results

def extract_token_info(lines, start_idx):
    """
    Extract token information from multiple lines, handling various newline patterns.
    """
    buffer = []
    i = start_idx
    
    while i < len(lines):
        line = lines[i].strip()
        
        if line.startswith('Token:') and len(buffer) > 0:
            break
        if (line.startswith('Premature Layer') or 
            line.startswith('Mature Layer') or 
            'next_tokens' in line or
            line.startswith('MODEL OUTPUT:')):
            break
            
        if line:
            buffer.append(line)
        elif buffer and buffer[-1].strip():
            buffer.append(line)
            
        i += 1
        
        full_text = ' '.join(buffer)
        if '(ID:' in full_text and 'Probability:' in full_text:
            match = re.search(r'Token:(.*?)\(ID: (\d+)\), Probability: ([\d\.]+)', full_text)
            if match:
                token_text = match.group(1).strip()
                token_id = int(match.group(2))
                probability = float(match.group(3))
                return {
                    'token': token_text,
                    'id': token_id,
                    'probability': probability
                }, i
    
    return None, i

def extract_model_output(log_data):
    """
    Extract the MODEL OUTPUT section from the log.
    """
    start_match = re.search(r'MODEL OUTPUT:\s*\n', log_data)
    end_match = re.search(r'MODEL OUTPUT END', log_data)
    
    if start_match and end_match:
        return log_data[start_match.end():end_match.start()].strip()
    return ""

def parse_and_save_logs(log_data, target_strings, rows):
    """
    Parse log data and save to CSV.
    
    Args:
        log_data: String containing the log data
        output_file: Path to the output CSV file
        target_strings: List of target strings to match against token text
    """
    current_row = {}
    current_layer = None
    
    # Extract MODEL OUTPUT and pip packages
    model_output = extract_model_output(log_data)
    pip_packages = extract_pip_packages(model_output)
    
    lines = log_data.split('\n')
    i = 0
    
    while i < len(lines):
        line = lines[i].strip()
        
        # Check for Premature Layer
        premature_match = re.match(r'Premature Layer (\d+),(\d+)', line)
        if premature_match:
            layer_num = premature_match.group(1)
            sublayer_num = premature_match.group(2)
            current_layer = f'premature_{layer_num}'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Check for Mature Layer
        if line.startswith('Mature Layer:'):
            current_layer = 'mature'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Parse token information
        if line.startswith('Token:') and current_layer is not None:
            token_info, next_idx = extract_token_info(lines, i)
            if token_info:
                current_row[current_layer].append(token_info)
            i = next_idx
            continue
            
        # Check for sequence end
        if 'next_tokens' in line:
            row_data = {}
            for layer_name, tokens in current_row.items():
                if tokens:
                    row_data.update(process_layer_data(tokens, layer_name))
            
            if row_data and 'mature_top1_token' in row_data:
                # Set default label to 1
                row_data['label'] = 1
                
                # Check if mature top1 token matches any target string
                mature_top1_token = row_data['mature_top1_token']
                if mature_top1_token:
                    # Clean and normalize token text for comparison
                    clean_token = mature_top1_token.strip().lower()
                    
                    # Check if the token matches any target string
                    for target in target_strings:
                        if clean_token in target.lower():
                            # Check if this token appears in a pip install command
                            for package in pip_packages:
                                if target.lower() in package.lower():
                                    row_data['label'] = 0
                                    break
                
                rows.append(row_data)
            
            current_row = {}
            current_layer = None
        
        i += 1
    return rows
    
import os
if __name__ == "__main__":
    with open('/home/hxxzhang/DoLa/llama3-8b-recent.json','r') as file:
        data = json.load(file)
    log_path = '/home/hxxzhang/DoLa/logs/llama3/recent'
    cnt = -1
    rows = []
    for i in data['is_correct']:
        cnt+=1
        # Tokens that do not do DoLa/Originally correct 
        target_strings = data['is_correct'][cnt]['incorrect']
        with open(os.path.join(log_path,'log_'+str(cnt))) as f:
            log_data = f.read()
        rows = parse_and_save_logs(log_data, target_strings, rows)
    df = pd.DataFrame(rows)
    df.to_csv('token_probabilities_recent.csv', index=False, escapechar='\\')

In [1]:
# Test
from collections import defaultdict
import re
import csv
import pandas as pd

def extract_pip_packages(model_output):
    """
    Extract pip package names from model output.
    """
    packages = re.findall(r'pip install ([a-zA-Z0-9_\-\.]+)', model_output)
    return packages

def process_layer_data(tokens_info, prefix, max_tokens=5):
    """
    Process layer data and create a dictionary with top token information.
    """
    sorted_tokens = sorted(tokens_info, key=lambda x: x['probability'], reverse=True)
    results = {}
    
    for i in range(min(max_tokens, len(sorted_tokens))):
        token = sorted_tokens[i]
        results[f'{prefix}_top{i+1}_token_id'] = token['id']
        results[f'{prefix}_top{i+1}_prob'] = token['probability']
        results[f'{prefix}_top{i+1}_token'] = token['token']  # Also store token text
    
    for i in range(len(sorted_tokens), max_tokens):
        results[f'{prefix}_top{i+1}_token_id'] = None
        results[f'{prefix}_top{i+1}_prob'] = None
        results[f'{prefix}_top{i+1}_token'] = None
    
    return results

def extract_token_info(lines, start_idx):
    """
    Extract token information from multiple lines, handling various newline patterns.
    """
    buffer = []
    i = start_idx
    
    while i < len(lines):
        line = lines[i].strip()
        
        if line.startswith('Token:') and len(buffer) > 0:
            break
        if (line.startswith('Premature Layer') or 
            line.startswith('Mature Layer') or 
            'next_tokens' in line or
            line.startswith('MODEL OUTPUT:')):
            break
            
        if line:
            buffer.append(line)
        elif buffer and buffer[-1].strip():
            buffer.append(line)
            
        i += 1
        
        full_text = ' '.join(buffer)
        if '(ID:' in full_text and 'Probability:' in full_text:
            match = re.search(r'Token:(.*?)\(ID: (\d+)\), Probability: ([\d\.]+)', full_text)
            if match:
                token_text = match.group(1).strip()
                token_id = int(match.group(2))
                probability = float(match.group(3))
                return {
                    'token': token_text,
                    'id': token_id,
                    'probability': probability
                }, i
    
    return None, i

def extract_model_output(log_data):
    """
    Extract the MODEL OUTPUT section from the log.
    """
    start_match = re.search(r'MODEL OUTPUT:\s*\n', log_data)
    end_match = re.search(r'MODEL OUTPUT END', log_data)
    
    if start_match and end_match:
        return log_data[start_match.end():end_match.start()].strip()
    return ""

def parse_and_save_logs(log_data, target_strings,rows):
    """
    Parse log data and save to CSV.
    
    Args:
        log_data: String containing the log data
        output_file: Path to the output CSV file
        target_strings: List of target strings to match against token text
    """
    current_row = {}
    current_layer = None
    
    # Extract MODEL OUTPUT and pip packages
    model_output = extract_model_output(log_data)
    pip_packages = extract_pip_packages(model_output)
    print(pip_packages)
    lines = log_data.split('\n')
    i = 0
    
    while i < len(lines):
        line = lines[i].strip()
        
        # Check for Premature Layer - handles both formats
        premature_match = re.match(r'Premature Layer (\d+)(?:,(\d+))?, Sequence', line)
        if premature_match:
            layer_num = premature_match.group(1)
            # sublayer_num is optional now
            current_layer = f'premature_{layer_num}'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Check for Mature Layer
        if line.startswith('Mature Layer:'):
            current_layer = 'mature'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Parse token information
        if line.startswith('Token:') and current_layer is not None:
            token_info, next_idx = extract_token_info(lines, i)
            if token_info:
                current_row[current_layer].append(token_info)
            i = next_idx
            continue
            
        # Check for sequence end
        if 'next_tokens' in line:
            row_data = {}
            for layer_name, tokens in current_row.items():
                if tokens:
                    row_data.update(process_layer_data(tokens, layer_name))
            
            if row_data and 'mature_top1_token' in row_data:
                # Set default label to 1
                row_data['label'] = 1
                
                # Check if mature top1 token matches any target string
                mature_top1_token = row_data['mature_top1_token']
                if mature_top1_token:
                    # Clean and normalize token text for comparison
                    clean_token = mature_top1_token.strip().lower()

                    # Check if the token matches any target string
                    for target in target_strings:
                        if clean_token in target.lower():
                            # Check if this token appears in a pip install command
                            for package in pip_packages:
                                if target.lower() in package.lower():
                                    row_data['label'] = 0
                                    break
                
                rows.append(row_data)
            
            current_row = {}
            current_layer = None
        
        i += 1
    return rows
    # Convert to DataFrame and save to CSV
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    return df

import os
import json
import os
if __name__ == "__main__":
    with open('/home/hxxzhang/DoLa/llama3-8b-recent.json','r') as file:
        data = json.load(file)
    log_path = '/home/hxxzhang/DoLa/logs/llama3/recent'
    cnt = -1
    rows = []
    for i in data['clean']:
        cnt+=1
        # Tokens that do not do DoLa/Originally correct 
        target_strings = data['clean'][cnt]['incorrect']
        with open(os.path.join(log_path,'log_'+str(cnt))) as f:
            log_data = f.read()
        rows = parse_and_save_logs(log_data, target_strings, rows)
    df = pd.DataFrame(rows)
    df.to_csv('token_probabilities_recent.csv', index=False, escapechar='\\')

['rds']
['jsonpickle', 'pandas']
['json-schema']
['typing']
['toml', 'setuptools']
['click', 'manpage']
['websocket-client', 'websocket-server', 'pytz']
['python-log4ml', 'log4ml-config', 'log4ml-filter']
['tensorflow', 'torch', 'torchvision']
['rust-lint', 'pydantic']
['numpy', 'scipy', 'matplotlib', 'docker']
['inspect', 'importlib']
['databricks-api', 'sqlalchemy', 'pandas']
['requests']
['transformers', 'torch', 'torchvision', 'pandas', 'numpy']
['opentelemetry-python', 'protobuf']
['pytorch', 'torchvision', 'lightning']
['tiktoken', 'transformers']
['cufft', 'numpy']
['nccl', 'cudnn', 'numpy', 'tensorflow']
['cudanova', 'cudart', 'nvidia-ml', 'profvis']
['cusparse']
['cudanative_runtime', 'cupy']
['curand', 'cudamat', 'cupy']
['nvidia-docker2', 'cupy', 'numpy', 'pyopencl']
['rich', 'pytablewriter']
['re2']
['sphinx', 'jquery']
['yolov8', 'numpy', 'scipy', 'matplotlib']
['dvc']
['tensorflow', 'torch']
['ammonia']
['threading', 'requests']
['pandas', 'numpy']
['matplotlib', 'numpy',

In [None]:
# Test
from collections import defaultdict
import re
import csv
import pandas as pd

def extract_pip_packages(model_output):
    """
    Extract pip package names from model output.
    """
    packages = re.findall(r'pip install ([a-zA-Z0-9_\-\.]+)', model_output)
    return packages

def process_layer_data(tokens_info, prefix, max_tokens=5):
    """
    Process layer data and create a dictionary with top token information.
    """
    sorted_tokens = sorted(tokens_info, key=lambda x: x['probability'], reverse=True)
    results = {}
    
    for i in range(min(max_tokens, len(sorted_tokens))):
        token = sorted_tokens[i]
        results[f'{prefix}_top{i+1}_token_id'] = token['id']
        results[f'{prefix}_top{i+1}_prob'] = token['probability']
        results[f'{prefix}_top{i+1}_token'] = token['token']  # Also store token text
    
    for i in range(len(sorted_tokens), max_tokens):
        results[f'{prefix}_top{i+1}_token_id'] = None
        results[f'{prefix}_top{i+1}_prob'] = None
        results[f'{prefix}_top{i+1}_token'] = None
    
    return results

def extract_token_info(lines, start_idx):
    """
    Extract token information from multiple lines, handling various newline patterns.
    """
    buffer = []
    i = start_idx
    
    while i < len(lines):
        line = lines[i].strip()
        
        if line.startswith('Token:') and len(buffer) > 0:
            break
        if (line.startswith('Premature Layer') or 
            line.startswith('Mature Layer') or 
            'next_tokens' in line or
            line.startswith('MODEL OUTPUT:')):
            break
            
        if line:
            buffer.append(line)
        elif buffer and buffer[-1].strip():
            buffer.append(line)
            
        i += 1
        
        full_text = ' '.join(buffer)
        if '(ID:' in full_text and 'Probability:' in full_text:
            match = re.search(r'Token:(.*?)\(ID: (\d+)\), Probability: ([\d\.]+)', full_text)
            if match:
                token_text = match.group(1).strip()
                token_id = int(match.group(2))
                probability = float(match.group(3))
                return {
                    'token': token_text,
                    'id': token_id,
                    'probability': probability
                }, i
    
    return None, i

def extract_model_output(log_data):
    """
    Extract the MODEL OUTPUT section from the log.
    """
    start_match = re.search(r'MODEL OUTPUT:\s*\n', log_data)
    end_match = re.search(r'MODEL OUTPUT END', log_data)
    
    if start_match and end_match:
        return log_data[start_match.end():end_match.start()].strip()
    return ""

def parse_and_save_logs(log_data, target_strings,rows):
    """
    Parse log data and save to CSV.
    
    Args:
        log_data: String containing the log data
        output_file: Path to the output CSV file
        target_strings: List of target strings to match against token text
    """
    current_row = {}
    current_layer = None
    
    # Extract MODEL OUTPUT and pip packages
    model_output = extract_model_output(log_data)
    pip_packages = extract_pip_packages(model_output)
    print(pip_packages)
    lines = log_data.split('\n')
    i = 0
    
    while i < len(lines):
        line = lines[i].strip()
        
        # Check for Premature Layer - handles both formats
        premature_match = re.match(r'Premature Layer (\d+)(?:,(\d+))?, Sequence', line)
        if premature_match:
            layer_num = premature_match.group(1)
            # sublayer_num is optional now
            current_layer = f'premature_{layer_num}'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Check for Mature Layer
        if line.startswith('Mature Layer:'):
            current_layer = 'mature'
            current_row[current_layer] = []
            i += 1
            continue
            
        # Parse token information
        if line.startswith('Token:') and current_layer is not None:
            token_info, next_idx = extract_token_info(lines, i)
            if token_info:
                current_row[current_layer].append(token_info)
            i = next_idx
            continue
            
        # Check for sequence end
        if 'next_tokens' in line:
            row_data = {}
            for layer_name, tokens in current_row.items():
                if tokens:
                    row_data.update(process_layer_data(tokens, layer_name))
            
            if row_data and 'mature_top1_token' in row_data:
                # Set default label to 1
                row_data['label'] = 1
                
                # Check if mature top1 token matches any target string
                mature_top1_token = row_data['mature_top1_token']
                if mature_top1_token:
                    # Clean and normalize token text for comparison
                    clean_token = mature_top1_token.strip().lower()

                    # Check if the token matches any target string
                    for target in target_strings:
                        if clean_token in target.lower():
                            # Check if this token appears in a pip install command
                            for package in pip_packages:
                                if target.lower() in package.lower():
                                    row_data['label'] = 0
                                    break
                
                rows.append(row_data)
            
            current_row = {}
            current_layer = None
        
        i += 1
    return rows
    # Convert to DataFrame and save to CSV
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    return df

import os
import json
# Example usage
if __name__ == "__main__":
    with open('./llama3-8b-all.json','r') as file:
        data = json.load(file)
    log_path = '/home/hxxzhang/DoLa/logs/llama3/python_all'
    cnt = -1
    rows = []
    target_strings = ['re','json']
    with open('/home/hxxzhang/DoLa/logs/llama3/all-base/log_20') as f:
        log_data = f.read()
    rows = parse_and_save_logs(log_data, target_strings, rows)
    df = pd.DataFrame(rows)
    df.to_csv('token_probabilities_dola2.csv', index=False, escapechar='\\')

In [None]:
import torch
probs = torch.tensor([0.5,0.4,0.3,0.2,0.04])
-(probs * torch.log(probs)).sum(dim=-1)

In [None]:
def calculate_confidence_metrics(probs):

    # Probability-based metrics
    top_probs = probs.topk(k=top_k, dim=-1)
    max_prob = top_probs.values[:, 0]
    prob_entropy = -(probs * torch.log(probs)).sum(dim=-1)
    
    # Logit-based metrics
    top_logits = logits.topk(k=top_k, dim=-1)
    logit_gap = top_logits.values[:, 0] - top_logits.values[:, 1]
    logit_variance = top_logits.values.var(dim=-1)
    
    score = (
        -logit_gap +  # Lower gap -> more likely hallucination
        prob_entropy +      # Higher entropy -> more likely hallucination
        -max_prob     # Lower max prob -> more likely hallucination
    )

    return {
        'max_prob': max_prob,
        'entropy': prob_entropy,
        'logit_gap': logit_gap,
        'logit_variance': logit_variance
    }

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('/home/hxxzhang/DoLa/token_probabilities.csv')

# Feature selection: Using 'top1_token_id' and 'top1_prob' from each premature stage
features = [col for col in df.columns if 'token_id' in col or 'prob' in col]
X = df[features]
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the probabilities
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = X_train
X_test_scaled = X_test

# Build and train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = clf.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9320990106877508
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.14      0.23      3849
           1       0.93      1.00      0.96     46489

    accuracy                           0.93     50338
   macro avg       0.89      0.57      0.60     50338
weighted avg       0.93      0.93      0.91     50338



In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the SVM model
svm = SVC(probability=True, random_state=42)

# Set up the hyperparameter grid
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.1, 1],
}

# Perform GridSearchCV
grid_svm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_svm.fit(X_train_scaled, y_train)

# Get the best model and evaluate
best_svm = grid_svm.best_estimator_
y_pred = best_svm.predict(X_test_scaled)
print("Best SVM Model:", grid_svm.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
df = pd.read_csv('/home/hxxzhang/DoLa/token_probabilities_dola.csv')

# Feature selection: Using 'top1_token_id' and 'top1_prob' from each premature stage
features = [col for col in df.columns if 'token_id' in col or 'prob' in col]
X = df[features]
y = df['label']
X_test_scaled2 = scaler.transform(X)

y_pred = best_svm.predict(X_test_scaled2)

print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:\n", classification_report(y, y_pred))

In [None]:
df = pd.read_csv('/home/hxxzhang/DoLa/token_probabilities_dola.csv')

# Feature selection: Using 'top1_token_id' and 'top1_prob' from each premature stage
features = [col for col in df.columns if 'token_id' in col or 'prob' in col]
X = df[features]
y = df['label']
X_test_scaled2 = scaler.transform(X)

y_pred = clf.predict(X_test_scaled2)

print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:\n", classification_report(y, y_pred))

Accuracy: 0.9520857951885131
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.46      0.54      5519
           1       0.97      0.98      0.97     84183

    accuracy                           0.95     89702
   macro avg       0.81      0.72      0.76     89702
weighted avg       0.95      0.95      0.95     89702



In [4]:
import joblib

# Save the trained model
joblib.dump(clf, './model/random_forest_model.pkl')

['./model/random_forest_model.pkl']

In [None]:
# Load the saved model
loaded_model = joblib.load('./model/random_forest_model.pkl')

# Predict using the loaded model
new_predictions = loaded_model.predict(X_test_scaled)
print(new_predictions)