In [34]:
import torch
import numpy as np
import datasets
import os
#import umap
import sys
import evaluate
from pathlib import Path
from itertools import product
from IPython.core.debugger import set_trace
from datasets import Dataset, DatasetDict
from torch import nn
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling
from nltk import sent_tokenize
from IPython.core.debugger import Pdb, set_trace
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

# requires llm_eval to be installed
from llm_eval.llm import select_chat_model
from llm_eval.cfg_reader import load

datasets.disable_caching()

cache_dir = '/data/john/cache'

# Set this to whatever you want
seed = 10

torch.manual_seed(seed)
np.random.seed(seed)

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Datasets, Standardize Column Names, and Aggregate

In [4]:
# Load both corpora, reduce them to a shared column schema, tag every row with
# its corpus name, and build the aggregated dataset stored under data['agg'].
allsides_dir = '/data/john/projects/llm_eval/data/all_sides/test.json'
ppp_dir = '/data/john/projects/llm_eval/data/privacy_policy/3p_data.csv'

data = DatasetDict({
    'allsides': Dataset.from_json(allsides_dir),
    'ppp': Dataset.from_csv(ppp_dir),
})

# Original column name -> standardized name, per corpus.
# d1/d2 are the two source documents, refN are the reference columns.
col_maps = {
    'allsides': {'Left': 'd1',
                 'Right': 'd2',
                 'Ahmed_Intersection': 'ref0',
                 'Naman_Intersection': 'ref1',
                 'Helen_Intersection': 'ref2',
                 'AllSides_Intersection': 'ref3'},
    'ppp': {'Company_1': 'd1',
            'Company_2': 'd2',
            'Annotator1': 'ref0',
            'Annotator2': 'ref1',
            'Annotator3': 'ref2'}
}

# Columns to keep per corpus. Derived from col_maps so the two can never drift
# apart (previously the same names were typed out a second time).
cols = {name: list(mapping) for name, mapping in col_maps.items()}

# Standardized names of the widest schema. Not used in this cell; kept because
# later cells may rely on it.
keep_cols = set(col_maps['allsides'].values())

# One pass per corpus: drop extraneous columns, standardize names, add the
# corpus name as a column. Reassigning existing keys while iterating is safe
# because the dict does not change size.
for data_key, data_val in data.items():
    # remove_columns is documented to take a str or a list, so build a list
    # (in dataset column order) rather than passing a set.
    extraneous = [col for col in data_val.column_names if col not in cols[data_key]]
    data[data_key] = (
        data_val
        .remove_columns(extraneous)
        .rename_columns(col_maps[data_key])
        .add_column('name', [data_key] * len(data_val))
    )

# concatenate datasets (concatenate_datasets expects a list, not a dict view)
n_rows_expected = sum(len(subset) for subset in data.values())
data['agg'] = datasets.concatenate_datasets(list(data.values()))
assert len(data['agg']) == n_rows_expected, 'aggregate lost or gained rows'

print(data)

DatasetDict({
    allsides: Dataset({
        features: ['d1', 'd2', 'ref0', 'ref1', 'ref2', 'ref3', 'name'],
        num_rows: 137
    })
    ppp: Dataset({
        features: ['d1', 'd2', 'ref0', 'ref1', 'ref2', 'name'],
        num_rows: 135
    })
    agg: Dataset({
        features: ['d1', 'd2', 'ref0', 'ref1', 'ref2', 'ref3', 'name'],
        num_rows: 272
    })
})


# Load vLLM Model

In [3]:
# which chat model to serve
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

# read the project config; load() returns a pair and only its first item is used here
cfg_path = '/data/john/projects/llm_eval/cfg/config.yaml'
cfg, _ = load(cfg_path)

# build the chat session for the chosen model from that config
session = select_chat_model(
    cfg,
    model_name,
)

INFO 03-21 23:39:19 config.py:433] Custom all-reduce kernels are temporarily disabled due to stability issues. We will re-enable them once the issues are resolved.


2024-03-21 23:39:21,683	INFO worker.py:1752 -- Started a local Ray instance.


INFO 03-21 23:39:22 llm_engine.py:87] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.2', tokenizer='mistralai/Mistral-7B-Instruct-v0.2', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir='/data/shared/llm_cache', load_format=auto, tensor_parallel_size=4, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=1711082359)
INFO 03-21 23:39:32 weight_utils.py:163] Using model weights format ['*.safetensors']
[36m(RayWorkerVllm pid=1624100)[0m INFO 03-21 23:39:32 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-21 23:39:37 llm_engine.py:357] # GPU blocks: 27732, # CPU blocks: 8192


In [12]:
# Work on the aggregated dataset from here on.
ds = data['agg']

# Smoke test: summarize the first d1 document and print it next to the source.
summ_sys = (
    'you are a document summarizer. when you are given a document '
    'your task is to condense the information into a shorter format while also retaining the crutial information'
)
prompt_template = 'Document:\n{d1}\n\nSummarize the above document'

source_doc = ds[0]['d1']

print('=============Source Doc=============')
print(source_doc)
print('====================================')
print('============Response============')
# The model is queried only after the banners above are printed, so any
# progress output from the session appears below the source document.
response = session.get_response(prompt_template.format(d1=source_doc), summ_sys)
print(response)
print('================================')

Manafort on his way to the FBI. They really don't make those sun visors big enough. The indictment against Paul Manafort and his former business associate Rick Gates has been unsealed and leading the list of charges is "Conspiracy Against the United States" and money laundering. "In order to hide Ukraine payments from United States authorities," the indictment reads, "from approximately 2006 through at least 2016, MANAFORT and GATES laundered the money through scores of United States and foreign corporations, partnerships, and bank accounts." Through at least 2016, "falsely and repeatedly reporting to their tax preparers and to the United States that they had no foreign bank accounts." That's just the beginning. Stay tuned!


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]

 in English:
The indictment against Paul Manafort and Rick Gates, which was unsealed, includes charges of conspiring against the United States and money laundering. According to the indictment, from approximately 2006 to 2016, Manafort and Gates hid payments from Ukraine by laundering money through numerous corporations, partnerships, and bank accounts in the US and abroad. They also falsely reported to their tax preparers and the US that they had no foreign bank accounts. Stay tuned for more information.





In [37]:
# Ask the chat model, for every pair of sentences in each (d1, d2) sample,
# whether the two sentences share information, and collect the raw responses.
# Note: `prompt_template` is rebound here, replacing the summarization template.
red_sys = 'you are a machine that takes 2 sentences and input. your output is 1 when the 2 sentence pairs share similar information and 0 when there is none.'
prompt_template = 'Sentence1: {s1}\nSentence2: {s2}\n\noutput 1 if there is similarity between the sentences and 0 if there is none. output format:\nResult: [result]\nReason: [explanation]'


def enumerate_sentence_pairs(d1_sents, d2_sents):
    """Return every unordered pair of sentences from the two documents combined.

    The sentences of d1 followed by those of d2 form one sequence; each pair
    (i, j) with i < j of that sequence is returned once, so pairs within a
    single document are included as well as cross-document pairs.

    Args:
        d1_sents: list of sentence strings from the first document.
        d2_sents: list of sentence strings from the second document.

    Returns:
        A list of ``(first, second)`` tuples. Each element is a
        ``(doc_id, sent_id, sentence)`` triple where ``doc_id`` is ``'d1'`` or
        ``'d2'`` and ``sent_id`` is the sentence index within its own document.
        The list is empty when fewer than two sentences are supplied in total.
    """
    tagged = [('d1', sent_id, sent) for sent_id, sent in enumerate(d1_sents)]
    tagged += [('d2', sent_id, sent) for sent_id, sent in enumerate(d2_sents)]
    return [(tagged[i], tagged[j])
            for i in range(len(tagged) - 1)
            for j in range(i + 1, len(tagged))]


records = []
for sample in tqdm(ds, total=len(ds), desc='collecting data'):
    pairs = enumerate_sentence_pairs(sent_tokenize(sample['d1']),
                                     sent_tokenize(sample['d2']))
    if not pairs:
        # Fewer than two sentences overall: nothing to compare, so do not send
        # an empty batch to the model.
        continue

    # One prompt per pair; the same system prompt is repeated for each of them.
    prompts = [prompt_template.format(s1=first[2], s2=second[2]) for first, second in pairs]
    responses = session.get_response(prompts, [red_sys]*len(prompts))

    records.extend(
        {'d1': sample['d1'],
         'd2': sample['d2'],
         's1': first[2],
         's2': second[2],
         's1_doc_id': first[0],
         's2_doc_id': second[0],
         's1_sent_id': first[1],
         's2_sent_id': second[1],
         'response': response}
        for (first, second), response in zip(pairs, responses)
    )

# `out_data` is bound once, directly to the Dataset, rather than first holding a list.
out_data = Dataset.from_list(records)

collecting data:   0%|          | 0/272 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 171/171 [00:07<00:00, 21.48it/s]
Processed prompts: 100%|██████████| 190/190 [00:24<00:00,  7.62it/s]
Processed prompts: 100%|██████████| 190/190 [00:06<00:00, 27.86it/s]
Processed prompts: 100%|██████████| 300/300 [00:09<00:00, 32.28it/s]
Processed prompts: 100%|██████████| 136/136 [00:06<00:00, 22.56it/s]
Processed prompts: 100%|██████████| 120/120 [00:06<00:00, 18.77it/s]
Processed prompts: 100%|██████████| 325/325 [00:10<00:00, 30.46it/s]
Processed prompts: 100%|██████████| 136/136 [00:05<00:00, 23.73it/s]
Processed prompts: 100%|██████████| 231/231 [00:25<00:00,  9.00it/s]
Processed prompts: 100%|██████████| 231/231 [00:25<00:00,  9.02it/s]
Processed prompts: 100%|██████████| 231/231 [00:08<00:00, 28.08it/s]
Processed prompts: 100%|██████████| 253/253 [00:08<00:00, 29.35it/s] 
Processed prompts: 100%|██████████| 378/378 [00:11<00:00, 33.00it/s]
Processed prompts: 100%|██████████| 190/190 [00:09<00:00, 20.89it/s]
Processed prompts: 100%|█████████

In [41]:
# Show the feature names and row count of the collected sentence-pair dataset.
print(out_data)

Dataset({
    features: ['d1', 'd2', 's1', 's2', 's1_doc_id', 's2_doc_id', 's1_sent_id', 's2_sent_id', 'response'],
    num_rows: 111934
})
