# Package versions used by this notebook, installed into the kernel's environment via %pip.
%pip install transformers==4.38.2
%pip install peft==0.10.0
%pip install sentencepiece==0.2.0
%pip install accelerate==0.28.0
# NOTE(review): bitsandbytes is the only package here without a version pin — consider pinning it.
%pip install -i https://pypi.org/simple/ bitsandbytes

# --force-reinstall makes pip reinstall these packages even when they are already present.
%pip install --force-reinstall chromadb==0.4.23 
%pip install --force-reinstall llama_index==0.10.12
%pip install --force-reinstall sentence_transformers==2.2.2

In [1]:
from typing import List
import json
import chromadb
from llama_index.core import ServiceContext, StorageContext
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from llama_index.llms.openai import OpenAI
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, GenerationConfig
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import ast
import torch
from tqdm.auto import tqdm
# Seed the random number generators through transformers.set_seed before any model work.
set_seed(42)

2024-07-04 12:51:34.245334: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-04 12:51:34.398196: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 12:51:34.398261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 12:51:34.418815: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-04 12:51:34.464096: I tensorflow/core/platform/cpu_feature_guar

Инициализируем модель для получения эмбеддингов

In [2]:
# Embedding function handed to the Chroma collection below.
# Use the GPU when one is visible and fall back to CPU otherwise, so the cell
# does not fail at model load on machines without CUDA.
embed_model = SentenceTransformerEmbeddingFunction(
    model_name="BAAI/bge-small-en-v1.5",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

Создаем коллекцию в векторном хранилище

In [3]:
# Default Chroma client for this session.
client = chromadb.Client()

# Fetch the "RAG" collection, creating it on first use; vectors come from
# embed_model and the HNSW index is configured for cosine space.
collection = client.get_or_create_collection(
    name="RAG",
    metadata={"hnsw:space": "cosine"},
    embedding_function=embed_model,
)

# Last expression of the cell: display the collections known to the client.
client.list_collections()

[Collection(name=RAG)]

Загружаем данные

In [4]:
# Alternative dataset: "triples_ft_pp.jsonl"
file_path = "data/movie_ground_truth.jsonl"  # more data -> more variability

# Read the JSONL file as UTF-8 instead of relying on the platform's default
# encoding, and drop blank lines so that every entry of `data` is a record
# that json.loads can parse.
with open(file_path, "r", encoding="utf-8") as f:
    data = [line for line in f if line.strip()]

Подготавливаем данные для загрузки в векторную БД

In [5]:
# Parse every JSONL line and keep its 'sent' field as the document text.
documents = [json.loads(raw_line)['sent'] for raw_line in data]

In [6]:
# One metadata dict per document; every document gets the same 'movie' subject.
metadata = [{'subject': 'movie'} for _ in documents]

In [7]:
# Sequential ids that continue after whatever the collection already holds.
# collection.count() is evaluated once (as the enumerate start value) instead of
# once per document, which avoids len(documents) round trips to the store.
ids = [f"id{n}" for n, _ in enumerate(documents, start=collection.count() + 1)]

In [8]:
# Store the documents in the collection together with their ids and metadata.
collection.add(ids=ids, documents=documents, metadatas=metadata)


In [9]:
# Model identifier passed to from_pretrained for both the tokenizer and the causal LM.
model_name = "openchat/openchat-3.5-0106"

In [10]:
# NOTE(review): this bare expression only displays the torch.mps module object;
# it looks like a leftover probe and can probably be removed.
torch.mps

<module 'torch.mps' from '/home/tiv/projects/ds/nlp-basics/.venv/lib/python3.11/site-packages/torch/mps/__init__.py'>

In [11]:
# Run on the GPU when CUDA is available, otherwise on the CPU. The detected
# device is used as-is; to force CPU (for example when GPU memory is too small
# for the model), assign torch_device = "cpu" here deliberately.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The EOS token id doubles as the padding token id for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    pad_token_id=tokenizer.eos_token_id,
).to(torch_device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# model.generation_config = GenerationConfig(
#        do_sample=True,
#        top_k=50,
#        top_p=0.95,
#        num_return_sequences=num_return_sequences, 
#        num_beams=num_beams,
#        temperature=0.5
#    )

Промпт для RAG

In [12]:
# Prompt template for RAG; the {context} and {query} placeholders are filled
# with str.format in get_model_output.
prompt = """Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer:"""

Функция для поиска контекста

In [13]:
def get_context(query, n_results=3):
    """Retrieve documents for ``query`` from the collection and join them into one string.

    Args:
        query: Query text handed to ``collection.query`` as ``query_texts``.
        n_results: Number of documents to request. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        The retrieved documents for the query, joined with newlines.
    """
    context_raw = collection.query(query_texts=query, n_results=n_results)
    # A single query text was sent, so its hits are the first entry of 'documents'.
    return "\n".join(context_raw['documents'][0])

Функция для генерации нескольких ответов модели

Закомментированные параметры - эксперимент

In [14]:
def _first_line(text):
    """Return the first line of ``text`` with surrounding whitespace stripped."""
    return text.strip().split("\n")[0].strip()


def get_model_output(context, query, max_new_tokens=50, num_return_sequences=3, num_beams=3):
    """Generate several candidate answers for ``query`` given ``context``.

    The module-level ``prompt`` template is filled with ``context`` and
    ``query``, tokenized, and passed to ``model.generate`` with sampling
    disabled (beam search).

    Args:
        context: Text substituted into the ``{context}`` slot of the prompt.
        query: Question substituted into the ``{query}`` slot of the prompt.
        max_new_tokens: Upper bound on generated tokens per candidate (default 50).
        num_return_sequences: Number of candidates to return (default 3); for
            beam search this must not exceed ``num_beams``.
        num_beams: Beam width (default 3).

    Returns:
        A list with one string per returned sequence: the first line of the
        generated continuation, stripped of surrounding whitespace.
    """
    model_inputs = tokenizer(
        prompt.format(context=context, query=query),
        return_tensors='pt',
    ).to(torch_device)
    # The generated sequences start with the prompt tokens; remember where the
    # prompt ends so only the continuation is decoded.
    prompt_length = model_inputs["input_ids"].shape[-1]

    # Sampling (do_sample=True with top_k=50, top_p=0.95, 6 beams / 6 sequences)
    # was tried as an experiment and is left disabled.
    sample_outputs = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_return_sequences=num_return_sequences,
        num_beams=num_beams,
        temperature=None,  # no temperature is applied when sampling is off
    )

    # Decode only the new tokens. Splitting the full decoded text on "Answer:"
    # picks the wrong segment whenever the context or query itself contains
    # that marker.
    return [
        _first_line(tokenizer.decode(output[prompt_length:], skip_special_tokens=True))
        for output in sample_outputs
    ]

In [15]:
def get_output_texts(query: str) -> List[str]:
    """Run retrieval and generation for ``query`` and return the candidate answers.

    The context is looked up with ``get_context`` and passed, together with the
    query, to ``get_model_output``.
    """
    return get_model_output(get_context(query), query)

Тест

In [16]:
# Smoke test: run retrieval + generation on one sample question and display the answers.
get_output_texts("What is a Spirited Away?")

['Spirited Away is a 2001 Japanese animated fantasy film written and directed by Hayao Miyazaki, animated by Studio Ghibli for Tokuma Shoten, Nippon Television Network, Dentsu,',
 'A Spirited Away is a 2001 Japanese animated fantasy film written and directed by Hayao Miyazaki, animated by Studio Ghibli for Tokuma Shoten, Nippon Television Network, Dentsu',
 'Spirited Away is a 2001 Japanese animated fantasy film written and directed by Hayao Miyazaki, animated by Studio Ghibli for Tokuma Shoten, Nippon Television Network, Dentsu.']

In [17]:
# Load the evaluation set. Later cells iterate over df.question and add the
# generated answers as new columns, so `df` must be defined here for the
# notebook to run top to bottom on a fresh kernel.
df = pd.read_csv('eval_dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'eval_dataset.csv'

In [None]:
#df = df.head(3)
#df

Получение выходов<br>
Тут сделан алгоритм, который запускает модель несколько раз, чтобы выбрать запуск с отличающимися ответами<br>
Но когда температура выключена, повтор не имеет смысла (закомментирован)<br>
Экспериментально, отключение семплинга позволяет beam search выдавать более разнообразные ответы<br>
Поэтому закомментировано как эксперимент

In [None]:
def f7(seq):
    """Return the items of ``seq`` as a list without duplicates, in first-seen order."""
    # dict keys are unique and keep insertion order, so building a dict from
    # the items drops repeats while preserving the original sequence.
    return list(dict.fromkeys(seq))

# How many answers are kept per question.
num_return_sequences = 3

# One list of answers per question, in the order of df.question.
data = []
for query in tqdm(df.question):
    # Without sampling: a single generation pass per question.
    candidates = get_output_texts(query)
    data.append(candidates[:num_return_sequences])

    # With sampling (disabled experiment): retry up to 5 times, keep the attempt
    # with the most distinct answers, de-duplicate it with f7 and pad it back
    # to num_return_sequences entries.
    # max_unique = 0
    # best_outputs = [''] * num_return_sequences
    # for _ in range(5):
    #     outputs = get_output_texts(query)
    #     unique_count = len(set(outputs))
    #     if unique_count > max_unique:
    #         max_unique = unique_count
    #         best_outputs = outputs
    #
    #     if unique_count >= num_return_sequences:
    #         break
    #
    #     if unique_count == 1 and max_unique == 1:
    #         break
    #
    # best_outputs = f7(best_outputs)[:num_return_sequences]
    # if len(best_outputs) != num_return_sequences:
    #     best_outputs += [best_outputs[-1]] * (num_return_sequences - len(best_outputs))
    #
    # data.append(best_outputs)

100%|██████████| 157/157 [10:58<00:00,  4.20s/it]


Сохранение

In [None]:
# Column names rag_0, rag_1, ... — one per kept answer.
columns = [f'rag_{n}' for n in range(num_return_sequences)]
# Transpose: `data` holds one list of answers per question, `data_t` holds one
# list per answer position.
data_t = [list(answers) for answers in zip(*data)]

# Attach each answer position to df as its own column.
for position, column_name in enumerate(columns):
    df[column_name] = data_t[position]

In [None]:
def uniq_count(r):
    """Return how many distinct values the 'rag_0', 'rag_1' and 'rag_2' fields of ``r`` hold."""
    return len({r['rag_0'], r['rag_1'], r['rag_2']})

# Row-wise (axis=1) count of distinct answers, computed with uniq_count.
df['uniq_count'] = df.apply(uniq_count, axis=1)
# Show how many rows have each number of distinct answers.
display(df['uniq_count'].value_counts())

# NOTE(review): this writes to 'eval_dataset.csv', the same file name that appears
# in the read_csv cell earlier in the notebook — confirm that overwriting the
# input file is intended.
df.to_csv('eval_dataset.csv', index=False)

uniq_count
2    63
3    51
1    43
Name: count, dtype: int64