In [1]:
#install required packages
!pip install -q transformers peft  accelerate bitsandbytes safetensors sentencepiece streamlit chromadb langchain sentence-transformers sacremoses pypdf

# Fixing unicode error in Google Colab: force the reported preferred encoding to UTF-8.
# The replacement keeps the optional `do_setlocale` argument of the real
# `locale.getpreferredencoding`, so callers that pass it (for example
# `locale.getpreferredencoding(False)`) keep working instead of raising TypeError.
import locale
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.1/816.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, MarianMTModel, MarianTokenizer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFDirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ipywidgets import widgets
from IPython.display import display
from google.colab import drive

In [3]:
class Chatbot:
    """Widget-driven question-answering chatbot backed by a retrieval chain.

    Each instance loads its own MarianMT translation model/tokenizer, keeps its
    own chat history, and renders a small ipywidgets UI (text box, "Use Spanish"
    checkbox, send button and two output areas).
    """

    def __init__(self, translation_model_name, source_language_code, target_language_code, model, tokenizer, retriever, llm, chat_history):
        """Store the collaborators and load the translation model.

        Args:
            translation_model_name: Checkpoint name passed to
                ``MarianMTModel.from_pretrained`` / ``MarianTokenizer.from_pretrained``.
            source_language_code: Stored as-is; not read by any method of this class.
            target_language_code: Stored as-is; not read by any method of this class.
            model: Stored as ``self.model``; not read by any method of this class.
            tokenizer: Stored as ``self.tokenizer``; not read by any method of this class.
            retriever: Retriever handed to ``ConversationalRetrievalChain.from_llm``.
            llm: LLM handed to ``ConversationalRetrievalChain.from_llm``.
            chat_history: List that ``(query, answer)`` tuples are appended to.
        """
        self.translation_model = MarianMTModel.from_pretrained(translation_model_name)
        self.translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
        self.source_language_code = source_language_code
        self.target_language_code = target_language_code

        self.model = model
        self.tokenizer = tokenizer
        self.retriever = retriever
        self.llm = llm
        self.chat_history = chat_history

    def translate(self, text):
        """Translate ``text`` with the loaded MarianMT model.

        The input is truncated by the tokenizer (``truncation=True``) and only the
        first decoded sequence is returned.

        NOTE(review): the same checkpoint is used for the query and for the
        response, although those need opposite translation directions — a
        MarianMT checkpoint presumably covers one direction only; confirm and,
        if so, load a second model for the reverse direction.
        """
        inputs = self.translation_tokenizer(text, return_tensors="pt", truncation=True)
        translation = self.translation_model.generate(**inputs)
        translated_text = self.translation_tokenizer.batch_decode(translation, skip_special_tokens=True)[0]
        return translated_text

    def create_conversation(self, query):
        """Answer ``query`` through a ConversationalRetrievalChain and record the turn.

        Returns:
            A ``('', chat_history)`` tuple. The first element is always the empty
            string; the answer itself is the second item of the last history entry.

        Any exception raised while building or running the chain is caught and
        its message is recorded as the bot's answer, so the UI keeps working.
        """
        try:
            # NOTE(review): the memory object is created anew on every call, so it
            # starts empty each time — verify whether earlier turns actually reach
            # the chain, or whether the chain/memory should be built once and reused.
            memory = ConversationBufferMemory(
                memory_key='chat_history',
                return_messages=False
            )
            qa_chain = ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.retriever,
                memory=memory,
                get_chat_history=lambda h: h,
            )

            result = qa_chain({'question': query, 'chat_history': self.chat_history})
            self.chat_history.append((query, result['answer']))
            return '', self.chat_history

        except Exception as e:
            # Store the error as text (not the exception object) so every history
            # entry is a pair of strings that can be printed or translated.
            self.chat_history.append((query, str(e) or repr(e)))
            return '', self.chat_history

    def on_submit_button_click(self, b):
        """Handle a click on the send button.

        Reads the query and the "Use Spanish" flag from the widgets, optionally
        translates the query, runs the conversation, optionally translates the
        answer, then refreshes the response area and the chat-history area.

        Args:
            b: The button instance passed by ipywidgets; unused.
        """
        query = self.query_input.value
        # Nothing to ask: ignore empty / whitespace-only submissions.
        if not query or not query.strip():
            return

        use_spanish = self.spanish_checkbox.value

        if use_spanish:
            translated_query = self.translate(query)
        else:
            translated_query = query

        # create_conversation always returns '' as its first element, so the
        # answer has to be read from the most recent history entry.
        _, self.chat_history = self.create_conversation(translated_query)
        response = str(self.chat_history[-1][1]) if self.chat_history else ''

        if use_spanish and response:
            translated_response = self.translate(response)
        else:
            translated_response = response

        # Show only the latest response; the full dialogue is rendered below.
        self.response_output.clear_output(wait=True)
        with self.response_output:
            print(translated_response)

        self.chat_history_output.clear_output(wait=True)
        with self.chat_history_output:
            for entry in self.chat_history:
                print(f"User: {entry[0]}")
                print(f"Bot: {entry[1]}")

    def display_widgets(self):
        """Create the input/output widgets, wire the send button and display them."""
        self.query_input = widgets.Text(description="Your message:")
        self.response_output = widgets.Output()
        self.chat_history_output = widgets.Output()

        self.spanish_checkbox = widgets.Checkbox(value=False, description="Use Spanish")

        self.submit_button = widgets.Button(description="Send")
        self.submit_button.on_click(self.on_submit_button_click)

        display(self.query_input, self.spanish_checkbox, self.submit_button, self.response_output, self.chat_history_output)


In [4]:
class ChatbotTraining:
    """Builds every heavy component the chatbots share.

    Construction loads the translation model, the 4-bit quantized causal LM and
    its tokenizer, mounts Google Drive, loads and splits the PDF documents,
    embeds them into a Chroma vector store, and wraps a text-generation
    pipeline as a LangChain LLM.
    """

    def __init__(self, translation_model_name, source_language_code, target_language_code, model_name, folder_path, embedding_model_name):
        """Load models and documents and build the retriever and LLM.

        Args:
            translation_model_name: Checkpoint name for the MarianMT model/tokenizer.
            source_language_code: Stored as-is; not read elsewhere in this class.
            target_language_code: Stored as-is; not read elsewhere in this class.
            model_name: Checkpoint name for the causal LM and its tokenizer.
            folder_path: Directory handed to ``PyPDFDirectoryLoader``.
            embedding_model_name: Model name handed to ``HuggingFaceEmbeddings``.
        """
        self.translation_model = MarianMTModel.from_pretrained(translation_model_name)
        self.translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
        self.source_language_code = source_language_code
        self.target_language_code = target_language_code

        self.model = self.load_quantized_model(model_name)
        self.tokenizer = self.initialize_tokenizer(model_name)

        # The documents are read from Google Drive, so it must be mounted first.
        drive.mount('/content/drive')
        self.folder_path = folder_path
        self.documents = self.load_documents()

        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        self.all_splits = self.text_splitter.split_documents(self.documents)

        self.embedding_model_name = embedding_model_name
        self.model_kwargs = {"device": "cuda"}
        self.embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model_name, model_kwargs=self.model_kwargs)

        self.vectordb = Chroma.from_documents(documents=self.all_splits, embedding=self.embeddings, persist_directory="chroma_db")
        self.retriever = self.vectordb.as_retriever()

        # NOTE(review): `self.model` is an already-loaded model object, so
        # `device_map="auto"` presumably has no effect here — confirm.
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_length=2048,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            # The EOS token id doubles as the padding token id.
            pad_token_id=self.tokenizer.eos_token_id,
        )

        self.llm = HuggingFacePipeline(pipeline=self.pipeline)

    def load_quantized_model(self, model_name):
        """Load ``model_name`` as a 4-bit (NF4, double-quantized) causal LM.

        All quantization settings, including ``load_in_4bit``, live in the
        ``BitsAndBytesConfig``. ``load_in_4bit`` is deliberately not repeated as
        a ``from_pretrained`` keyword: it is redundant next to
        ``quantization_config`` and supplying both is rejected by recent
        transformers releases.

        Returns:
            The model returned by ``AutoModelForCausalLM.from_pretrained``.
        """
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            quantization_config=bnb_config
        )
        return model

    def initialize_tokenizer(self, model_name):
        """Load the tokenizer for ``model_name`` and force its BOS token id to 1."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE(review): hard-coded id; presumably matches this model family's
        # vocabulary — confirm before using a different checkpoint.
        tokenizer.bos_token_id = 1
        return tokenizer

    def load_documents(self):
        """Load every PDF found under ``self.folder_path`` as LangChain documents."""
        loader = PyPDFDirectoryLoader(self.folder_path)
        return loader.load()

In [5]:
# Usage example: configuration consumed by the cells below.

# Generation and embedding checkpoints.
model_name = 'anakin87/zephyr-7b-alpha-sharded'
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'

# Google Drive folder that is scanned for PDF documents.
folder_path = "/content/drive/MyDrive/chatbot_data/"

# Translation settings.
# NOTE(review): the checkpoint name reads "en-es" while the codes below say
# source "es" / target "en" — confirm which direction is intended.
translation_model_name = 'Helsinki-NLP/opus-mt-en-es'
source_language_code, target_language_code = 'es', 'en'

In [6]:
# Build the shared backend once: models, document index, retriever and LLM.
chatbot_training = ChatbotTraining(
    translation_model_name=translation_model_name,
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    model_name=model_name,
    folder_path=folder_path,
    embedding_model_name=embedding_model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Mounted at /content/drive


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# First chatbot instance: reuses the shared model, retriever and LLM and
# starts with its own empty chat history.
chatbot_1 = Chatbot(
    translation_model_name=translation_model_name,
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    model=chatbot_training.model,
    tokenizer=chatbot_training.tokenizer,
    retriever=chatbot_training.retriever,
    llm=chatbot_training.llm,
    chat_history=[],
)


In [8]:

# Second chatbot instance: same shared model, retriever and LLM, with a
# separate empty chat history.
chatbot_2 = Chatbot(
    translation_model_name=translation_model_name,
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    model=chatbot_training.model,
    tokenizer=chatbot_training.tokenizer,
    retriever=chatbot_training.retriever,
    llm=chatbot_training.llm,
    chat_history=[],
)

In [9]:
# Display widgets for the first chatbot: message box, "Use Spanish" checkbox,
# send button, and the response / chat-history output areas.
chatbot_1.display_widgets()

Text(value='', description='Your message:')

Checkbox(value=False, description='Use Spanish')

Button(description='Send', style=ButtonStyle())

Output()

Output()

  warn_deprecated(


In [10]:
# Display the same set of widgets for the second chatbot.
chatbot_2.display_widgets()

Text(value='', description='Your message:')

Checkbox(value=False, description='Use Spanish')

Button(description='Send', style=ButtonStyle())

Output()

Output()

An ITAS data portal is a web application that offers the extensive functionality of the trader's desk and blends the extensive experience of the trading industry with the most up-to-date technology and an improved user experience. It allows you to access information about business entities and companies, display active information in activity tables, and provides customizable data views.