In [14]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright sentence_transformers faiss-cpu accelerate
!pip install -q accelerate==0.21.0 peft==0.4.0 trl==0.4.7
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install flask flask-cors pyngrok
!pip install -U langchain-community

Looking in indexes: https://pypi.org/simple/


In [15]:
import os
import torch
import tokenizers

import transformers
from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  BitsAndBytesConfig,
  pipeline
)
from transformers import BitsAndBytesConfig

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

import nest_asyncio

# Patch asyncio so that an event loop can be entered again from inside the
# notebook's already-running loop. Presumably needed by the async loader
# imported above (AsyncChromiumLoader) — confirm if that loader is still used.
nest_asyncio.apply()

In [16]:
# Fix for Colab
# Replace locale.getpreferredencoding so it always reports "UTF-8", working
# around character-encoding issues in Google Colab.
# The replacement accepts (and ignores) any arguments because the standard
# library signature is getpreferredencoding(do_setlocale=True): callers that
# pass an argument, e.g. getpreferredencoding(False), must keep working.
import locale
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

In [17]:
#################################################################
# Tokenizer
#################################################################

# Hugging Face Hub identifier of the pre-trained model used throughout the notebook.
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Load the model configuration for `model_name`.
# AutoConfig picks the configuration class that matches the checkpoint.
model_config = transformers.AutoConfig.from_pretrained(
    model_name,
)

# Load the tokenizer that belongs to the model.
# `trust_remote_code` is intentionally NOT enabled: it allows code hosted in the
# Hub repository to run locally, and this tokenizer is supported by transformers
# itself, so the opt-in is not required.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so that padding is
# possible at all (pad_token is assigned explicitly here).
tokenizer.pad_token = tokenizer.eos_token

# Pad on the right: padding tokens are appended after the real tokens.
tokenizer.padding_side = "right"



In [18]:
#################################################################
# bitsandbytes parameters
#################################################################

# use_4bit: load the base model weights in 4-bit precision (forwarded as
# `load_in_4bit` when the quantization config is built). 4-bit weights shrink
# the memory footprint of the model considerably.
use_4bit = True

# bnb_4bit_compute_dtype: name of the torch dtype used for computation on top
# of the 4-bit weights. It is kept as a string here and resolved to the actual
# dtype with getattr(torch, ...) when the quantization config is built.
bnb_4bit_compute_dtype = "float16"

# bnb_4bit_quant_type: 4-bit quantization data type, either "fp4" (4-bit float)
# or "nf4". NF4 is the 4-bit NormalFloat type introduced with QLoRA; both
# options store weights in 4 bits, they differ in how values are distributed.
bnb_4bit_quant_type = "nf4"

# use_nested_quant: toggles nested ("double") quantization, forwarded as
# `bnb_4bit_use_double_quant`. Per the bitsandbytes/Transformers docs, the
# quantization constants produced by the first quantization are quantized
# again to save some additional memory. Disabled here.
use_nested_quant = False

In [19]:
#################################################################
# Set up quantization config
#################################################################

# compute_dtype: turn the dtype name from `bnb_4bit_compute_dtype`
# (e.g. "float16") into the matching torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bnb_config: quantization settings handed to the model loader.
# - load_in_4bit: load the weights in 4-bit precision.
# - bnb_4bit_quant_type: quantization data type (e.g. 'nf4').
# - bnb_4bit_compute_dtype: dtype used for computation on the quantized model.
# - bnb_4bit_use_double_quant: nested quantization on/off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Informational bfloat16 hint.
# Only query the device when CUDA is actually available: without that guard
# torch.cuda.get_device_capability() raises on a machine with no GPU, and this
# purely informational message would abort the cell.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()  # (major, minor) compute capability
    if major >= 8:  # the original check treats compute capability >= 8.0 as bfloat16-capable
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [20]:
#################################################################
# Load pre-trained config
#################################################################

# Instantiate the causal language model named by `model_name`, applying the
# quantization settings prepared in `bnb_config` while the weights are loaded.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function does not print anything; it returns the
    summary so the caller can decide what to do with it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage (two decimals). A model
        without parameters reports ``0.00%`` instead of raising
        ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()  # Number of elements in this parameter.
        all_model_params += count
        if param.requires_grad:  # Only parameters that receive gradients are trainable.
            trainable_model_params += count

    # Guard the division: a model with no parameters has no meaningful ratio.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Print the model's parameter stats.
print(print_number_of_trainable_model_parameters(model))

# Build the text-generation pipeline around the loaded model and tokenizer.
# `do_sample=True` is required for `temperature` to have any effect: without
# sampling, generation is greedy and the temperature setting is ignored.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,  # Enable sampling so the temperature below is honoured.
    temperature=0.2,  # Low temperature keeps sampled text close to the most likely output.
    repetition_penalty=1.1,  # Penalty reduces repetition in generated text.
    return_full_text=True,  # The returned text includes the prompt.
    max_new_tokens=1000,  # Upper bound on the number of newly generated tokens.
)

# Wrap the pipeline in LangChain's LLM interface so it can be used in chains.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


In [21]:
# Prompt text sent to the model. It frames the task as playing the werewolf
# game (the instruction itself is written in French) and leaves two slots,
# {context} and {question}, to be filled in per request.
prompt_template = """
### [INST] Instruction: Ton rôle est de jouer au jeu du loup-garou. Voici un peu de contexte pour t'aider :

{context}

### QUESTION:
{question}
 [/INST]
"""

# Template object that substitutes 'context' and 'question' into the text above.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Chain that formats the prompt and passes it to the wrapped Mistral pipeline.
llm_chain = LLMChain(prompt=prompt, llm=mistral_llm)

# To try the chain by hand, supply your own values for both slots, e.g.:
#   result = llm_chain.run({"context": context, "question": question})
#   print(result)

In [22]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from google.colab import userdata

# Create the Flask application object on which the /generate route is registered.
app = Flask(__name__)
# Enable cross-origin requests for the whole app.
# NOTE(review): CORS is applied with no options, so the flask-cors defaults are
# in effect (presumably every origin is allowed) — confirm this is intended,
# given that the app is exposed through a public ngrok URL.
CORS(app)

# POST /generate: produce a model response for a given context and question.
@app.route('/generate', methods=['POST'])
def generate_text():
    """Generate a reply from the LLM chain for the posted context/question.

    Expects a JSON object body with optional string fields ``context`` and
    ``question`` (each defaults to an empty string when absent).

    Returns:
        A JSON response ``{"result": <last line of the generated text>}``, or
        ``{"error": ...}`` with HTTP status 400 when the body is missing, is
        not valid JSON, or is not a JSON object.
    """
    # silent=True yields None instead of raising on a missing or invalid body,
    # so every malformed request is answered with the same JSON error below
    # rather than an unhandled AttributeError (HTTP 500) on `data.get`.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    context = data.get('context', '')
    question = data.get('question', '')

    # Run the chain with the provided context and question to generate text.
    result = llm_chain.run({"context": context, "question": question})

    # Only the last line of the generated text is returned; an empty result
    # maps to an empty string.
    lines = result.splitlines()
    response = {"result": lines[-1] if lines else ""}
    return jsonify(response)

# Main entry point of the application
if __name__ == '__main__':
    port = 5000  # Local port the Flask application listens on.

    # Authenticate with the token stored in Colab's secret storage and open an
    # ngrok tunnel to the local port, making the app reachable from the internet.
    ngrok.set_auth_token(userdata.get('NGROK_TOKEN'))
    ngrok_tunnel = ngrok.connect(port)
    print('Public URL:', ngrok_tunnel.public_url)

    try:
        # Blocks until the server is stopped (e.g. by interrupting the cell).
        app.run(port=port)
    finally:
        # Close the tunnel once the server stops so that re-running the cell
        # does not leave stale public tunnels open.
        ngrok.disconnect(ngrok_tunnel.public_url)

Public URL: https://09aa-34-145-109-152.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
