In [None]:
# Print the Python version available on the runtime's shell.
!python3 --version

Python 3.10.12


In [None]:
# Install the notebook's dependencies.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on the PATH.
# NOTE(review): versions are unpinned (peft is even taken from the main branch);
# pin them once a known-good combination has been confirmed.
%pip install fire
%pip install gradio
%pip install transformers
%pip install git+https://github.com/huggingface/peft.git
%pip install sentencepiece
%pip install accelerate
%pip install bitsandbytes
%pip install langchain
%pip install sentence_transformers
%pip install chromadb
%pip install xformers

---
## **Carga de los datos:**

Después de procesar los datos con los primeros scripts, se obtiene un archivo con los párrafos correspondientes a las resoluciones en bruto. Estos deben someterse a un pequeño reproceso en el cual se realiza:

*   Eliminación de caracteres no necesarios (**tales como:** tildes, diéresis, virgulillas, entre otros).
*   Separación en diccionario, donde se colocan los contenidos de cada resolución en una llave del diccionario.

---

In [None]:
# Make sure wget is installed, then download the raw resolutions dump.
# `-y` answers the installation prompt automatically: a notebook cell cannot
# take keyboard input, so without it the command may stall or abort.
!apt-get install -y wget
!wget -O resultado.txt "https://raw.githubusercontent.com/bjportelac/UP-0001-MainCodeAndData/main/resultado.txt"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.20.3-1ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
--2023-06-12 18:04:35--  https://raw.githubusercontent.com/bjportelac/UP-0001-MainCodeAndData/main/resultado.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55811 (55K) [text/plain]
Saving to: ‘resultado.txt’


2023-06-12 18:04:35 (30.6 MB/s) - ‘resultado.txt’ saved [55811/55811]



In [None]:
import os
import unicodedata

def processer(fileName:str, divider: str):
  """
  Parse a resolutions dump into a dictionary of paragraphs keyed by file name.

  The file is read line by line. A line starting with "Archivo:" opens a new
  key (everything after the first colon, stripped). Each following non-empty
  line is stored under that key, except lines that start with `divider` or
  with "Contenido:". Lines found before the first "Archivo:" header are
  ignored.

  Args:
      fileName (str): Path of the file to process (read as latin-1).
      divider (str): Marker that separates entries; lines starting with it are skipped.
  Returns:
      dict: Maps each key to the list of stripped paragraphs that follow it.
  """
  dictionary = {}
  # No header seen yet: without this, a file that does not begin with
  # "Archivo:" would hit an unbound `key` in the branch below.
  key = None

  with open(fileName,'r',encoding='latin-1') as archive:
    for raw_line in archive:
      line = raw_line.strip()
      if line.startswith("Archivo:"):
        # Split only on the first colon so names containing ':' stay whole.
        key = line.split(":", 1)[1].strip()
        dictionary[key] = []

      elif key is not None and not raw_line.startswith(divider) and not raw_line.startswith('Contenido:'):
        # Skip blank lines and indented dividers (caught only after stripping).
        if line and line != divider:
          dictionary[key].append(line)

  return dictionary

def stringRegularizer(wordList:list):
  """
  Convert every string of a list to plain-ASCII title case.

  Each string is NFKD-decomposed, stripped of all non-ASCII characters (which
  drops accents and other diacritics), lower-cased, trimmed of surrounding
  whitespace and finally title-cased (every word starts with a capital).

  Args:
      wordList (list): Strings to regularize.
  Returns:
      list: A new list with the regularized strings, in the same order.
  """
  def _regularize(text):
    # Decompose accented characters so the ASCII encoding can drop the marks.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('utf-8')
    return ascii_only.lower().strip().title()

  return [_regularize(item) for item in wordList]

def dictionaryCleaner(dictionary:dict):
  """
  Regularize, in place, every list of strings stored in a dictionary.

  Args:
      dictionary (dict): Maps keys to lists of strings.
  Returns:
      dict: The same dictionary object, with each value replaced by the
      output of `stringRegularizer`.
  """
  # Only existing keys are reassigned, so iterating over items() is safe.
  for name, paragraphs in dictionary.items():
    dictionary[name] = stringRegularizer(wordList=paragraphs)

  return dictionary


# Input dump and the marker line that separates its entries.
file_name ="resultado.txt"
divider = '-----------------------------'

# Parse the dump, then regularize the paragraphs stored under every key.
dictionary = dictionaryCleaner(dictionary=processer(fileName=file_name,divider=divider))

# One output line per key: "<key>: <paragraph>, <paragraph>, ...".
result = "".join(
    key + ": " + ", ".join(values) + "\n"
    for key, values in dictionary.items()
)

# Write the result to a text file
with open("Parsed_regularized.txt", "w") as file:
    file.write(result)



---
## **Instalación de requerimientos:**
---

Ya que se debe trabajar con **LangChain** y **Chroma**, se realiza la instalación e importación de LangChain para Python.

In [None]:
# Show the GPU attached to the runtime (nvidia-smi only reports status; it does not enable anything).
!nvidia-smi

Mon Jun 12 18:04:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Print the version of the installed CUDA compiler.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


### 1. Carga del modelo:

* Debido a que no existen muchos modelos que funcionen con el idioma español, se debe trabajar con uno que tenga cierta compatibilidad; en este caso se trabajará con el modelo **Alpaca LoRA 7B**, el cual es de los que tienen mayor compatibilidad.

* **1.1: Traer el repositorio de Alpaca LoRA** [https://github.com/tloen/alpaca-lora/]

In [None]:
# Clone the alpaca-lora repository into the current directory.
! git clone https://github.com/tloen/alpaca-lora.git

Cloning into 'alpaca-lora'...
remote: Enumerating objects: 607, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 607 (delta 28), reused 33 (delta 19), pack-reused 556[K
Receiving objects: 100% (607/607), 27.78 MiB | 6.57 MiB/s, done.
Resolving deltas: 100% (360/360), done.


In [None]:
# Move into the cloned repository folder.
%cd alpaca-lora

/content/alpaca-lora


In [None]:
# List the contents of the current folder.
%ls

alpaca_data_cleaned_archive.json  generate.py
alpaca_data_gpt4.json             lengths.ipynb
alpaca_data.json                  LICENSE
DATA_LICENSE                      pyproject.toml
docker-compose.yml                README.md
Dockerfile                        requirements.txt
export_hf_checkpoint.py           [0m[01;34mtemplates[0m/
export_state_dict_checkpoint.py   [01;34mutils[0m/
finetune.py


In [None]:
# Install the model repository's requirements into the running kernel's
# environment (`%pip` rather than `!pip`), then go back to the parent folder.
%pip install -r requirements.txt
%cd ..

In [None]:
import torch
import tensorflow as tf

In [None]:
# Check whether PyTorch reports CUDA as available.
torch.cuda.is_available()

True

In [None]:
# Ask TensorFlow for the name of the GPU device it can see.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# Go to the workspace root and list it. The path is absolute because the
# relative `content` does not exist when the kernel is already in /content.
%cd /content
%ls

[Errno 2] No such file or directory: 'content'
/content
[0m[01;34malpaca-lora[0m/  Parsed_regularized.txt  resultado.txt  [01;34msample_data[0m/


In [None]:
# Clone the repository that holds the LoRA adapter weights.
# NOTE(review): adapter_model.bin is presumably stored with Git LFS — confirm
# that the clone fetched the real weights and not just a pointer file.
! git clone https://huggingface.co/plncmm/guanaco-lora-7b
# Move into the cloned repository folder.
%cd guanaco-lora-7b
# List its contents.
%ls

%cd ..

Cloning into 'guanaco-lora-7b'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), 2.76 KiB | 1.38 MiB/s, done.
/content/guanaco-lora-7b
adapter_config.json  adapter_model.bin  README.md
/content


In [None]:
# Move into the alpaca-lora checkout (path is relative to the current directory).
%cd alpaca-lora

/content/alpaca-lora


In [None]:
# Fine-tune the base model on the alpaca-cleaned dataset and write the result to ./lora-alpaca.
# NOTE(review): no later cell in this notebook reads ./lora-alpaca (the cells
# below load /content/guanaco-lora-7b instead) — confirm this run is needed.
! python finetune.py --base_model 'decapoda-research/llama-7b-hf' --data_path 'yahma/alpaca-cleaned' --output_dir './lora-alpaca' --batch_size 128 --num_epochs 3 --learning_rate 1e-4

In [None]:
# Return to the workspace root. The path is absolute because a relative
# `content` does not exist inside the alpaca-lora folder.
%cd /content

In [None]:
# Run the model.
# The absolute path makes this cell work no matter which folder the kernel
# is currently in (the relative form only works from /content).
%cd /content/alpaca-lora

! python generate.py --load_8bit --base_model 'decapoda-research/llama-7b-hf' --lora_weights '/content/guanaco-lora-7b'

In [None]:
# Hub identifier of the base LLaMA-7B model.
base_model_path = 'decapoda-research/llama-7b-hf'
# Local folder with the LoRA adapter weights fine-tuned for Spanish
# (the guanaco-lora-7b repository cloned earlier in this notebook).
weights_path = "/content/guanaco-lora-7b"

### 2. Carga y procesamiento de documentos:

* Se cargan los documentos de las resoluciones haciendo uso de la importación **TextLoader de LangChain**.

* Con ella podremos cargar el documento obtenido en la etapa de carga, el cual se encuentra regularizado a partir del diccionario inicial.

* Se separa el texto usando un separador recursivo, con el cual partimos el texto original en pequeños trozos con el fin de encontrar los que sean más relevantes para el LLM.

In [None]:
# Go up to the parent directory.
%cd ..

In [None]:
# List the current directory.
%ls

In [None]:
import langchain
from langchain import text_splitter
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the regularized resolutions file written by the parsing step.
loader = TextLoader('/content/Parsed_regularized.txt')
ResolutionDoc = loader.load()

# Split recursively into chunks of at most 256 characters.
# CharacterTextSplitter only cuts on its default "\n\n" separator, which the
# parsed file never contains (its entries are joined with a single "\n"),
# so it cannot produce chunks of the requested size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size =256, chunk_overlap=0)
text = text_splitter.split_documents(documents=ResolutionDoc)

# Preview the result instead of dumping every chunk into the output.
print(f"{len(text)} chunks")
text[:3]

---
### 3. Inicializar una base de vectores persistente de ChromaDB

* Ya que necesitamos una base de vectores enlazados para cada trozo de texto en una base de datos Chroma, utilizaremos un directorio para que la base de datos sea persistente.

* Debemos utilizar embeddings, ya que necesitamos representar los datos de una forma en la cual la I.A. pueda procesarlos, por lo cual importamos embeddings compatibles con Chroma.

In [None]:
# Install llama-cpp-python into the running kernel's environment (`%pip` rather than `!pip`).
# NOTE(review): no active code in this notebook appears to import llama_cpp —
# confirm this install is still needed.
%pip install llama-cpp-python

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorize the document chunks.
# NOTE(review): all-MiniLM-L6-v2 is presumably tuned mainly for English; the
# documents here are Spanish — consider verifying retrieval quality.
embeding_model_path = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embeding_model_path)

In [None]:
from langchain.vectorstores import Chroma

# Directory where Chroma keeps the collection on disk.
persistency_dir = 'chromaDb'

# Index the split chunks (`text`), not the unsplit `ResolutionDoc`: embedding
# the whole file as a single document would make every retrieval return the
# entire file.
chromaVectorDB = Chroma.from_documents(documents=text, embedding=embeddings,persist_directory=persistency_dir)


---

In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

# Load the tokenizer that belongs to the base model.
tokenizer = LlamaTokenizer.from_pretrained(base_model_path)

In [None]:
# Load the base model in 8-bit precision and place it on the first CUDA device.
base_model = LlamaForCausalLM.from_pretrained(
        base_model_path,
        load_in_8bit=True,
        device_map="cuda:0",
    )

In [None]:
# Wrap the base model with the LoRA adapter weights stored at `weights_path`.
from peft import PeftModel
model = PeftModel.from_pretrained(
    base_model,
    weights_path,
)

In [None]:
# Build a Hugging Face text-generation pipeline and expose it to LangChain.
# NOTE(review): the pipeline receives `base_model`, not the PEFT-wrapped
# `model` created just before — confirm the LoRA adapter is active here.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=10000,  # NOTE(review): presumably far above the model's context window — confirm
    temperature=0.1,  # NOTE(review): sampling settings likely need do_sample=True to take effect — confirm
    top_p=0.75,
    top_k=40,
    repetition_penalty=1.2
)

# LangChain LLM wrapper around the pipeline; used by every chain below.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain import PromptTemplate, LLMChain

# Alpaca-style instruction prompt. The answer section must be introduced by
# the "### Response:" marker used by the Alpaca training template; a different
# marker (such as "Answer:") does not match what the LoRA weights were trained on.
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

prompt = PromptTemplate(template=template, input_variables=["instruction"])

In [None]:
# Chain the prompt template with the local LLM.
llm_chain = LLMChain(prompt=prompt,
                     llm=local_llm
                     )

# Quick smoke test with a single Spanish question (no retrieval involved).
question = "Que es un guanaco?"

print(llm_chain.run(question))

In [None]:
# Persist the vector store, then drop the reference to it.
# NOTE: this cell is not re-runnable as-is — after the first run
# `chromaVectorDB` is None, so calling .persist() on it again fails.
chromaVectorDB.persist()
chromaVectorDB = None

# Re-open the store from the persistence directory with the same embeddings.
vectordb = Chroma(persist_directory=persistency_dir, embedding_function=embeddings)

In [None]:
from langchain.chains import VectorDBQA
from langchain.chains import RetrievalQA

# Second vector store, built from the split chunks without a persist directory;
# the retriever returns only the single best-matching chunk (k=1).
docsearch = Chroma.from_documents(text, embeddings)
Rqa = RetrievalQA.from_chain_type(llm=local_llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={"k": 1}))

# QA chain over the store re-opened from the persistence directory.
# NOTE(review): VectorDBQA is presumably deprecated in favor of RetrievalQA in
# recent LangChain releases — confirm against the installed version.
Vqa = VectorDBQA.from_chain_type(llm=local_llm, chain_type="stuff", vectorstore=vectordb)

In [None]:
import gc
import torch

# Collect unreachable Python objects first so the GPU memory they hold is
# handed back to PyTorch's caching allocator; only then release the cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:


# Question asked to the QA chains; `query` is reused by a later cell.
query = "Que dice la normativa de admision a la Universidad Nacional de Colombia?"
# Answer it through the chain backed by the persisted vector store.
Vqa.run(query)



In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same single-chunk retriever (k=1) as the RetrievalQA chain, but built with
# the with-sources chain class.
chain = RetrievalQAWithSourcesChain.from_chain_type(local_llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={"k": 1}))

# return_only_outputs=True is passed so that only the chain's outputs are requested.
chain({"question": "Que dice la normativa de admision a la Universidad Nacional de Colombia"}, return_only_outputs=True)

In [None]:
# Ask the same `query` through the RetrievalQA chain built on the chunked store.
Rqa.run(query)