# Clone this repo

In [None]:
%%bash
# Turn the current directory into a checkout of the grammar repository.
# Stop at the first failing git command so a broken setup is not silently ignored.
set -e

git init
# Re-running the cell must not fail just because the remote is already registered.
git remote get-url origin >/dev/null 2>&1 || git remote add origin https://github.com/jagalindo/UVL-GGNF-GRAMMAR.git
# A bare `git pull` has no upstream branch to merge from in a freshly initialised
# repository, so fetch explicitly and check the branch out afterwards.
git fetch origin
# -f discards local modifications to tracked files, as the original cell did.
git checkout -f main
git branch --set-upstream-to origin/main

# Clone llama.cpp and all its files

In [1]:
# Fetch llama.cpp and build it with cuBLAS (CUDA) support.
# The clone is skipped when the checkout already exists so the cell can be re-run.
# NOTE(review): the execution cell calls `./llama.cpp/main`; this clones the tip of the
# default branch, so pin a known-good revision if upstream changes break the build or the binary name.
!test -d llama.cpp || git clone https://github.com/ggerganov/llama.cpp.git
# `&&` keeps `make` from running in the notebook directory when the `cd` fails.
!cd llama.cpp && make LLAMA_CUBLAS=1

Cloning into 'llama.cpp'...
remote: Enumerating objects: 17285, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 17285 (delta 67), reused 99 (delta 46), pack-reused 17153[K
Receiving objects: 100% (17285/17285), 20.79 MiB | 25.62 MiB/s, done.
Resolving deltas: 100% (11962/11962), done.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion 
I CXXFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_

# Download the LLM models to be used later in this notebook.

In [2]:
# `-c` resumes an interrupted download and does nothing when the file is already
# complete, so re-running the cell does not leave `*.gguf.1` duplicates behind.
!wget -c https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q8_0.gguf
# Optional larger models: uncomment to download them, then add the file names to the
# `model` list in the execution cell.
#!wget -c https://huggingface.co/TheBloke/Llama-2-13B-GGUF/resolve/main/llama-2-13b.Q8_0.gguf
#!wget -c https://huggingface.co/TheBloke/Llama-2-70B-GGUF/resolve/main/llama-2-70b.Q8_0.gguf

--2024-01-29 22:51:03--  https://huggingface.co/TheBloke/Llama-2-13B-GGUF/resolve/main/llama-2-13b.Q8_0.gguf
Resolving huggingface.co (huggingface.co)... 65.9.86.71, 65.9.86.57, 65.9.86.79, ...
Connecting to huggingface.co (huggingface.co)|65.9.86.71|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/b1/be/b1bebadea37cd4dd7655da740ed74e9a825af7c9ffddf51690f59fb89df02706/f60b1d38df8299c926daeb6a81a56004e188b889a9d21dc8b5ef1f58b923c889?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-13b.Q8_0.gguf%3B+filename%3D%22llama-2-13b.Q8_0.gguf%22%3B&Expires=1706827863&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjgyNzg2M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9iMS9iZS9iMWJlYmFkZWEzN2NkNGRkNzY1NWRhNzQwZWQ3NGU5YTgyNWFmN2M5ZmZkZGY1MTY5MGY1OWZiODlkZjAyNzA2L2Y2MGIxZDM4ZGY4Mjk5YzkyNmRhZWI2YTgxYTU2MDA0ZTE4OGI4ODlhOWQyMWRjOGI1ZWYxZjU4YjkyM2

# Count the number of tokens 
We count the number of tokens of each example to be used for instruction prompting and generate a JSON file listing them.

In [None]:
# TO BE TRANSLATED TO PYTHON

## Second step. Generate all the possible prompts

### Description
This Python script processes a collection of example files based on the metadata defined in a JSON file. It generates combinations of these examples to be used within an instruction prompting context, concatenates their contents, and saves the results under specific conditions.

### Key Features

- **JSON File Parsing**: Reads file metadata from a JSON file, including file names and associated number of tokens.
- **Combination Generation**: Creates all possible combinations of example files for instruction prompting, ensuring the sum of their token counts does not exceed a specified maximum-token threshold.
- **No Subsets**: Excludes any combination that is a subset of a larger combination, ensuring only the largest, unique sets are processed.
- **Concatenation with Header**: Concatenates the contents of files in each combination, always starting with the contents of "0.header.txt", which is the default header for instruction prompting.
- **Output File Generation**: For each valid combination, creates a text file containing the concatenated contents. The filename is the total number of tokens of the header plus the examples in the combination (`<total>.promnt`).
- **JSON Output**: Generates a JSON file detailing the combinations used and their corresponding sums.

### Workflow

1. **Load Data**: Reads file names and values from the provided JSON file.
2. **Generate Combinations**: Forms all combinations of files where their total value is less than or equal to a user-defined threshold.
3. **Filter Subsets**: Removes any combination that is entirely included in a larger combination.
4. **Concatenate Contents**: Merges the contents of files in each valid combination, starting with "0.header.txt".
5. **Save Outputs**: Writes the concatenated contents to new files and produces a summary JSON file.

### Usage

- Set the `base_path` to the directory containing the files.
- Place the JSON file with file data in the same directory.
- Adjust the `threshold_values` as needed to generate as many prompts as possible.

Note that each generated prompt intentionally ends without the target domain word; that word is appended in the execution step below and is what triggers the actual generation.

In [1]:
import json
import itertools
import os

def concatenate_files(base_path, file_names, output_path):
    """Write the header followed by the given files into ``output_path``.

    The contents of ``0.header.txt`` always come first; the remaining files
    follow in the order given by ``file_names`` (an entry naming the header
    itself is ignored). Every piece is followed by a blank line.

    Args:
        base_path: Directory that contains the header and the listed files.
        file_names: Names of the files to append after the header.
        output_path: Path of the file to (over)write with the result.
    """
    header_name = "0.header.txt"

    def read_piece(name):
        # Each piece is terminated by an empty line to separate it from the next one.
        with open(os.path.join(base_path, name), 'r') as handle:
            return handle.read() + '\n\n'

    # Read everything first so a missing input never leaves a partial output file.
    pieces = [read_piece(header_name)]
    pieces.extend(read_piece(name) for name in file_names if name != header_name)

    with open(output_path, 'w') as out:
        out.write(''.join(pieces))

def filter_combinations(all_combinations):
    """Keep only the combinations that are not contained in a larger one.

    Args:
        all_combinations: Iterable of sets/frozensets.

    Returns:
        A set of frozensets holding the maximal combinations.
    """
    maximal = set()
    # Visiting the largest candidates first guarantees that any superset of a
    # candidate has already been kept by the time the candidate is examined.
    by_size_desc = sorted(all_combinations, key=len, reverse=True)
    for candidate in by_size_desc:
        covered = False
        for kept in maximal:
            if kept != candidate and candidate.issubset(kept):
                covered = True
                break
        if not covered:
            maximal.add(frozenset(candidate))
    return maximal

def find_combinations_and_concatenate(base_path, file_path, threshold):
    """Generate every maximal prompt that fits within ``threshold`` tokens.

    Reads the ``{file name: token count}`` mapping stored in ``file_path``,
    enumerates all combinations of example files whose token counts (plus the
    header's) do not exceed ``threshold``, drops combinations contained in a
    larger one, and writes one ``<total tokens>.promnt`` file per remaining
    combination into ``<base_path>/<threshold>/``. A ``combinations.json``
    file mapping each total to the files used is written next to them.

    Args:
        base_path: Directory containing the header and example files.
        file_path: Path to the JSON file with the token counts.
        threshold: Maximum number of tokens allowed for a prompt.

    Raises:
        KeyError: If the JSON data has no ``"0.header.txt"`` entry.
    """
    header_name = "0.header.txt"

    with open(file_path, 'r') as file:
        file_data = json.load(file)

    # The header is part of every prompt, so take it out of the pool of
    # candidates and account for its tokens separately.
    header_tokens = file_data.pop(header_name)
    items = list(file_data.items())

    output_dir = os.path.join(base_path, str(threshold))
    os.makedirs(output_dir, exist_ok=True)

    # Enumerate every non-empty selection of examples that fits the budget.
    all_combinations = set()
    for r in range(1, len(items) + 1):
        for combo in itertools.combinations(items, r):
            total_value = sum(tokens for _, tokens in combo) + header_tokens
            if total_value <= threshold:
                all_combinations.add(frozenset([header_name] + [name for name, _ in combo]))

    filtered_combinations = filter_combinations(all_combinations)

    combinations_dict = {}
    # Sets iterate in an arbitrary order (string hashing is randomised per
    # process), which used to change both the order of the examples inside a
    # prompt and which combination survived when two shared the same total.
    # Sorting makes the generated prompts reproducible between runs.
    # NOTE: combinations with the same total still share one file name, so the
    # later one (in sorted order) overwrites the earlier one.
    for file_names in sorted(sorted(combo) for combo in filtered_combinations):
        total_value = sum(file_data.get(name, 0) for name in file_names) + header_tokens
        output_file_path = os.path.join(output_dir, f"{total_value}.promnt")
        concatenate_files(base_path, file_names, output_file_path)
        combinations_dict[total_value] = file_names

    # Save the combinations dictionary as JSON
    json_output_path = os.path.join(output_dir, "combinations.json")
    with open(json_output_path, 'w') as json_file:
        json.dump(combinations_dict, json_file, indent=4)

# Directory holding the header and the example files (adjust to your layout)
base_path = './promnts/promnts_no_ctc/'

# Token-count metadata; expected to live next to the example files
file_path = os.path.join(base_path, 'tokens.json')

# Context-window sizes (in tokens) to generate prompts for
threshold_values = [256, 512, 1024, 2048]

# Produce one folder of prompts per context-window size
for threshold in threshold_values:
    find_combinations_and_concatenate(base_path, file_path, threshold)
    print(f"Models up to context window of: {threshold} - Done")

Models up to context window of: 256 - Done
Models up to context window of: 512 - Done
Models up to context window of: 1024 - Done
Models up to context window of: 2048 - Done


## Llama.cpp execution

This script automates the execution of llama.cpp through its command line (`./main`) with various combinations of parameters, specifically different models, prompt files, words, and values of `n` and `c`. It organizes the outputs into a structured directory system and handles the modification of prompt files by appending specific words. **We avoid the usage of more complex bindings such as llama-cpp-python, as some users reported that it introduces memory leaks**.

### Key Functionalities

1. **Parameter Combinations**:
   - Iterates over combinations of different parameters:
     - `model`: Different model identifiers.
     - `prompt_files`: Files with prompts, located in a specified directory.
     - `words`: A list of words to append to each prompt. Those words are the domains in which we want the output UVL files generated
     - `n`: Various values representing the number of tokens to predict.
     - `i`: A range of integers from 1 to 10, representing the different uvl files to generate per word(domain)

2. **Directory Structure**:
   - Creates a unique directory for each combination of `model`, `c` (context size), `prompt`, and `word`.
   - The structure is as follows: `model_outputs/model/c/prompt/word`.

3. **Prompt File Modification**:
   - Appends a specific word from the `words` list to the content of each prompt file.
   - Saves the modified prompt in the corresponding directory.

4. **Command Execution**:
   - Executes the command `./main` with parameters based on the current combination of `model`, `n`, `c`, and the modified prompt file.
   - The command is executed using Python's `subprocess` module.

5. **Output and Error File Management**:
   - For each execution, output and error files are generated.
   - Files are named using a combination of `i`, `n` (e.g., `1.n10.out.txt` for output files, `1.n10.err.txt` for error files).
   - These files are saved in the corresponding directory based on the `model`, `prompt`, and `word` value.

### Usage

- The script is intended for use in scenarios where you need to test or run a command (`./main`) with various configurations and keep the outputs organized and easily traceable.
- It is particularly useful for experiments or tests where the effect of different parameters (like `model`, `prompt`, `n`) on the command's behavior needs to be assessed.

### Requirements

- Python environment with access to the file system.
- The `./main` command should be executable with the given parameters.
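- The `promnts/promnts_no_ctc` directory should exist and contain `.promnt` files inside subfolders named after the context size (e.g. `256/`); files placed directly in that directory are skipped.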

In [19]:
import subprocess
import os
import glob
import itertools

# Experiment grid
n = [128, 256, 512]  # numbers of tokens to predict
model = ["llama-2-7b.Q8_0.gguf"]  # model files to run; add every extra model you download
# Target domains. The first row comes from the MIDAS project; the remaining ones are taken from
# https://www.oberlo.com/statistics/most-popular-electronics
words = [
    "Motorhomes", "Lamps",
    "Smartphone", "Laptop", "Tablet", "Smartwatch", "Games console", "Smart home devices",
    "TV streaming devices", "Feature Phone", "Virtual reality device",
]

# Folder that holds the generated prompt files (searched recursively)
prompts_dir = 'promnts/promnts_no_ctc'
prompt_files = glob.glob(os.path.join(prompts_dir, '**/*.promnt'), recursive=True)

# Root folder for everything the runs produce
base_output_dir = "./model_outputs"

# Function to create directories and return the path
def create_directories(model_name, subfolder_name, prompt_name, word):
    """Create the output folder for one run configuration and return its path.

    The folder is ``<base_output_dir>/<model_name>/<subfolder_name>/<prompt_name>/<word>``;
    missing parent folders are created and an existing folder is reused.
    """
    word_dir = os.path.join(base_output_dir, model_name, subfolder_name, prompt_name, word)
    os.makedirs(word_dir, exist_ok=True)
    return word_dir

# Function to append a word to a prompt file and return the modified content
def get_modified_prompt_content(file_path, word):
    """Return the text of ``file_path`` with ``word`` appended on a new line.

    The file itself is not modified.
    """
    with open(file_path, 'r') as prompt_file:
        original_text = prompt_file.read()
    return f"{original_text}\n{word}"

# Every (model, prompt file, domain word, repetition 1..10, tokens to predict) combination
combinations = itertools.product(model, prompt_files, words, range(1, 11), n)

for model_value, prompt_file, word, i, n_value in combinations:
    subfolder_path = os.path.dirname(prompt_file)

    # Prompt files must live in a subfolder named after the context size;
    # files placed directly in the prompts directory are ignored.
    if subfolder_path == prompts_dir:
        continue

    subfolder_name = os.path.basename(subfolder_path)
    prompt_name = os.path.basename(prompt_file).split('.')[0]

    # The subfolder name is the context size the prompt was generated for
    c = int(subfolder_name)

    dir_path = create_directories(model_value, subfolder_name, prompt_name, word)
    modified_prompt = get_modified_prompt_content(prompt_file, word)

    # One output / error / prompt file per repetition and number of predicted tokens
    output_filename = f"{dir_path}/{i}.n{n_value}.out.txt"
    error_filename = f"{dir_path}/{i}.n{n_value}.err.txt"
    modified_prompt_filename = f"{dir_path}/{i}.n{n_value}.c{c}.promnt"

    # Save the modified prompt file
    with open(modified_prompt_filename, 'w') as file:
        file.write(modified_prompt)

    # Pass the arguments as a list and run without a shell: several domain
    # words contain spaces ("Games console", "Smart home devices", ...), so the
    # prompt path contains spaces too and an unquoted shell command line would
    # split it into several arguments.
    # NOTE(review): `--n-gpu-layers` and `-ngl` look like the same option given
    # twice; both are kept exactly as in the original command - confirm which
    # value is intended.
    command = [
        "./llama.cpp/main",
        "--n-gpu-layers", "-1",
        "-ngl", "32",
        "-m", f"./{model_value}",
        "-c", str(c),
        "--repeat_penalty", "1.1",
        "-n", str(n_value),
        "-e",
        "--grammar-file", "./grammars/UVL_no_ctc.gbnf",
        "-f", modified_prompt_filename,
    ]

    with open(output_filename, 'w') as output_file, open(error_filename, 'w') as error_file:
        subprocess.run(command, stdout=output_file, stderr=error_file)