In [None]:
%pip install llama-index-llms-gradient
%pip install llama-index-finetuning
!pip install llama-index gradientai -q
!pip install datasets -q
!pip install --upgrade pyarrow datasets
!pip install --force-reinstall llama-index[gradient]

In [None]:
import os
from llama_index.llms.gradient import GradientBaseModelLLM, GradientModelAdapterLLM
from llama_index.finetuning import GradientFinetuneEngine

In [None]:
from getpass import getpass

# Credentials must never be hard-coded in a notebook: anything committed here
# is leaked to everyone with read access. Values already present in the
# environment are used as-is; otherwise the user is prompted without echo.
# NOTE: the token that was previously embedded in this cell should be treated
# as compromised and rotated.
for _credential_name in ("GRADIENT_ACCESS_TOKEN", "GRADIENT_WORKSPACE_ID"):
    if not os.environ.get(_credential_name):
        os.environ[_credential_name] = getpass(f"{_credential_name}: ")

In [None]:
from datasets import load_dataset
from pathlib import Path
import json


def load_jsonl(data_dir):
    """Load a JSON-lines file through `datasets.load_dataset`.

    Args:
        data_dir: Path to the JSONL file; it is normalised to a POSIX-style
            string before being handed to the "json" loader.

    Returns:
        Whatever `load_dataset("json", data_files=...)` returns for that file.
    """
    posix_path = Path(data_dir).as_posix()
    return load_dataset("json", data_files=posix_path)


def save_jsonl(data_dicts, out_path):
    """Write an iterable of JSON-serialisable records as JSON lines.

    Args:
        data_dicts: Iterable of objects accepted by `json.dumps`.
        out_path: Destination file; it is created or truncated.
    """
    with open(out_path, "w") as sink:
        # One serialised record per line, each terminated by a newline.
        sink.writelines(json.dumps(record) + "\n" for record in data_dicts)


def load_data_sql(
    data_dir: str = "genereic_python_code",
    dataset_name: str = "Vezora/Tested-143k-Python-Alpaca",
):
    """Download an instruction dataset and dump its train split as JSON lines.

    Args:
        data_dir: Path of the output file. Despite the name, a single JSONL
            file is written at this path, not a directory. Missing parent
            directories are created.
        dataset_name: Identifier passed to `datasets.load_dataset`.

    Each output line is a JSON object with exactly the keys "instruction",
    "input" and "output" copied from the corresponding dataset row.

    Raises:
        KeyError: If the dataset has no "train" split or a row lacks one of
            the three expected fields.
    """
    # NOTE(review): the default spelling "genereic_python_code" differs from
    # the "generic_python_code" default of get_train_val_splits. It is kept
    # for backward compatibility, so pass data_dir explicitly.
    train_split = load_dataset(dataset_name)["train"]

    out_path = Path(data_dir)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Only the train split is exported, so the file is written exactly once.
    # (Looping over several splits while reopening the same path in "w" mode
    # would silently keep only the last split.)
    records = (
        {
            "instruction": item["instruction"],
            "input": item["input"],
            "output": item["output"],
        }
        for item in train_split
    )
    save_jsonl(records, out_path)

In [None]:
# Write the dataset to disk; this path is the one get_train_val_splits is
# later asked to read, so the two values must stay in sync.
load_data_sql(data_dir="generic_python_code")

In [None]:
from math import ceil


def get_train_val_splits(
    data_dir: str = "generic_python_code",
    val_ratio: float = 0.1,
    seed: int = 42,
    shuffle: bool = True,
):
    """Load the JSONL dataset and split it into train and validation sets.

    Args:
        data_dir: Path of the JSONL file to load (see `load_jsonl`).
        val_ratio: Fraction of rows assigned to validation; the absolute
            size is rounded up with `ceil`.
        seed: Seed forwarded to `train_test_split`, making the split
            reproducible.
        shuffle: Whether rows are shuffled before splitting.

    Returns:
        A `(train, validation)` tuple of datasets.
    """
    data = load_jsonl(data_dir)
    num_samples = len(data["train"])
    val_set_size = ceil(val_ratio * num_samples)

    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=shuffle, seed=seed
    )
    # The split above already applies the seeded shuffle when requested.
    # Re-shuffling the results without a seed (as was done before) made the
    # row order differ on every run and ignored shuffle=False.
    return train_val["train"], train_val["test"]

In [None]:
# Split once, then persist each raw split as its own JSONL file.
raw_train_data, raw_val_data = get_train_val_splits(data_dir="generic_python_code")
for _split, _target in (
    (raw_train_data, "train_data_raw.jsonl"),
    (raw_val_data, "val_data_raw.jsonl"),
):
    save_jsonl(_split, _target)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Inspect a single raw training record to sanity-check its fields.
raw_train_data[100]

{'instruction': "How can I retrieve the third element from multiple Python arrays, which are embedded within dictionaries, and combine these elements to create a new list?\n\nDictionaries format:\n{'Array1': [1,2,3,4,5], 'Array2':[6,7,8,9,10], 'Array3':[11,12,13,14,15]}",
 'input': '',
 'output': 'In Python, the arrays are commonly referred to as lists, and the "tertiary constituent" simply means the third element in each list. To solve this problem, you can use the following code:\n\n```python\n# Define the input\narrays = {\'Array1\': [1,2,3,4,5], \'Array2\':[6,7,8,9,10], \'Array3\':[11,12,13,14,15]}\n# Extract the third (tertiary) item from each array and store it in a new list\ntertiary_items = [array[2] for array in arrays.values()]\nprint(tertiary_items)\n```\n\nThis code will give you the desired output: `[3, 8, 13]`, which represents the third elements from each list in the example dictionary.\n\nHere\'s a breakdown of the code\'s functionality:\n1. The dictionaries with the ar

In [None]:
text_to_code_tmpl_str = """\
<s>### Instruction:\n{system_message}{user_message}\n\n### Response:\n{response}</s>"""

text_to_code_inference_tmpl_str = """\
<s>### Instruction:\n{system_message}{user_message}\n\n### Response:\n"""

def _generate_prompt_code(input, instruction, output=""):
    system_message = f"""You are a powerful text-to-python code model. Your job is to answer questions according to the instructions given. Use the input if mentioned.

You must output the python code that answers the question.

    """
    user_message = f"""

### Input:
{input}

### Instruction:
{instruction}

### Response:
"""
    if output:
        return text_to_code_tmpl_str.format(
            system_message=system_message,
            user_message=user_message,
            response=output,
        )
    else:
        return text_to_code_inference_tmpl_str.format(
            system_message=system_message, user_message=user_message
        )

def generate_prompt(data_point):
    """Map a raw record to its full training prompt.

    Args:
        data_point: Mapping with "input", "instruction" and "output" keys.

    Returns:
        A dict with the single key "inputs" holding the rendered prompt.
    """
    return {
        "inputs": _generate_prompt_code(
            data_point["input"],
            data_point["instruction"],
            output=data_point["output"],
        )
    }

In [None]:
# Render every raw record into its prompt and keep only the "inputs" field,
# then persist each split as JSON lines.
train_data = [
    {"inputs": row["inputs"]} for row in raw_train_data.map(generate_prompt)
]
save_jsonl(train_data, "train_data.jsonl")

val_data = [
    {"inputs": row["inputs"]} for row in raw_val_data.map(generate_prompt)
]
save_jsonl(val_data, "val_data.jsonl")

Map:   0%|          | 0/128994 [00:00<?, ? examples/s]

Map:   0%|          | 0/14333 [00:00<?, ? examples/s]

In [None]:
# Print one rendered training prompt to eyeball the final layout.
print(train_data[5]["inputs"])

<s>### Instruction:
You are a powerful text-to-python code model. Your job is to answer questions according to the instructions given. Use the input if mentioned. 

You must output the python code that answers the question.
    
    

### Input:


### Instruction:
You are tasked with implementing a function to calculate the elapsed time in seconds between the current time and a given start time. You are provided with a code snippet that captures the start time using the `time.time()` function. Your task is to create a function that takes the start time as an argument and returns the elapsed time in seconds.

Function Signature: `def calculate_elapsed_time(start_time: float) -> float`

Example:
```
start_time = time.time()  # Assume this captures the start time
# Some time-consuming operations
elapsed_time = calculate_elapsed_time(start_time)
print(elapsed_time)  # Output: 5.23 (example value)
```

### Response:


### Response:
```python
import time

def calculate_elapsed_time(start_tim

In [None]:
# Base model handle (max_tokens=300); it is queried later in the notebook as
# the baseline for the same datapoint as the fine-tuned model.
# The slug is also passed to the fine-tuning engine.
base_model_slug = "llama2-7b-chat"
base_llm = GradientBaseModelLLM(
    base_model_slug=base_model_slug, max_tokens=300
)

In [None]:
# Wraps a model adapter addressed by a fixed id (presumably produced by an
# earlier fine-tuning run; confirm). Not referenced by the other visible cells.
my_model = GradientModelAdapterLLM(
    model_adapter_id="22e0d393-fd73-4242-b314-121740849d34_model_adapter",
    max_tokens=511,
)

In [None]:
# Configure the fine-tuning engine on top of the base model, pointing it at
# the rendered prompts in train_data.jsonl.
# NOTE(review): no call to finetune_engine.finetune() is visible in this
# notebook. Confirm that training actually ran before the adapter is
# evaluated below.
# NOTE(review): "generic_pythn_to_code" looks like a typo for "python"; it is
# a runtime value, so it is left unchanged here.
finetune_engine = GradientFinetuneEngine(
    base_model_slug=base_model_slug,
    name="generic_pythn_to_code",
    data_path="train_data.jsonl",
    verbose=True,
    max_steps=200,
    batch_size=4,
)

In [None]:
# Show the adapter id held by the engine (useful for reloading it later).
finetune_engine.model_adapter_id

'd7e5f7af-def8-4a43-993c-c526929b8074_model_adapter'

In [None]:
# LLM handle obtained from the engine via get_finetuned_model (max_tokens=511).
ft_llm = finetune_engine.get_finetuned_model(max_tokens=511)

In [None]:
def get_text2code_completion(llm, raw_datapoint):
    """Ask `llm` to complete the inference prompt built from a raw datapoint.

    Args:
        llm: Object exposing a `complete(prompt)` method.
        raw_datapoint: Mapping with "input" and "instruction" keys.

    Returns:
        The completion converted to `str`.
    """
    # output=None selects the inference variant of the prompt.
    prompt = _generate_prompt_code(
        raw_datapoint["input"], raw_datapoint["instruction"], output=None
    )
    return str(llm.complete(prompt))

In [None]:
# Pick one validation record to compare both models on, and show it.
test_datapoint = raw_val_data[1]
display(test_datapoint)

{'instruction': "How can I implement a Depth-First Search (DFS) algorithm in Python to detect cycles in directed graphs? The graph will be provided as an adjacency matrix. I also need to report the path of the cycle if one is found. It's important that the algorithm can handle deep recursion, large inputs, and manage its memory use effectively. Additionally, it should be able to handle not just connected graphs, but also disconnected graphs. Can you provide a clean, efficient, and well-documented code solution for this?",
 'input': '',
 'output': 'Sure! Here\'s an implementation of the Depth-First Search algorithm for detecting cycles in directed graphs. The algorithm is designed to handle both connected and disconnected graphs, and it uses an adjacency matrix representation.\n\nFirst, I\'ll provide you with the code for the Graph class, which includes the necessary methods to create the graph, add edges, and check for cycles:\n\n```python\nclass Graph:\n    def __init__(self, vertices

In [None]:
# Baseline: completion from the base model for the chosen validation record.
print(get_text2code_completion(base_llm, test_datapoint))

 Sure, I can help you with that! Here is a Python implementation of a Depth-First Search (DFS) algorithm to detect cycles in directed graphs:
```python
import numpy as np

def detect_cycles(graph, start=None):
    """
    Detect cycles in a directed graph using Depth-First Search.

    Parameters:
    graph (numpy array): Adjacency matrix of the graph.
    start (numpy array, optional): Starting node of the search. If None, the algorithm will start at the first unvisited node.

    Returns:
    cycle (numpy array): Path of the cycle, if found. Otherwise, None.
    """
    # Initialize the visited nodes set
    visited = set()

    # Initialize the current node
    current = start if start else np.array([0])

    # Depth-First Search
    while current != []:
        # Unvisited nodes
        unvisited = np.where(visited == [])[0]
        if not unvisited.size:
            break

        # Visit the unvisited nodes
        for node in unvisited:
            visited.add(node)
            

In [None]:
# Same validation record, answered through the fine-tuning engine's model.
print(get_text2code_completion(ft_llm, test_datapoint))

 Sure, I can help you with that! Here is a Python implementation of the Depth-First Search (DFS) algorithm to detect cycles in directed graphs:
```python
import collections

def dfs_cycle_detector(graph, start):
    """
    DFS algorithm to detect cycles in a directed graph.

    Parameters:
        graph (list of lists): The adjacency matrix of the graph.
        start (int): The starting node of the DFS traversal.

    Returns:
        (list of ints, list of ints): The path of the cycle, if one is found, else None.
    """
    # Initialize the stack and the current node
    stack = [(start, {})]
    current = start

    # Perform DFS traversal
    while stack:
        # Pop the current node and its neighbors from the stack
        current = stack.pop()
        neighbors = set(graph[current])
        for neighbor in neighbors:
            if neighbor not in stack:
                # If the neighbor is not already visited, mark it as visited
                stack.append((neighbor, {curr