In [1]:
import os

from langchain.llms import LlamaCpp

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

home = os.getenv('HOME')
model_root = os.path.join(home, 'models', 'ggml', 'codellama')
model_name = 'codellama-7b-instruct.Q4_0.gguf'
model_path = os.path.join(model_root, model_name)

params = dict(
  max_tokens=512,
  temperature=0.8,
  top_k=0,
  top_p=0.99,
  repeat_penalty=1.1,
)

llm = LlamaCpp(
  model_path=model_path,
  **params,
  stop=['</s>', '</SYS>', '\n\n\n\n\n'],
  callback_manager=callback_manager,
  verbose=True,
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/kiddos/models/ggml/codellama/codellama-7b-instruct.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32016,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight 

In [2]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

template = """<s>[INST] <<SYS>>
{system}
<</SYS>>

{user_msg} [/INST] """

prompt = PromptTemplate(template=template, input_variables=['system', 'user_msg'])
llm_chain = LLMChain(prompt=prompt, llm=llm)

system = """You are an AI assistant that writes clear and concise code.
"""

user_msg = """write unit test for the following python function:

def fib(n):
  if n <= 0:
    return 0
  if n == 1:
    return n
  else:
    return fib(n-1) + fib(n-2)

"""

result = llm_chain.run(system=system, user_msg=user_msg)

 Here is a unit test for the `fib` function:
```python
import unittest

class TestFib(unittest.TestCase):
    def test_fib_0(self):
        self.assertEqual(fib(0)), 0)
    def test_fib_1(self):
        self.assertEqual(fib(1))), 1)
    def test_fib_2(self):
        self.assertEqual(fib(2)))), 1)

if __name__ == '__main__':
    unittest.main()

```


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =   337.56 ms /   139 runs   (    2.43 ms per token,   411.78 tokens per second)
llama_print_timings: prompt eval time = 10468.72 ms /   102 tokens (  102.63 ms per token,     9.74 tokens per second)
llama_print_timings:        eval time = 21988.65 ms /   138 runs   (  159.34 ms per token,     6.28 tokens per second)
llama_print_timings:       total time = 33148.60 ms


In [3]:
user_msg = """write unit test for the following python function:
```python
def fib(n):
  if n <= 0:
    return 0
  if n == 1:
    return n
  else:
    return fib(n-1) + fib(n-2)
```
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 To write a unit test for the `fib` function, you can use the built-in `unittest` module in Python. Here's an example of how to write a unit test for the `fib` function:
1. Import the `unittest` module at the beginning of your test file. For example:
```python
import unittest
```
2. Create a new class that inherits from the `unittest.TestCase` class. This is where you define the methods for your unit tests. For example:
```python
class TestFib(unittest.TestCase):
    def test_fib_zero(self):
        self.assertEqual(0, fib(0)))
    def test_fib_one(self):
        self.assertEqual(1, fib(1)))
    def test_fib_two(self):
        self.assertEqual(1, fib(2))))
if __name__ == '__main__':
unittest.main()
```
3. In the `test_fib` method, call the `fib` function with different inputs and assert that the output is correct. For example:
```python
class TestFib(unittest.TestCase):
    def test_fib(self):
        self.assertEqual(0, fib(0)))
    def test_fib_one(self):
        self.assertEqual(1, 


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =   983.78 ms /   407 runs   (    2.42 ms per token,   413.71 tokens per second)
llama_print_timings: prompt eval time =  6420.54 ms /    61 tokens (  105.25 ms per token,     9.50 tokens per second)
llama_print_timings:        eval time = 64151.19 ms /   406 runs   (  158.01 ms per token,     6.33 tokens per second)
llama_print_timings:       total time = 72631.37 ms


In [4]:
user_msg = """implement kadane's algorithm in c++
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 Kadane's algorithm is a simple technique for finding the maximum sum of a subarray within a given array.
Here is an example implementation of Kadane's algorithm in C++:
```
#include <iostream>
using namespace std;
int kadane(int arr[], int n) {
    // Initialize variables for the maximum sum and the current sum
    int max_sum = 0, curr_sum = 0;
    // Iterate through the array from left to right
    for (int i = 0; i < n; i++) {
        // Add the current element to the current sum
        curr_sum += arr[i];
        // If the current sum is greater than the maximum sum, then update the maximum sum with the current sum
        if (curr_sum > max_sum)) {
            max_sum = curr_sum;
        }
    }
    // Return the maximum sum
    return max_sum;
}

int main() {
    // Declare and initialize an integer array with elements in the range [-10, 10]
    int arr[] = {-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    // Declare and initialize variables for t


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =  1140.05 ms /   461 runs   (    2.47 ms per token,   404.37 tokens per second)
llama_print_timings: prompt eval time =  1803.80 ms /    16 tokens (  112.74 ms per token,     8.87 tokens per second)
llama_print_timings:        eval time = 74732.00 ms /   461 runs   (  162.11 ms per token,     6.17 tokens per second)
llama_print_timings:       total time = 78945.25 ms


In [5]:
user_msg = """Can you implement a hello world widget in flutter?
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 Yes, I can help you with that! Here's an example of how to create a "Hello World" widget in Flutter:
1. Create a new Flutter project by running the following command in your terminal or command prompt: `flutter create my_app` (replace "my_app" with the name you want for your app).
2. Go into your newly created Flutter project's root directory, and run the following command to start the development server for your Flutter app: `flutter run`. This will launch your Flutter app on your Android device or iOS emulator.
3. In the main layout of your Flutter app (i.e., the layout that is defined in the `Widget build(BuildContext context) {...}` method of your main `StatefulWidget` class)), define a new `Text()` widget, and set its `text` property to the string "Hello World!" (i.e., set it to `text: 'Hello World!'`). Then, add the new `Text()` widget as a child of the root layout widget (i.e., the widget that is defined in the `Widget build(BuildContext context) {...}` method of your main `Sta


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =   974.52 ms /   398 runs   (    2.45 ms per token,   408.41 tokens per second)
llama_print_timings: prompt eval time =  1708.89 ms /    16 tokens (  106.81 ms per token,     9.36 tokens per second)
llama_print_timings:        eval time = 63247.75 ms /   397 runs   (  159.31 ms per token,     6.28 tokens per second)
llama_print_timings:       total time = 67008.28 ms


In [7]:
user_msg = """Write test for the following widget:

```dart
import 'package:flutter/material.dart';
class HelloWorldWidget extends StatelessWidget {
  @override
  Widget build(BuildContext context) {
    return Text("Hello World!");
  }
}
```
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 To write a test for the `HelloWorldWidget` widget, you can use the `testWidgets` function provided by Flutter. Here's an example of how to write a test for the `HelloWorldWidget`:
```dart
import 'package:flutter/material.dart';
void main() {
  testWidgets('Hello World Widget Test', (WidgetTester tester) async {
    await tester.pumpWidget(HelloWorldWidget()));
    expect(find.text('Hello World!')))).toBeVisible();
  });
}
```
In this example, the `testWidgets` function is used to create a widget test for the `HelloWorldWidget`. The test first pumps the widget into the widget tree using the `pumpWidget` method. It then uses the `expect` method to check that the widget's text content is "Hello World!". Finally, it checks that the widget is visible in the widget tree using the `toBeVisible` method.


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =   506.72 ms /   215 runs   (    2.36 ms per token,   424.30 tokens per second)
llama_print_timings: prompt eval time =  6896.03 ms /    67 tokens (  102.93 ms per token,     9.72 tokens per second)
llama_print_timings:        eval time = 31414.22 ms /   214 runs   (  146.80 ms per token,     6.81 tokens per second)
llama_print_timings:       total time = 39366.37 ms


In [11]:
user_msg = """Write the comment for the following code:
```cpp
void aho_corasick(TrieNode* root) {
  root->failure = root;
  queue<TrieNode*> q;
  for (int i = 0; i < K; ++i) {
    if (root->children[i]) {
      root->children[i]->failure = root;
      q.push(root->children[i]);
    } else {
      root->children[i] = root;
    }
  }

  while (!q.empty()) {
    TrieNode* node = q.front();
    q.pop();
    for (int i = 0; i < K; ++i) {
      if (node->children[i]) {
        TrieNode* f = node->failure;
        while (!f->children[i]) f = f->failure;

        f = f->children[i];
        node->children[i]->failure = f;
        if (node->children[i] != f) {
          for (int idx : f->indices) {
            node->children[i]->indices.insert(idx);
          }
        }
        q.push(node->children[i]);
      }
    }
  }
}
```
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 This is a C++ implementation of the Aho-Corasick algorithm, which is a string matching algorithm that can find all occurrences of a pattern in a given text.
The algorithm works by constructing a trie data structure from the pattern that we want to match. The trie is then used to search for matches in the text.
The code for the Aho-Corasick algorithm is quite complex, but it is well worth understanding how it works, as it can be a very useful tool for string matching tasks.


llama_print_timings:        load time =   730.30 ms
llama_print_timings:      sample time =   269.34 ms /   111 runs   (    2.43 ms per token,   412.12 tokens per second)
llama_print_timings: prompt eval time = 33759.40 ms /   312 tokens (  108.20 ms per token,     9.24 tokens per second)
llama_print_timings:        eval time = 17689.83 ms /   111 runs   (  159.37 ms per token,     6.27 tokens per second)
llama_print_timings:       total time = 52066.50 ms


## CodeLLama 13B

In [7]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

home = os.getenv('HOME')
model_root = os.path.join(home, 'models', 'ggml', 'codellama')
model_name = 'codellama-13b-instruct.Q4_0.gguf'
model_path = os.path.join(model_root, model_name)

params = dict(
  max_tokens=1024,
  temperature=0.8,
  top_k=0,
  top_p=0.99,
  repeat_penalty=1.1,
  n_ctx=2048,
)

llm = LlamaCpp(
  model_path=model_path,
  **params,
  stop=['</s>', '</SYS>', '\n\n\n\n\n'],
  callback_manager=callback_manager,
  verbose=True,
)

prompt_template = PromptTemplate(template=template, input_variables=['system', 'user_msg'])
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /home/kiddos/models/ggml/codellama/codellama-13b-instruct.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32016,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight

In [8]:
user_msg = """Write test for the following widget:
```dart
import 'package:flutter/material.dart';
class HelloWorldWidget extends StatelessWidget {
  @override
  Widget build(BuildContext context) {
    return Text("Hello World!");
  }
}
```
"""

result = llm_chain.run(system=system, user_msg=user_msg)

 Sure! Here's an example test for the `HelloWorldWidget` widget:
[test.dart](https://gist.github.com/mikelaudy/f79d23016a))]}
```dart
import 'package:flutter_test/flutter_test.dart';
void main() {
  testWidgets('HelloWorldWidget test', (WidgetTester tester) async {
    var key = new GlobalKey();


      await gesture.moveTo(const Offset(100, 20)));


llama_print_timings:        load time =  9285.77 ms
llama_print_timings:      sample time =   305.65 ms /   129 runs   (    2.37 ms per token,   422.06 tokens per second)
llama_print_timings: prompt eval time = 27440.87 ms /   100 tokens (  274.41 ms per token,     3.64 tokens per second)
llama_print_timings:        eval time = 36441.38 ms /   128 runs   (  284.70 ms per token,     3.51 tokens per second)
llama_print_timings:       total time = 64541.12 ms


In [9]:
user_msg = """Write a hello world program in c, and c++.
"""

result = llm_chain.run(system=system, user_msg=user_msg)

Llama.generate: prefix-match hit


 Here is the "Hello World" program written in C and C++:
C Program:
```c
#include <stdio.h>
int main() {
    printf("Hello, World!\n");
    return 0;
}
```
C++ Program:
```cpp
#include <iostream>

int main() {
    std::cout << "Hello, World!" << std::endl;
    return 0;
}
```
In both of the above code snippets, we first include external libraries that are required to compile and run our "Hello, World" program.

We then define a main() function which is where the bulk of our "Hello, World" program will be executed.

Within our main() function, we simply print out the string literal "Hello, World!" to the console using either the printf() function in C or the std::cout object in C++.

We then end our main() function by returning a 0 value from this function which indicates that our program executed successfully with no errors.


llama_print_timings:        load time =  9285.77 ms
llama_print_timings:      sample time =   542.30 ms /   229 runs   (    2.37 ms per token,   422.27 tokens per second)
llama_print_timings: prompt eval time =  3225.62 ms /    16 tokens (  201.60 ms per token,     4.96 tokens per second)
llama_print_timings:        eval time = 65280.53 ms /   229 runs   (  285.07 ms per token,     3.51 tokens per second)
llama_print_timings:       total time = 69628.29 ms


# Code Completion

In [18]:
home = os.getenv('HOME')
model_root = os.path.join(home, 'models', 'ggml', 'codellama')
model_name = 'codellama-7b.Q4_0.gguf'
model_path = os.path.join(model_root, model_name)

params = dict(
  max_tokens=256,
  temperature=0.5,
  top_k=0,
  top_p=0.9,
  repeat_penalty=1.1,
)

llm = LlamaCpp(
  model_path=model_path,
  **params,
  stop=['</s>', '\n\n'],
  callback_manager=callback_manager,
  verbose=True,
)

template = """{code}"""

prompt_template = PromptTemplate(template=template, input_variables=['code'])
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/kiddos/models/ggml/codellama/codellama-7b.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32016,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     

In [19]:
code = 'def fibonacci(n):\n'

result = llm_chain.run(code=code)

  if n <=1:
    return n
  else:
    return fibonacci(n-2)) + fibonacci(n-1)))
fibonacci(5)
#
#


llama_print_timings:        load time =   818.86 ms
llama_print_timings:      sample time =   112.50 ms /    48 runs   (    2.34 ms per token,   426.65 tokens per second)
llama_print_timings: prompt eval time =   818.80 ms /     8 tokens (  102.35 ms per token,     9.77 tokens per second)
llama_print_timings:        eval time =  6992.82 ms /    48 runs   (  145.68 ms per token,     6.86 tokens per second)
llama_print_timings:       total time =  8036.84 ms


In [20]:
code = 'public void main(String[] args) {\n'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


  int i = 0;
  while (i < 5)) {
    System.out.println("Hello World");
    i++;
  }
}
// https://www.youtube.com/watch?v=H_XeTj9oA&list=PLuO-6hVe8hW0xFm2y7U4&index=13
System.out.println(i); // 5


llama_print_timings:        load time =   818.86 ms
llama_print_timings:      sample time =   245.09 ms /   102 runs   (    2.40 ms per token,   416.17 tokens per second)
llama_print_timings: prompt eval time =  1029.77 ms /    10 tokens (  102.98 ms per token,     9.71 tokens per second)
llama_print_timings:        eval time = 14665.80 ms /   101 runs   (  145.21 ms per token,     6.89 tokens per second)
llama_print_timings:       total time = 16190.65 ms


In [21]:
code = 'void js_parser('

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


char *input) {
  /*
    The parser is a state machine.
    1. Start in the `JS_PARSER_EXPECTING_VALUE`
    2. When we encounter a new object/array, we set the current state to `JS_PARSER_INSIDE_OBJECT` or `JS_PARSER_INSIDE_ARRAY`.
    3. We also support nested objects and arrays.
    4. When we encounter the end of an object/array, we set the current state to `JS_PARSER_EXPECTING_VALUE`
  */
  int currentState = JS_PARSER_EXPECTING_VALUE;
  char *currentKey = NULL;
  int resultCount = 0;
  struct js_parser_result *results = NULL;
  while (input[resultCount] != '\0') {
    switch (currentState) {
      case JS_PARSER_EXPECTING_VALUE: {
        if ((input[resultCount] >= 'a' && input[resultCount] <= 'z')) {
          currentKey = &(input[resultCount]) -


llama_print_timings:        load time =   818.86 ms
llama_print_timings:      sample time =   612.58 ms /   256 runs   (    2.39 ms per token,   417.90 tokens per second)
llama_print_timings: prompt eval time =   523.26 ms /     5 tokens (  104.65 ms per token,     9.56 tokens per second)
llama_print_timings:        eval time = 37239.69 ms /   255 runs   (  146.04 ms per token,     6.85 tokens per second)
llama_print_timings:       total time = 39043.33 ms


# Q2

In [24]:
home = os.getenv('HOME')
model_root = os.path.join(home, 'models', 'ggml', 'codellama')
model_name = 'codellama-7b.Q2_K.gguf'
model_path = os.path.join(model_root, model_name)

params = dict(
  max_tokens=512,
  temperature=0.5,
  top_k=0,
  top_p=0.9,
  repeat_penalty=1.1,
)

llm = LlamaCpp(
  model_path=model_path,
  **params,
  stop=['</s>', '\n\n'],
  callback_manager=callback_manager,
  verbose=True,
)

template = """{code}"""

prompt_template = PromptTemplate(template=template, input_variables=['code'])
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/kiddos/models/ggml/codellama/codellama-7b.Q2_K.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q2_K     [  4096, 32016,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q3_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q2_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q3_K     

In [25]:
code = 'void js_parser('

result = llm_chain.run(code=code)

const char *code, struct parser_state *state)
  {
    int ret;
    ret = js_parse(js_get_parser(), code), state->error = (ret < 0)), state->result = (ret > 0))));
  }


llama_print_timings:        load time =   504.91 ms
llama_print_timings:      sample time =   144.44 ms /    62 runs   (    2.33 ms per token,   429.26 tokens per second)
llama_print_timings: prompt eval time =   504.88 ms /     6 tokens (   84.15 ms per token,    11.88 tokens per second)
llama_print_timings:        eval time =  6855.46 ms /    61 runs   (  112.38 ms per token,     8.90 tokens per second)
llama_print_timings:       total time =  7641.57 ms


In [26]:
code = 'void kandane('

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


int *a, int n) {
 #ifdef _MSC_VER
  __declspec(align(16))))
  int xmm[4];
 #else
  int xmm[4] __attribute__((aligned(16))));
 #endif


llama_print_timings:        load time =   504.91 ms
llama_print_timings:      sample time =   144.35 ms /    62 runs   (    2.33 ms per token,   429.52 tokens per second)
llama_print_timings: prompt eval time =   338.50 ms /     4 tokens (   84.63 ms per token,    11.82 tokens per second)
llama_print_timings:        eval time =  6845.60 ms /    61 runs   (  112.22 ms per token,     8.91 tokens per second)
llama_print_timings:       total time =  7464.71 ms


In [27]:
code = 'def tokenize(text):\n'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


 # TODO: Tokenizer should be configurable.
  return text.split()
def stem_tokens(tokens):
    stemmed = []
    for token in tokens:
        stemmed.append(PorterStemmer().stem(token)))))))))).lower())
    return stemmed


llama_print_timings:        load time =   504.91 ms
llama_print_timings:      sample time =   163.14 ms /    70 runs   (    2.33 ms per token,   429.07 tokens per second)
llama_print_timings: prompt eval time =   572.29 ms /     7 tokens (   81.76 ms per token,    12.23 tokens per second)
llama_print_timings:        eval time =  7771.96 ms /    69 runs   (  112.64 ms per token,     8.88 tokens per second)
llama_print_timings:       total time =  8663.29 ms


## 13B

In [48]:
home = os.getenv('HOME')
model_root = os.path.join(home, 'models', 'ggml', 'codellama')
model_name = 'codellama-13b.Q2_K.gguf'
model_path = os.path.join(model_root, model_name)

params = dict(
  max_tokens=512,
  temperature=0.75,
  top_k=0,
  top_p=0.99,
  repeat_penalty=1.1,
)

llm = LlamaCpp(
  model_path=model_path,
  **params,
  stop=['</s>', '\n\n'],
  callback_manager=callback_manager,
  verbose=True,
)

template = """{code}"""

prompt_template = PromptTemplate(template=template, input_variables=['code'])
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /home/kiddos/models/ggml/codellama/codellama-13b.Q2_K.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q2_K     [  5120, 32016,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q3_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q3_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q3_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q2_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q3_K    

In [49]:
code = 'void js_parser('

result = llm_chain.run(code=code)

void);


llama_print_timings:        load time =  1009.83 ms
llama_print_timings:      sample time =     9.62 ms /     4 runs   (    2.41 ms per token,   415.58 tokens per second)
llama_print_timings: prompt eval time =  1009.67 ms /     6 tokens (  168.28 ms per token,     5.94 tokens per second)
llama_print_timings:        eval time =   664.21 ms /     3 runs   (  221.40 ms per token,     4.52 tokens per second)
llama_print_timings:       total time =  1692.32 ms


In [50]:
code = 'def tokenize(text):\n'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


    return text.split()
def sanitize(text):
    return ' '.join(tokenize(text))
# Tokenization with Stemming
import re   # For regular expressions
from nltk import word_tokenize, pos_tag
# Determine whether a given word is a proper noun or not.
def isProperNoun(word)):
    if (re.search('[A-Z]+')$, word)))):
        return True
    else:
        return False
# Tag Proper Nouns and POS Tag everything else.
def tag_proper_nouns(sentence))) :
    # Find proper noun and its associated part of speech tag (e.g P, VB etc)
    proper_nouns = []  # Initialize empty list to store all the words that are proper nouns in the sentence
    parts_of_speech = [] # Initialize an empty list to store all the words parts of speech in the sentence
    for word, part_of_speech in pos_tag(word_tokenize(sentence)))):
        if ((re.search('[A-Z]+')$, word)) and (part_of_speech != 'NNP'))) : # Check whether the word is a proper noun or not by checking whether the part of speech tag for that particular word in 


llama_print_timings:        load time =  1009.83 ms
llama_print_timings:      sample time =  1202.51 ms /   504 runs   (    2.39 ms per token,   419.12 tokens per second)
llama_print_timings: prompt eval time =  1147.06 ms /     7 tokens (  163.87 ms per token,     6.10 tokens per second)
llama_print_timings:        eval time = 114592.10 ms /   503 runs   (  227.82 ms per token,     4.39 tokens per second)
llama_print_timings:       total time = 118346.15 ms


In [51]:
code = '// write a function that adds 2 number\n'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


const add = (x, y) => {
    let result = x + y;
    console.log(result));
}


llama_print_timings:        load time =  1009.83 ms
llama_print_timings:      sample time =    73.96 ms /    31 runs   (    2.39 ms per token,   419.13 tokens per second)
llama_print_timings: prompt eval time =  1696.99 ms /    10 tokens (  169.70 ms per token,     5.89 tokens per second)
llama_print_timings:        eval time =  6526.38 ms /    30 runs   (  217.55 ms per token,     4.60 tokens per second)
llama_print_timings:       total time =  8369.98 ms


In [52]:
code = '## write a function that adds 2 number\n'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit



def add(a, b):
    return a+b


llama_print_timings:        load time =  1009.83 ms
llama_print_timings:      sample time =    38.49 ms /    16 runs   (    2.41 ms per token,   415.71 tokens per second)
llama_print_timings: prompt eval time =  1688.95 ms /    10 tokens (  168.90 ms per token,     5.92 tokens per second)
llama_print_timings:        eval time =  3337.04 ms /    15 runs   (  222.47 ms per token,     4.50 tokens per second)
llama_print_timings:       total time =  5103.29 ms


In [55]:
code = '# write an app with fast api'

result = llm_chain.run(code=code)

Llama.generate: prefix-match hit


 and uvicorn
- [x] install poetry and create a new project 
```bash
poetry init
```


llama_print_timings:        load time =  1009.83 ms
llama_print_timings:      sample time =    66.79 ms /    28 runs   (    2.39 ms per token,   419.21 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  6076.82 ms /    28 runs   (  217.03 ms per token,     4.61 tokens per second)
llama_print_timings:       total time =  6208.12 ms
