In [1]:
import pandas as pd
import dspy
import re

import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Wiring DSPy to Phoenix

In [3]:
readme_df = pd.read_json("../output/paperswithcode_with_readmes.json.gz")

In [4]:
import ast

readme_df["tasks"] = readme_df["tasks"].apply(ast.literal_eval)#.explode().drop_duplicates()

In [5]:
readme_df["tasks"].apply(len).describe()

count    73326.000000
mean         2.959455
std          2.744033
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        101.000000
Name: tasks, dtype: float64

In [6]:
idx = 4
example_repo = readme_df.iloc[idx]["repo"]
print(readme_df.iloc[idx]["readme"])


# SincNet
SincNet is a neural architecture for processing **raw audio samples**. It is a novel Convolutional Neural Network (CNN) that encourages the first convolutional layer to discover more **meaningful filters**. SincNet is based on parametrized sinc functions, which implement band-pass filters.

In contrast to standard CNNs, that learn all elements of each filter, only low and high cutoff frequencies are directly learned from data with the proposed method. This offers a very compact and efficient way to derive a **customized filter bank** specifically tuned for the desired application. 

This project releases a collection of codes and utilities to perform speaker identification with SincNet.
An example of speaker identification with the TIMIT database is provided. If you are interested in **SincNet applied to speech recognition you can take a look into the PyTorch-Kaldi github repository (https://github.com/mravanelli/pytorch-kaldi).** 

<img src="https://github.com/mravanelli/Si

In [23]:
python_files_df = pd.read_feather("../output/selected_python_code.feather")

In [24]:
python_files_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297731 entries, 0 to 297730
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   content        297731 non-null  object
 1   path           297731 non-null  object
 2   repo_name      297731 non-null  object
 3   tasks          297731 non-null  object
 4   selected_code  270093 non-null  object
dtypes: object(5)
memory usage: 11.4+ MB


In [25]:
example_repo 

'008karan/SincNet_demo'

In [26]:
from github_search.lms.code2documentation import Code2Documentation, run_code2doc

In [27]:
lm_base_url="http://localhost:11430"
small_lm_base_url="http://localhost:11431"
lm_model_name = "codellama"
ollama_lm = dspy.OllamaLocal(
            model=lm_model_name,
            base_url=lm_base_url,
            num_ctx=4096,
            max_tokens=1024,
            top_k=100,
)
small_ollama_lm = dspy.OllamaLocal(
    model=lm_model_name,
    base_url=small_lm_base_url,
    num_ctx=1024,
    max_tokens=256,
    top_k=100,
)

In [29]:
run_code2doc(python_files_df[python_files_df["repo_name"] == example_repo], [small_ollama_lm, ollama_lm], 10)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:26<00:00, 26.52s/it]


Unnamed: 0,rationale,answer,repo_name,context_history,filenames,n_files
0,This repository tackles the problem of speaker...,This repository tackles the problem of speaker...,008karan/SincNet_demo,1. `sincnet.py`: This file contains a PyTorch ...,"[compute_d_vector.py, data_io.py, speaker_id.p...",7


## Multiple OLlama models

In [17]:
python_files_df["repo_name"].isin(readme_df["repo"]).mean()

1.0

In [68]:
ollama_lms = [
    dspy.OllamaLocal(model="codellama", base_url=f"http://localhost:1143{i}", num_ctx=2048, max_tokens=128, timeout_s=180)
    for i in range(2)
]

## Cohere

There is a bug in DSPy version from pip

In [19]:
from llms_dspy.dspy_lm_modules import Claude, Cohere

In [69]:
claude_haiku_name = "anthropic.claude-3-haiku-20240307-v1:0"

with open("/home/kuba/.keys/anthropic_key.txt") as f:
    api_key = f.read().strip()
    anthropic_lms = [
        Claude(model="claude-3-haiku-20240307", api_key=api_key, api_base="https://api.anthropic.com", max_tokens=256)
        for i in range(3)
    ]
    del api_key

In [70]:
lms = ollama_lms

In [71]:
def warmup_lm(lm, prompt="print fibonacci function in Python"):
    return lm(prompt)

In [72]:
anthropic_lms[0].kwargs

{'temperature': 0.0,
 'max_tokens': 256,
 'top_p': 1.0,
 'top_k': 1,
 'n': 1,
 'model': 'claude-3-haiku-20240307'}

In [73]:
warmup_lm(anthropic_lms[0])

['Here\'s a Python function that prints the Fibonacci sequence up to a given number of terms:\n\n```python\ndef fibonacci(n):\n    """\n    Prints the Fibonacci sequence up to the nth term.\n    \n    Args:\n        n (int): The number of terms to print in the Fibonacci sequence.\n    """\n    a, b = 0, 1\n    print(a)\n    print(b)\n    \n    for i in range(2, n):\n        c = a + b\n        print(c)\n        a, b = b, c\n```\n\nTo use this function, simply call it with the desired number of terms:\n\n```python\nfibonacci(10)\n```\n\nThis will output the first 10 Fibonacci numbers:\n\n```\n0\n1\n1\n2\n3\n5\n8\n13\n21\n34\n```\n\nHere\'s how the `fibonacci()` function works:\n\n1. The function takes a single argument `n`, which represents the number of terms to print in the Fibonacci sequence.\n2. The initial values of the Fibonacci sequence are set to `a = 0` and `b']


import logging
import os
from typing import Any, Optional

import backoff

from dsp.modules.lm import LM

try:
    import anthropic
    anthropic_rate_limit = anthropic.RateLimitError
except ImportError:
    anthropic_rate_limit = Exception



logger = logging.getLogger(__name__)
BASE_URL = "https://api.anthropic.com/v1/messages"


def backoff_hdlr(details):
    """Handler from https://pypi.org/project/backoff/."""
    print(
        "Backing off {wait:0.1f} seconds after {tries} tries "
        "calling function {target} with kwargs "
        "{kwargs}".format(**details),
    )


def giveup_hdlr(details):
    """Wrapper function that decides when to give up on retry."""
    if "rate limits" in details.message:
        return False
    return True



class Claude(LM):
    """Wrapper around anthropic's API. Supports both the Anthropic and Azure APIs."""

    def __init__(
        self,
        model: str = "claude-3-opus-20240229",
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        only_completed = False,
        **kwargs,
    ):
        super().__init__(model)
        try:
            from anthropic import Anthropic
        except ImportError as err:
            raise ImportError(
                "Claude requires `pip install anthropic`.") from err

        self.provider = "anthropic"
        self.api_key = api_key = os.environ.get(
            "ANTHROPIC_API_KEY") if api_key is None else api_key
        self.api_base = BASE_URL if api_base is None else api_base
        self.kwargs = {
            "temperature": kwargs.get("temperature", 0.0),
            "max_tokens": min(kwargs.get("max_tokens", 4096), 4096),
            "top_p": kwargs.get("top_p", 1.0),
            "top_k": kwargs.get("top_k", 1),
            "n": kwargs.pop("n", kwargs.pop("num_generations", 1)),
            **kwargs,
        }
        self.kwargs["model"] = model
        self.history: list[dict[str, Any]] = []
        self.client = Anthropic(api_key=api_key)
        self.only_completed = only_completed

    def log_usage(self, response):
        """Log the total tokens from the Anthropic API response."""
        usage_data = response.usage
        if usage_data:
            total_tokens = usage_data.input_tokens + usage_data.output_tokens
            logger.info(f'{total_tokens}')

    def basic_request(self, prompt: str, **kwargs):
        raw_kwargs = kwargs
        kwargs = {**self.kwargs, **kwargs}
        # caching mechanism requires hashable kwargs
        kwargs["messages"] = [{"role": "user", "content": prompt}]
        kwargs.pop("n")
        response = self.client.messages.create(**kwargs)
        history = {
            "prompt": prompt,
            "response": response,
            "kwargs": kwargs,
            "raw_kwargs": raw_kwargs,
        }
        self.history.append(history)
        return response

    @backoff.on_exception(
        backoff.expo,
        (anthropic_rate_limit),
        max_time=1000,
        max_tries=8,
        on_backoff=backoff_hdlr,
        giveup=giveup_hdlr,
    )
    def request(self, prompt: str, **kwargs):
        """Handles retrieval of completions from Anthropic whilst handling API errors."""
        return self.basic_request(prompt, **kwargs)

    def __call__(self, prompt, return_sorted=False, **kwargs):
        """Retrieves completions from Anthropic.

        Args:
            prompt (str): prompt to send to Anthropic
            only_completed (bool, optional): return only completed responses and ignores completion due to length. Defaults to True.
            return_sorted (bool, optional): sort the completion choices using the returned probabilities. Defaults to False.

        Returns:
            list[str]: list of completion choices
        """
        only_completed = self.only_completed
        #assert only_completed, "for now"
        #assert return_sorted is False, "for now"
        # per eg here: https://docs.anthropic.com/claude/reference/messages-examples
        # max tokens can be used as a proxy to return smaller responses
        # so this cannot be a proper indicator for incomplete response unless it isnt the user-intent.
        n = kwargs.pop("n", 1)
        completions = []
        for _ in range(n):
            response = self.request(prompt, **kwargs)
            # TODO: Log llm usage instead of hardcoded openai usage
            # if dsp.settings.log_openai_usage:
            #     self.log_usage(response)
            print(response)
            if only_completed and response.stop_reason == "max_tokens":
                continue
            completions = [c.text for c in response.content]
        return completions



In [74]:
#%%time

#list(ThreadPoolExecutor(max_workers=len(lms)).map(warmup_lm, lms))

In [75]:
dspy.configure(lm=lms[0])

In [580]:
#dspy.configure(lm=codellama)

#example_code = example_repo_files_df["content"].iloc[0]

repo_summarizer = RepoCodeSummarizer(fetch_code)
#code_summarizer(code=[example_code])

In [77]:
example_repo = python_files_df["repo_name"].unique()[1]

print(example_repo)
fetch_code(example_repo)

Chenyang-Lu/semantic-foreground-inpainting


Unnamed: 0,content,path,repo_name,tasks
1,import torch\nimport math\nimport numbers\nimp...,ours_model.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]"
7721,import pandas as pd\nimport os\nimport torch\n...,cs_data_loader.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]"
23591,from collections import OrderedDict\nimport ma...,ours_extractors.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]"
96348,import torch\nimport torch.nn as nn\nimport to...,ours_test.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]"
154317,import torch\nimport torch.functional as F\nim...,util.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]"


In [78]:
print(readme_df[readme_df["repo"] == example_repo].iloc[0]["tasks"])

['scene understanding', 'semantic segmentation']


In [79]:
print(readme_df[readme_df["repo"] == example_repo].iloc[0]["readme"])

# semantic-foreground-inpainting

Codes and data of paper ''Semantic Foreground Inpainting from Weak Supervision'', IEEE Robotics and Automation Letters.

Inplementation of the main experiment on the Cityscapes dataset. For KITTI, manual annotations are provided.

## Datasets
Two manually annotated datasets for testing are released:

1. dataset/Cityscapes/gt_manual/gt_manual.zip

2. dataset/KITTI/gt_manual/gt_manual.zip


## Citation

```
@article{Lu2020ral,
author = {Lu, Chenyang and Dubbelman, Gijs},
journal = {IEEE Robotics and Automation Letters},
title = {{Semantic Foreground Inpainting from Weak Supervision}},
year = {2020}
}
```



In [103]:
repo_summarizer_answers = []


for repo_name in tqdm.tqdm(python_files_df["repo_name"].unique()[:100]):
    repo_summarizer_answers.append(repo_summarizer(repo_name, lms))

  0% 0/100 [00:00<?, ?it/s]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  1% 1/100 [00:19<31:42, 19.22s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  2% 2/100 [00:38<31:08, 19.07s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  3% 3/100 [00:55<29:39, 18.35s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  4% 4/100 [01:13<29:05, 18.18s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  5% 5/100 [01:30<27:47, 17.56s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  6% 6/100 [01:47<27:24, 17.50s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  7% 7/100 [02:04<26:56, 17.39s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  8% 8/100 [02:23<27:26, 17.90s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


  9% 9/100 [02:42<27:50, 18.36s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 10% 10/100 [03:02<28:13, 18.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 11% 11/100 [03:19<26:49, 18.08s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 12% 12/100 [03:38<26:53, 18.34s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 13% 13/100 [03:57<27:13, 18.77s/it]

running lm no. 0
running lm no. 1
running lm no. 2running lm no. 3

running lm no. 4


 14% 14/100 [04:15<26:29, 18.48s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 15% 15/100 [04:32<25:35, 18.06s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 16% 16/100 [04:55<27:10, 19.40s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 17% 17/100 [05:13<26:16, 18.99s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 18% 18/100 [05:29<24:59, 18.28s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 19% 19/100 [05:50<25:39, 19.01s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 20% 20/100 [06:08<24:52, 18.65s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 21% 21/100 [06:29<25:22, 19.27s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 22% 22/100 [06:48<25:06, 19.31s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 23% 23/100 [07:06<24:04, 18.76s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 24% 24/100 [07:25<23:56, 18.90s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 25% 25/100 [07:44<23:48, 19.05s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 26% 26/100 [08:03<23:33, 19.10s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 27% 27/100 [08:24<23:43, 19.49s/it]

running lm no. 0
running lm no. 1
running lm no. 2running lm no. 3

running lm no. 4


 28% 28/100 [08:45<23:51, 19.88s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 29% 29/100 [09:04<23:27, 19.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 30% 30/100 [09:22<22:19, 19.13s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 31% 31/100 [09:42<22:19, 19.42s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 32% 32/100 [10:02<22:11, 19.58s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 33% 33/100 [10:20<21:28, 19.23s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 34% 34/100 [10:39<20:56, 19.03s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 35% 35/100 [10:59<20:51, 19.26s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 36% 36/100 [11:18<20:37, 19.33s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 37% 37/100 [11:40<20:57, 19.96s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 38% 38/100 [11:56<19:40, 19.04s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 39% 39/100 [12:16<19:26, 19.12s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 40% 40/100 [12:35<19:14, 19.23s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 41% 41/100 [12:53<18:29, 18.80s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 42% 42/100 [13:13<18:23, 19.03s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3


 43% 43/100 [13:28<17:01, 17.92s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 44% 44/100 [13:47<17:09, 18.39s/it]

running lm no. 0
running lm no. 1
running lm no. 2running lm no. 3

running lm no. 4


 45% 45/100 [14:05<16:36, 18.12s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3


 46% 46/100 [14:20<15:22, 17.09s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 47% 47/100 [14:36<14:51, 16.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 48% 48/100 [14:57<15:40, 18.08s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 49% 49/100 [15:17<15:52, 18.68s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 50% 50/100 [15:33<14:55, 17.92s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 51% 51/100 [15:52<14:59, 18.36s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 52% 52/100 [16:11<14:50, 18.55s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 53% 53/100 [16:29<14:11, 18.11s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 54% 54/100 [16:48<14:11, 18.50s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 55% 55/100 [17:07<14:01, 18.71s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 56% 56/100 [17:26<13:44, 18.74s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 57% 57/100 [17:46<13:44, 19.17s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 58% 58/100 [18:07<13:47, 19.70s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 59% 59/100 [18:27<13:33, 19.85s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 60% 60/100 [18:46<13:02, 19.57s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 61% 61/100 [19:03<12:09, 18.70s/it]

running lm no. 0
running lm no. 1
running lm no. 2running lm no. 3

running lm no. 4


 62% 62/100 [19:21<11:41, 18.46s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 63% 63/100 [19:40<11:26, 18.56s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 64% 64/100 [19:58<11:08, 18.56s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 65% 65/100 [20:18<11:07, 19.07s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 66% 66/100 [20:38<10:55, 19.29s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 67% 67/100 [20:58<10:43, 19.51s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 68% 68/100 [21:19<10:36, 19.89s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 69% 69/100 [21:39<10:14, 19.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 70% 70/100 [21:57<09:45, 19.51s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 71% 71/100 [22:16<09:16, 19.18s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 72% 72/100 [22:35<09:01, 19.32s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 73% 73/100 [22:56<08:55, 19.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 74% 74/100 [23:15<08:24, 19.38s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 75% 75/100 [23:36<08:14, 19.78s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 76% 76/100 [23:56<07:56, 19.85s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 77% 77/100 [24:13<07:18, 19.06s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 78% 78/100 [24:30<06:46, 18.47s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 79% 79/100 [24:49<06:31, 18.62s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 80% 80/100 [25:08<06:17, 18.86s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 81% 81/100 [25:27<05:59, 18.91s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 82% 82/100 [25:45<05:36, 18.68s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 83% 83/100 [26:05<05:20, 18.84s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 84% 84/100 [26:27<05:17, 19.82s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 85% 85/100 [26:48<05:04, 20.27s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 86% 86/100 [27:07<04:39, 19.96s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 87% 87/100 [27:24<04:06, 18.93s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 88% 88/100 [27:42<03:45, 18.82s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 89% 89/100 [28:01<03:25, 18.72s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 90% 90/100 [28:21<03:12, 19.21s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 91% 91/100 [28:40<02:50, 18.94s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 92% 92/100 [28:58<02:29, 18.68s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 93% 93/100 [29:14<02:06, 18.14s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 94% 94/100 [29:34<01:50, 18.46s/it]

running lm no. 0running lm no. 1

running lm no. 2
running lm no. 3
running lm no. 4


 95% 95/100 [29:51<01:31, 18.27s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 96% 96/100 [30:12<01:15, 18.89s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 97% 97/100 [30:32<00:57, 19.15s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 98% 98/100 [30:48<00:36, 18.34s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 99% 99/100 [31:08<00:18, 18.88s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


100% 100/100 [31:26<00:00, 18.86s/it]


In [1273]:
type(dict(repo_summarizer_answers[0]))

dict

In [1]:
import phoenix as px
from pathlib import Path
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
directory = str(Path('~/Projects/torch_example/phoenix/sample_2k').expanduser())
os.makedirs(directory, exist_ok=True)

my_traces = px.Client().get_trace_dataset().save(directory=directory)

  if is_datetime64tz_dtype(timestamps):
  if is_datetime64tz_dtype(timestamps):


💾 Trace dataset saved to under ID: 353a22a7-b529-4d9d-a4ec-75f442aa3eb7
📂 Trace dataset path: /home/kuba/Projects/torch_example/phoenix/sample_2k/trace_dataset-353a22a7-b529-4d9d-a4ec-75f442aa3eb7.parquet


In [106]:
for repo_name, answer in zip(python_files_df["repo_name"].unique()[:10], repo_summarizer_answers):
    print("#" * 50)
    print(repo_name)
    print()
    print(readme_df[readme_df["repo"] == repo_name]["tasks"].iloc[0])
    print("#" * 50)
    print()
    print(answer["answer"])
    print()

##################################################
braincorp/ASC

['visual tracking']
##################################################

The 'braincorp/ASC' repository appears to be a machine learning library for image processing and computer vision tasks. It provides functionalities such as image sparsification, frame reading, and sparse coding, which suggest that it tackles various machine learning problems related to image processing and computer vision, such as image compression, denoising, and feature extraction. The data used by the repository is likely images or videos, as the code deals with frame reading and sparse coding of video frames.

##################################################
Chenyang-Lu/semantic-foreground-inpainting

['scene understanding', 'semantic segmentation']
##################################################

This repository tackles the problem of semantic foreground inpainting using PyTorch. It uses a pre-trained ResNet18 backbone and adds a PSP module

In [97]:
answer["answer"].split(". ")

['The given code is a Python script that performs various tasks related to prescriptive process monitoring',
 'The repository tackles the machine learning problem of predicting process performance based on historical data',
 'The code uses a dataset of process data, which includes numerical and categorical variables, to train and evaluate a machine learning model for predicting process performance.\n\nThe code also defines several functions, such as `calculate_cost` and `evaluate_model_cost`, which are likely to be used in the training and evaluation process',
 'The script appears to be designed for a specific dataset, as it reads data from a CSV file using']

In [61]:
repo_summarizer_answer["answer"]

'This repository tackles the problem of semantic foreground inpainting on 3D medical images with their corresponding segmentations. The repository provides a PyTorch implementation of a semantic segmentation model that can perform foreground inpainting using a pre-trained ResNet18 backbone with a PSP module on top of it.'

In [None]:
0.5 * len(python_files_df["repo_name"].unique()) / 60

## How many tokens per repo do we have?

In [190]:
import tiktoken

In [191]:
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [62]:
files_sample = python_files_df[python_files_df["repo_name"].isin(python_files_df["repo_name"].unique())].copy()

In [65]:
%%time
file_tokens = [
    len(toks) for toks in tokenizer.encode_batch(files_sample["content"], allowed_special={'<|endoftext|>'}, num_threads=16)
]

CPU times: user 4min 56s, sys: 12.1 s, total: 5min 8s
Wall time: 32.3 s


In [66]:
files_sample["n_tokens"] = file_tokens

In [67]:
tokens_per_repo = files_sample.groupby("repo_name")["n_tokens"].agg("mean")

In [68]:
tokens_per_repo.describe().round(3)

count      38038.000
mean        2074.254
std        10616.494
min            8.000
25%          984.400
50%         1473.400
75%         2238.692
max      1599139.100
Name: n_tokens, dtype: float64

In [69]:
tokens_per_repo[tokens_per_repo < tokens_per_repo.quantile(0.90)].sum()

50781731.95793651

In [71]:
tokens_per_repo[tokens_per_repo < tokens_per_repo.quantile(0.90)].sum() / 10 ** 6

50.78173195793651

In [55]:
len(python_files_df["repo_name"].unique())

38038

## Filtering code files

In [528]:
import abc
from pydantic import BaseModel
from typing import List
from comment_parser import comment_parser
import re


class PythonCodeSelector(abc.ABC):

    @abc.abstractmethod
    def extract(self, code) -> List[dict]:
        pass

    def extract_str(self, code):
        return [selection["text"] for selection in self.extract(code)]

    def select_code(self, code):
        matches = self.extract(code)
        selected_code = "\n...\n".join([m["text"] for m in merge_matches(matches)])
        return selected_code

    @classmethod
    def _merge_matches(cls, matches):
        merged_matches = []
        tmp_match = matches[0]
        for match in matches[1:]:
            if match["match_type"] == tmp_match["match_type"] and match["line_start"] == tmp_match["line_end"] + 1:
                tmp_match = {
                    "text": tmp_match["text"] + "\n" + match["text"],
                    "line_start": tmp_match["line_start"],
                    "line_end": match["line_end"],
                    "match_type": tmp_match["match_type"]
                }
            else:
                merged_matches.append(tmp_match)
                tmp_match = match
        return merged_matches

class CombinedSelector(PythonCodeSelector, BaseModel):
    selectors: List[PythonCodeSelector]
    
    def extract(self, code):
        extracted_parts = []
        for selector in self.selectors:
            extracted_parts += selector.extract(code)
        return sorted(extracted_parts, key=lambda r: r["line_start"])

    
    class Config:
        arbitrary_types_allowed=True


class CommentSelector(PythonCodeSelector):

    def extract(self, code):
        comments = comment_parser.python_parser.extract_comments(code)
        return [{"text": "#" + c.text(), "line_start": c.line_number(), "line_end": c.line_number(), "match_type": "comment"} for c in comments]

    

class SignatureSelector(PythonCodeSelector, BaseModel):
    
    pattern: re.Pattern = re.compile("(\s+ def|class) (.*:$)", re.MULTILINE)
    
    def extract(self, code):
        re_newline = re.compile(r'\n')
        matches = []
        for match in self.pattern.finditer(code):
            start = match.start()
            line_start = code.count("\n", 0, match.start())
            line_offset = code.count("\n", start, match.end()) + 1
            s = match.group()
            matches.append({"text": s, "line_start": line_start, "line_end": line_start + line_offset, "match_type": "signature"})
        return matches

In [529]:
selector = CombinedSelector(selectors=[CommentSelector(), SignatureSelector()])

In [537]:
selected_python_code_contents = []

for code in tqdm.tqdm(python_files_df["content"]):
    try:
        selected_python_code_contents.append(selector.select_code(code))
    except KeyboardInterrupt:
        break
    except:
        selected_python_code_contents.append(None)

100% 297731/297731 [13:13<00:00, 375.33it/s]  


In [546]:
python_files_df["selected_code"] = selected_python_code_contents

In [1189]:
python_files_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297731 entries, 0 to 297730
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   content        297731 non-null  object
 1   path           297731 non-null  object
 2   repo_name      297731 non-null  object
 3   tasks          297731 non-null  object
 4   selected_code  270093 non-null  object
dtypes: object(5)
memory usage: 11.4+ MB


In [1194]:
python_files_df.to_json("../output/generated_readmes/python_files_with_selected_code.jsonl", orient="records", lines=True)

In [1199]:
python_files_df.to_feather("../output/generated_readmes/python_files_with_selected_code.feather")

In [1200]:
pd.read_feather("../output/generated_readmes/python_files_with_selected_code.feather")

Unnamed: 0,content,path,repo_name,tasks,selected_code
0,# ============================================...,ASC/frame_reader.py,braincorp/ASC,[visual tracking],# ============================================...
1,import torch\nimport math\nimport numbers\nimp...,ours_model.py,Chenyang-Lu/semantic-foreground-inpainting,"[scene understanding, semantic segmentation]",class PSPModule(nn.Module):\n...\n\n def __...
2,# License: MIT License https://github.com/pass...,examples/linear_outofsample_mutiple.py,passalis/sef,"[dimensionality reduction, supervised dimensio...",# License: MIT License https://github.com/pass...
3,import numpy\nfrom . import torch\nimport elas...,elasticdeform/torch.py,gvtulder/elasticdeform,"[medical image segmentation, semantic segmenta...",class ElasticDeform(torch.autograd.Function):\...
4,"# -*- coding: utf-8 -*-\n""""""\nCreated on Tue A...",affective/linguisticResourceLIWCAffect.py,dadangewp/SemEval2017-RumourEval,"[stance classification, misinformation, classi...",# -*- coding: utf-8 -*-\n...\nclass LIWCAffect...
...,...,...,...,...,...
297726,"""""""Dataset""""""\n\n\n\nimport codecs\n\nimport c...",finetune/dataset.py,mkavim/finetune_bert,"[linear probe classification, language modelli...",class Dataset(object):\n...\n\n\n def __ini...
297727,import os\nimport json\nimport isodate\nfrom a...,ClassesAndUtil/Video.py,ucnet01/UCNet_Implementation,[general classification],"# base_dir = os.path.join('..','YouTube-Spam-D..."
297728,import tensorflow as tf\nimport tensorflow.con...,ops.py,taki0112/SPADE-Tensorflow,"[image to image translation, image generation]",##############################################...
297729,import lasagne\nfrom theano import sparse\nimp...,layers.py,kimiyoung/planetoid,"[entity extraction using gan, node classificat...",class DenseLayer(lasagne.layers.Layer):\n...\n...


## Summaries based on selected code

In [575]:
def fetch_selected_code(repo_name, n=5):
    selected_python_files =  python_files_df[python_files_df["repo_name"] == repo_name].iloc[:5]
    return selected_python_files["path"], selected_python_files["selected_code"]


repo_selected_code_summarizer = RepoCodeSummarizer(fetch_selected_code)

In [576]:
%%time

example_repo_summary = repo_selected_code_summarizer(example_repo, lms=lms)

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4
CPU times: user 88 ms, sys: 0 ns, total: 88 ms
Wall time: 14.7 s


In [582]:
for repo_name in tqdm.tqdm(python_files_df["repo_name"].unique()[:5]):
    example_repo_summary = repo_selected_code_summarizer(repo_name, lms=lms)

  0% 0/5 [00:00<?, ?it/s]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 20% 1/5 [00:13<00:52, 13.03s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 40% 2/5 [00:28<00:44, 14.68s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 60% 3/5 [00:41<00:27, 13.55s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 80% 4/5 [00:57<00:14, 14.76s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


100% 5/5 [01:13<00:00, 14.73s/it]


In [581]:
for repo_name in tqdm.tqdm(python_files_df["repo_name"].unique()[:10]):
    example_repo_summary = repo_summarizer(repo_name, lms=lms)

  0% 0/10 [00:00<?, ?it/s]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 10% 1/10 [00:19<02:53, 19.24s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 20% 2/10 [00:38<02:34, 19.27s/it]

running lm no. 0
running lm no. 1
running lm no. 2running lm no. 3

running lm no. 4


 30% 3/10 [00:57<02:13, 19.00s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 40% 4/10 [01:15<01:52, 18.74s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 50% 5/10 [01:33<01:31, 18.35s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 60% 6/10 [01:51<01:12, 18.22s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 70% 7/10 [02:08<00:53, 17.83s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 80% 8/10 [02:27<00:36, 18.24s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


 90% 9/10 [02:47<00:18, 18.69s/it]

running lm no. 0
running lm no. 1
running lm no. 2
running lm no. 3
running lm no. 4


100% 10/10 [03:07<00:00, 18.73s/it]


In [578]:
python_files_df["repo_name"].unique()[:10]

array(['braincorp/ASC', 'Chenyang-Lu/semantic-foreground-inpainting',
       'passalis/sef', 'gvtulder/elasticdeform',
       'dadangewp/SemEval2017-RumourEval', 'EmoryMLIP/DOvsOD_NeuralODEs',
       'Jorge-Mendes/darknet-google-colab',
       'CN-TU/reinforcement-learning-for-per-flow-buffer-sizing',
       'kinglintianxia/KittiSeg',
       'samadeusfp/prescriptiveProcessMonitoring'], dtype=object)

In [553]:
%%time
selected_code_n_tokens = python_files_df["selected_code"].dropna().apply(lambda selected_code: len(tokenizer.encode(selected_code, allowed_special={'<|endoftext|>'})))

CPU times: user 27.4 s, sys: 7.64 ms, total: 27.4 s
Wall time: 27.4 s


In [556]:
selected_code_n_tokens.describe()

count    270093.000000
mean        263.895784
std         703.452079
min           0.000000
25%          43.000000
50%         126.000000
75%         293.000000
max      132601.000000
Name: selected_code, dtype: float64

In [519]:
pd.concat([python_files_df["repo_name"], python_files_df["selected_code"].isna()],axis=1).groupby("repo_name")["selected_code"].mean().describe()

count    38038.000000
mean         0.243348
std          0.270368
min          0.000000
25%          0.000000
50%          0.200000
75%          0.400000
max          1.000000
Name: selected_code, dtype: float64

In [479]:
example_selected_code = "\n...\n".join([m["text"] for m in merge_matches(example_matches)])
print(example_selected_code)

class InputExample(object):
...


    def __init__(self, guid, text_a, text_b=None, label=None):
...
class InputFeatures(object):
...


    def __init__(self, input_ids, input_mask, segment_ids, label_id):
...
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
...
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
...
# Account for [CLS] and [SEP] with "- 2"
...
# The convention in BERT is:
# (a) For sequence pairs:
#  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
#  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
# (b) For single sequences:
#  tokens:   [CLS] the dog is hairy . [SEP]
#  t

In [475]:
len(tokenizer.encode(example_selected_code))

912

In [415]:
for elem in ast.walk(ast.parse(example_python_code)):
    if type(elem) == ast.ClassDef:
        print(ast.unparse(elem))
        break

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        if isinstance(label, list):
            self.label = label
        elif label:
            self.label = str(label)
        else:
            self.label = None


In [419]:
ast.unparse(elem.body[0])

"'A single training/test example for simple sequence classification.'"

In [428]:
fn_elem = elem.body[1]

In [433]:
fn_elem.body[0]

<ast.Expr at 0x7601c20dc9a0>

In [425]:
ast.unparse(elem.body[1].body[0])

"'Constructs a InputExample.\\n\\n        Args:\\n            guid: Unique id for the example.\\n            text_a: string. The untokenized text of the first sequence. For single\\n            sequence tasks, only this sequence must be specified.\\n            text_b: (Optional) string. The untokenized text of the second sequence.\\n            Only must be specified for sequence pair tasks.\\n            labels: (Optional) [string]. The label of the example. This should be\\n            specified for train and dev examples, but not for test examples.\\n        '"

In [421]:
ast.unparse(elem.body[1])

'def __init__(self, guid, text_a, text_b=None, label=None):\n    """Constructs a InputExample.\n\n        Args:\n            guid: Unique id for the example.\n            text_a: string. The untokenized text of the first sequence. For single\n            sequence tasks, only this sequence must be specified.\n            text_b: (Optional) string. The untokenized text of the second sequence.\n            Only must be specified for sequence pair tasks.\n            labels: (Optional) [string]. The label of the example. This should be\n            specified for train and dev examples, but not for test examples.\n        """\n    self.guid = guid\n    self.text_a = text_a\n    self.text_b = text_b\n    if isinstance(label, list):\n        self.label = label\n    elif label:\n        self.label = str(label)\n    else:\n        self.label = None'

## summarizer with code file concatenation

The first version is pretty slow most likely due to the fact of calling the LM multiple times.

Now that the code was selected the context should be able to handle several concatenated files

In [860]:
from contextlib import contextmanager


@contextmanager
def override_lm_params(**kwargs):
    lm = dspy.settings.lm
    old_kwargs = {param_name: lm.kwargs[param_name] for param_name in kwargs.keys()}
    try:
        for param_name, param_value in kwargs.items():
            lm.kwargs[param_name] = param_value
        yield
    finally:
        for param_name, param_value in old_kwargs.items():
            lm.kwargs[param_name] = param_value
    

In [763]:

class MultiFileSummary(dspy.Signature):
    context = dspy.InputField(desc="Python code")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Summary of the code given guiding question")

In [913]:
class Prompts:
    file_summary_question_template = """
    given the code extracted from Python files of {} repository,
    separated with ``` describe what each file implements in 3 sentences.
    Focus on machine learning models and data."""

    repo_summary_question_template = """
    Using summaries of '{}' files from Context, write repository README.
    Focus on the functionalities and features.
    There is no need to describe the dependencies and setup.
    The README should provide answers to the following questions:
    - what machine learning problem does this repository tackle?
    - what kind of data does it use?
    Base your answer only on the information from context.
    """.strip()


class RepoCodeSummarizer(dspy.Module):
    
    def __init__(self, fetch_code_fn, repo_summary_question_template=Prompts.repo_summary_question_template, file_summary_question_template=Prompts.file_summary_question_template, verbose=True):
        super().__init__()
        self.fetch_code = fetch_code_fn
        self.summarize_files = dspy.Predict(MultiFileSummary)
        self.summarize_repo = dspy.ChainOfThought(RepoSummary)
        self.file_summary_question_template = file_summary_question_template
        self.repo_summary_question_template = repo_summary_question_template

    def _create_multi_file_input(self, paths, code_file_contents):
        return "\n\n".join([
            self._create_single_file_part(path, code)
            for path, code in zip(paths, code_file_contents)
        ])

    def _create_single_file_part(self, path, code):
        return f"file {path}\n```\n{code}\n```"
    
    def _summarize_files(self, repo_name):
        paths, code_file_contents = self.fetch_code(repo_name)
        file_summarization_input = self._create_multi_file_input(paths, code_file_contents)
        return self.summarize_files(question=self.file_summary_question_template.format(repo_name), context=file_summarization_input)
    
    def forward(self, repo_name, file_summarizer_lm_kwargs={"num_predict": 1024}, repo_summarizer_lm_kwargs={"num_ctx": 1024, "num_predict": 256}):

        with override_lm_params(**file_summarizer_lm_kwargs):
            summaries = self._summarize_files(repo_name)["answer"]

        with override_lm_params(**repo_summarizer_lm_kwargs):
            repo_summary = self.summarize_repo(
                question=self.repo_summary_question_template.format(repo_name),
                context=summaries
            )
            
        return dspy.Prediction(**repo_summary, context_history=summaries)

In [914]:
ollama_lm = dspy.OllamaLocal(model="codellama", base_url=f"http://localhost:11430", num_ctx=4096, max_tokens=1024, timeout_s=180, top_k=50)

In [915]:
dspy.configure(lm=ollama_lm)

In [916]:
repo_summarizer = RepoCodeSummarizer(fetch_selected_code)

In [946]:
sampled_repos = python_files_df["repo_name"].drop_duplicates().sample(1000)
sampled_repos.iloc[:50]

11034                           JunYeopLee/capsule-networks
61606                                JiwonCocoder/matching1
52976               DeepPathology/TUPAC16_AlternativeLabels
238475                                         kmaninis/COB
12237                                      dair-iitd/imojie
112884                             koshian2/OctConv-TFKeras
14479                                     hellopipu/TC-MGAN
32069                                 PhIMaL/DeePyMoD_torch
100694                mktoid/made-thousand-facial-landmarks
12180                               HarikrishnanNB/ChaosNet
111050                                JohanZYe/IWAE-pytorch
33050                                   NathanGavenski/IUPE
45757                               malllabiisc/NeuralDater
38859                                    Cyanogenoid/fspool
21033                            tollymune/CycleGAN-PyTorch
11331                              Slowpuncher24/mlhiphy_v2
80521                                   

In [948]:
readme_df[readme_df["repo"].isin(sampled_repos)]["tasks"].explode().value_counts()

tasks
general classification                143
image classification                   92
semantic segmentation                  81
translation                            80
reinforcement learning                 77
                                     ... 
scene generation                        1
data visualization                      1
deception detection                     1
foveation                               1
human object interaction detection      1
Name: count, Length: 638, dtype: int64

In [964]:
readme_df["tasks"].explode().value_counts().iloc[:25]

tasks
general classification        7530
image classification          6215
semantic segmentation         5691
object detection              5645
translation                   4428
reinforcement learning        4265
classification                3350
language modelling            3299
representation learning       3089
question answering            2831
machine translation           2684
transfer learning             2583
image generation              2258
sentiment analysis            2172
frame                         2157
data augmentation             2106
time series                   1782
text classification           1675
domain adaptation             1525
super resolution              1517
pose estimation               1450
natural language inference    1449
real time object detection    1446
instance segmentation         1425
decision making               1310
Name: count, dtype: int64

In [1265]:
sampled_repos_df = pd.read_json("../output/sampled_repos.jsonl", orient="records", lines=True)

In [1266]:
(sampled_repos_df["query_tasks"].explode().value_counts() > 10).mean()

0.935

In [1267]:
sampled_repos_df["query_tasks"].apply(len).describe()

count    2540.000000
mean        2.270866
std         1.549961
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        26.000000
Name: query_tasks, dtype: float64

In [1268]:
sampled_repos_df["query_tasks"].explode().value_counts().describe()

count    200.000000
mean      28.840000
std       25.841309
min       10.000000
25%       16.000000
50%       21.000000
75%       32.000000
max      242.000000
Name: count, dtype: float64

In [1269]:
sampled_repos_df["query_tasks"].explode().value_counts()

query_tasks
classification                                 242
named entity recognition                       143
data augmentation                              131
image generation                               114
super resolution                                94
                                              ... 
viewpoint estimation                            10
traveling salesman problem                      10
joint multilingual sentence representations     10
language identification                         10
point cloud completion                          10
Name: count, Length: 200, dtype: int64

In [1270]:
sampled_repos_df

Unnamed: 0,repo,paper_urls,paper_titles,titles,arxiv_ids,authors,tasks,readme,query_tasks
0,0h-n0/thdbonas,['https://paperswithcode.com/paper/deep-bayesi...,['Deep Bayesian Optimization on Attributed Gra...,['Deep Bayesian Optimization on Attributed Gra...,"['1905.13403', '1905.06159', '1502.05700']","[""['Jiaxu Cui', 'Bo Yang', 'Xia Hu']"", ""['Lizh...","[neural architecture search, gaussian processe...",[![Github CI/CD](https://github.com/0h-n0/thdb...,[gaussian processes]
1,0xSameer/ast,['https://paperswithcode.com/paper/pre-trainin...,['Pre-training on high-resource speech recogni...,['Pre-training on high-resource speech recogni...,['1809.01431'],"[""['Sameer Bansal', 'Herman Kamper', 'Karen Li...","[speech recognition, translation, automatic sp...",# NOTE!\n\nThis repository is currently being ...,[speech to text translation]
2,0zgur0/STAR_Network,['https://paperswithcode.com/paper/gating-revi...,['Gating Revisited: Deep Multi-layer RNNs That...,['Gating Revisited: Deep Multi-layer RNNs That...,['1911.11033'],"['[\'Mehmet Ozgur Turkoglu\', ""Stefano D\'Aron...","[sequential image classification, action recog...",# STAckable Recurrent (STAR) Network\n[PAMI21]...,[sequential image classification]
3,1044197988/TF.Keras-Commonly-used-models,['https://paperswithcode.com/paper/u-net-convo...,['U-Net: Convolutional Networks for Biomedical...,['U-Net: Convolutional Networks for Biomedical...,"['1505.04597', '1807.10165', '1804.03999', '18...","[""['Olaf Ronneberger', 'Philipp Fischer', 'Tho...","[medical image segmentation, brain tumor segme...",# TF.Keras-å¸¸ç¨åå·\n\n__èªå·±æ´ççä¸...,"[brain tumor segmentation, lung nodule segment..."
4,12wang3/mllp,['https://paperswithcode.com/paper/scalable-ru...,['Scalable Rule-Based Representation Learning ...,['Scalable Rule-Based Representation Learning ...,"['2109.15103', '1912.04695']","[""['Zhuo Wang', 'Wei zhang', 'Ning Liu', 'Jian...","[classification, general classification, repre...",# Our new work\nFor better scalability and cla...,"[classification, binarization]"
...,...,...,...,...,...,...,...,...,...
2535,zuzuba/CISR_NeurIPS20,['https://paperswithcode.com/paper/airsim-high...,['AirSim: High-Fidelity Visual and Physical Si...,['AirSim: High-Fidelity Visual and Physical Si...,"['1705.05065', '2006.12136']","[""['Shital Shah', 'Debadeepta Dey', 'Chris Lov...","[autonomous driving, autonomous vehicles, safe...",# Curriculum Induction for Safe Reinforcement ...,[autonomous vehicles]
2536,zyang-16/MCNS,['https://paperswithcode.com/paper/understandi...,['Understanding Negative Sampling in Graph Rep...,['Understanding Negative Sampling in Graph Rep...,['2005.09863'],"[""['Zhen Yang', 'Ming Ding', 'Chang Zhou', 'Ho...","[node classification, representation learning,...",# MCNS\n\n### __[Arxiv](https://arxiv.org/abs/...,"[node classification, graph representation lea..."
2537,zyning/signalSeparation,['https://paperswithcode.com/paper/u-net-convo...,['U-Net: Convolutional Networks for Biomedical...,['U-Net: Convolutional Networks for Biomedical...,['1505.04597'],"[""['Olaf Ronneberger', 'Philipp Fischer', 'Tho...","[medical image segmentation, semantic segmenta...",# Spectrum Sensing\n\n\nForked from [U-Net](ht...,"[cell segmentation, thermal image segmentation]"
2538,zzangjinsun/NLSPN_ECCV20,['https://paperswithcode.com/paper/deformable-...,"['Deformable ConvNets v2: More Deformable, Bet...","['Deformable ConvNets v2: More Deformable, Bet...","['1811.11168', '2007.10042']","[""['Xizhou Zhu', 'Han Hu', 'Stephen Lin', 'Jif...","[semantic segmentation, depth estimation, ster...",Non-Local Spatial Propagation Network for Dept...,[depth completion]


In [1174]:
python_files_df[python_files_df["repo_name"].isin(sampled_repos_df["repo"])]["repo_name"].value_counts().describe()

count    903.000000
mean       7.974529
std        2.958024
min        1.000000
25%        6.000000
50%       10.000000
75%       10.000000
max       10.000000
Name: count, dtype: float64

In [1176]:
repo_summary_answers = []

for repo_name in tqdm.tqdm(sampled_repos_df["repo"]):
    repo_summary_answers.append(repo_summarizer(repo_name))


100% 903/903 [2:27:34<00:00,  9.81s/it]  


In [1178]:
directory = str(Path('~/Projects/torch_example/phoenix').expanduser())
os.makedirs(directory, exist_ok=True)

my_traces = px.Client().get_trace_dataset().save(directory=directory)

  if is_datetime64tz_dtype(timestamps):
  if is_datetime64tz_dtype(timestamps):


💾 Trace dataset saved to under ID: 875fe8b2-2350-46a6-a147-0ebefabecbc4
📂 Trace dataset path: /home/kuba/Projects/torch_example/phoenix/trace_dataset-875fe8b2-2350-46a6-a147-0ebefabecbc4.parquet


In [1182]:
for repo_name, answer in zip(sampled_repos_df["repo"], repo_summary_answers):
    answer["repo_name"] = repo_name

In [1184]:
repo_summary_answers[0]

Prediction(
    rationale='This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural language descriptions for images. The data used in this repository is the MSCOCO dataset, which contains over 300,000 images with corresponding captions.\n\nThe repository provides code for building and training an image captioning model using TensorFlow and Keras. It includes a configuration file that specifies the hyperparameters for training, such as the number of input shards, the image format, and the vocabulary size. The repository also includes code for building the inference graph, creating the vocabulary, loading the model from checkpoint, and preparing the caption generator.\n\nThe repository also includes unit tests for the ShowAndTellModel class, which checks the number of parameters in the model, the output shapes, and the accuracy of the model on a test set. Additionally, it defines the Vocabulary class that creates the voca

In [1187]:
pd.DataFrame.from_records([dict(r) for r in repo_summary_answers]).to_json("../output/generated_readmes/dspy_generated_readme_samples.json", orient="records", lines=True)

In [1201]:
len(repo_summary_answers)

903

## Viewing generation results

In [815]:
import re
from colorama import Fore


def highlight_substring_matches(text, matched_substrings, highlight_color=Fore.RED):
    callback = lambda pat: highlight_color + pat.group() + Fore.RESET
    pattern = re.compile("|".join([f"({t})" for t in matched_substrings]))
    return re.sub(pattern, callback, text)


class RepoTextTaskHighlighter(BaseModel):
    repo_metadata_df: pd.DataFrame
    repo_name_col: str = "repo"

    def highlight_repo_tasks(self, repo_name, text, highlight_color=Fore.RED):
        repo_tasks = self.get_repo_tasks(repo_name)
        return highlight_substring_matches(text, repo_tasks, highlight_color)
        
    def get_repo_tasks(self, repo_name):
        return self.repo_metadata_df[self.repo_metadata_df[self.repo_name_col] == repo_name].iloc[0]["tasks"]

    def get_repo_text_from_metadata(self, repo_name, field_name="readme"):
        return self.repo_metadata_df[self.repo_metadata_df[self.repo_name_col] == repo_name].iloc[0][field_name]
    
    class Config:
        arbitrary_types_allowed = True
    

In [816]:
repo_task_highlighter = RepoTextTaskHighlighter(repo_metadata_df=readme_df)

In [817]:
print(repo_task_highlighter.highlight_repo_tasks(selected_repo_name, answer_multifile["answer"]))

This repository, 'jacobgil/pytorch-grad-cam', appears to tackle the problem of visualizing the attention of a PyTorch model on an image. The repository contains several files that implement different methods for computing and visualizing the attention of a model, including Activation Maximization (AM), Guided Backpropagation (GBP), EigenCAM, and Grad-CAM. The data used in this repository appears to be images, as the files contain code related to image processing and manipulation. The repository also contains code for training and evaluating PyTorch models, which suggests that the data is likely to be used for machine learning tasks such as image classification or object detection. Overall, this repository seems to provide a collection of tools and techniques for visualizing the attention of PyTorch models on images, which could be useful for understanding how these models are making predictions and identifying areas of interest in the input data.


In [818]:
print(repo_task_highlighter.highlight_repo_tasks(selected_repo_name, answer_multifile["rationale"]))

This repository, 'jacobgil/pytorch-grad-cam', appears to tackle the problem of visualizing the attention of a PyTorch model on an image. The repository contains several files that implement different methods for computing and visualizing the attention of a model, including Activation Maximization (AM), Guided Backpropagation (GBP), EigenCAM, and Grad-CAM.

The data used in this repository appears to be images, as the files contain code related to image processing and manipulation. The repository also contains code for training and evaluating PyTorch models, which suggests that the data is likely to be used for machine learning tasks such as image classification or object detection.

Overall, this repository seems to provide a collection of tools and techniques for visualizing the attention of PyTorch models on images, which could be useful for understanding how these models are making predictions and identifying areas of interest in the input data.


In [819]:
answer_multifile["answer"]

"This repository, 'jacobgil/pytorch-grad-cam', appears to tackle the problem of visualizing the attention of a PyTorch model on an image. The repository contains several files that implement different methods for computing and visualizing the attention of a model, including Activation Maximization (AM), Guided Backpropagation (GBP), EigenCAM, and Grad-CAM. The data used in this repository appears to be images, as the files contain code related to image processing and manipulation. The repository also contains code for training and evaluating PyTorch models, which suggests that the data is likely to be used for machine learning tasks such as image classification or object detection. Overall, this repository seems to provide a collection of tools and techniques for visualizing the attention of PyTorch models on images, which could be useful for understanding how these models are making predictions and identifying areas of interest in the input data."

In [820]:
repo_task_highlighter.get_repo_tasks(selected_repo_name)

['image generation',
 'decision making',
 'knowledge distillation',
 'interpretable machine learning',
 'fairness',
 'weakly supervised object localization',
 '3d action recognition',
 'action recognition',
 'adversarial attack',
 'temporal action localization',
 'object localization']

In [821]:
print(repo_task_highlighter.highlight_repo_tasks(selected_repo_name, repo_task_highlighter.get_repo_text_from_metadata(selected_repo_name)))

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
![Build Status](https://github.com/jacobgil/pytorch-grad-cam/workflows/Tests/badge.svg)
[![Downloads](https://static.pepy.tech/personalized-badge/grad-cam?period=month&units=international_system&left_color=black&right_color=brightgreen&left_text=Monthly%20Downloads)](https://pepy.tech/project/grad-cam)
[![Downloads](https://static.pepy.tech/personalized-badge/grad-cam?period=total&units=international_system&left_color=black&right_color=blue&left_text=Total%20Downloads)](https://pepy.tech/project/grad-cam)

# Advanced AI explainability for PyTorch

`pip install grad-cam`

Documentation with advanced tutorials: [https://jacobgil.github.io/pytorch-gradcam-book](https://jacobgil.github.io/pytorch-gradcam-book)


This is a package with state of the art methods for Explainable AI for computer vision.
This can be used for diagnosing model predictions, either in production or while
devel