In [197]:
import pandas as pd
import dspy

import nest_asyncio
nest_asyncio.apply()

In [74]:
## Wiring DSPy to Phoenix

In [75]:
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))
trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

In [None]:
readme_df = pd.read_json("../output/paperswithcode_with_readmes.json.gz")

In [2]:
readme_df.columns

Index(['repo', 'paper_urls', 'paper_titles', 'titles', 'arxiv_ids', 'authors',
       'tasks', 'readme'],
      dtype='object')

In [3]:
idx = 4
example_repo = readme_df.iloc[idx]["repo"]
print(readme_df.iloc[idx]["readme"])


# SincNet
SincNet is a neural architecture for processing **raw audio samples**. It is a novel Convolutional Neural Network (CNN) that encourages the first convolutional layer to discover more **meaningful filters**. SincNet is based on parametrized sinc functions, which implement band-pass filters.

In contrast to standard CNNs, that learn all elements of each filter, only low and high cutoff frequencies are directly learned from data with the proposed method. This offers a very compact and efficient way to derive a **customized filter bank** specifically tuned for the desired application. 

This project releases a collection of codes and utilities to perform speaker identification with SincNet.
An example of speaker identification with the TIMIT database is provided. If you are interested in **SincNet applied to speech recognition you can take a look into the PyTorch-Kaldi github repository (https://github.com/mravanelli/pytorch-kaldi).** 

<img src="https://github.com/mravanelli/Si

In [4]:
dependency_records_df = pd.read_json("../output/dependency_records/repo_dependencies_articlerank.json", lines=True, orient="records")

In [5]:
dependency_records_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156449 entries, 0 to 156448
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   repo       156449 non-null  object
 1   edge_type  156449 non-null  object
 2   nodes      156449 non-null  object
dtypes: object(3)
memory usage: 3.6+ MB


In [6]:
python_files_df = pd.read_parquet("../output/repo_selected_files.parquet")

In [7]:
python_files_df["repo_name"].value_counts().iloc[:10]

repo_name
shlizee/Audeo                               10
aishikchakraborty/LexSub                    10
natsumeS/analysis                           10
shermanhung/U-Net                           10
mtanti/mtanti-phd                           10
anonymous1100/Distributional-Discrepancy    10
vumaasha/atlas                              10
cyberjam/darknet_submit                     10
wszlong/sb-nmt                              10
Jeffrey-Ede/adaptive-scans                  10
Name: count, dtype: int64

In [8]:
example_repo 

'008karan/SincNet_demo'

In [9]:
example_repo_files_df = python_files_df[python_files_df["repo_name"] == example_repo]
example_repo_files_df["path"]

12260      compute_d_vector.py
31929               data_io.py
47820            speaker_id.py
115220           similarity.py
127410            inference.py
189566           dnn_models.py
258739    TIMIT_preparation.py
Name: path, dtype: object

In [10]:
example_repo_files_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 12260 to 258739
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    7 non-null      object
 1   path       7 non-null      object
 2   repo_name  7 non-null      object
 3   tasks      7 non-null      object
dtypes: object(4)
memory usage: 280.0+ bytes


In [11]:
print(example_repo_files_df["content"].iloc[0])

# compute_d_vector.py
# Mirco Ravanelli 
# Mila - University of Montreal 

# Feb 2019

# Description: 
# This code computes d-vectors using a pre-trained model
 

import os
import soundfile as sf
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
from dnn_models import MLP
from dnn_models import SincNet as CNN 
from data_io import ReadList,read_conf_inp,str_to_bool
import sys

# Model to use for computing the d-vectors
model_file="SincNet_demo/sincnet_models/SincNet_TIMIT/model_raw.pkl" # This is the model to use for computing the d-vectors (it should be pre-trained using the speaker-id DNN)
cfg_file='SincNet_demo/cfg/SincNet_TIMIT.cfg' # Config file of the speaker-id experiment used to generate the model
#te_lst='data_lists/TIMIT_test.scp' # List of the wav files to process
te_lst='SincNet_demo/test.scp'
out_dict_file='SincNet_demo/d_vect_dr1_fcjf00.npy' # output dictionary containing the a sentence id as key as the d-vector as value
data_folder=

In [245]:

codellama = dspy.OllamaLocal(model="codellama",model_type='text',
                                max_tokens=512,
                                temperature=0,
                                top_p=0.9, frequency_penalty=1.17)

mistral = dspy.OllamaLocal(model="mistral",model_type='text',
                                max_tokens=512,
                                temperature=0,
                                top_p=0.9, frequency_penalty=1.17)

#len(example_repo_files_df["content"].iloc[0])

## Multiple OLlama models

In [246]:
import ollama

  (31, HUFFMAN_EMIT_SYMBOL, 245),


In [247]:
codellama.

<dsp.modules.ollama.OllamaLocal at 0x75bf5a30aef0>

In [253]:
"OLLAMA_HOST=127.0.0.1:1143{i} ollama serve {ollama_model_name}".split()

['OLLAMA_HOST=127.0.0.1:1143{i}', 'ollama', 'serve', '{ollama_model_name}']

In [275]:
import subprocess
import os

ollama_model_name = "codellama"


def run_ollama_subprocesses(ollama_model_name, port_suffix_range=range(2)):
    for i in port_suffix_range:
        #proc_env = dict(os.environ) | {'OLLAMA_HOST': f"127.0.0.1:1143{i}"}
        #proc = subprocess.Popen(['ollama', 'serve'], env=proc_env)
        #proc.communicate(['ollama', 'run', ollama_model_name, "write factorial in Python"])
        os.spawnl(os.P_NOWAIT, f'OLLAMA_HOST=127.0.0.1:1143{i} ollama serve; ollama run {ollama_model_name} "write factorial in Python"')
        #yield proc

ollama_processes = list(run_ollama_subprocesses(ollama_model_name))

ValueError: argv first element cannot be empty

In [None]:
ollama_processes

In [268]:
subprocess

269499

In [229]:
import asyncio
import itertools

class FileSummary(dspy.Signature):
    code = dspy.InputField(desc="Python code")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Summary of the code given guiding question")


class RepoSummary(dspy.Signature):
    context = dspy.InputField(desc="Python file summaries")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Repository summary")

    
def fetch_code(repo_name, python_files_df=python_files_df, n=5):
    repo_files_df = python_files_df[python_files_df["repo_name"] == repo_name].iloc[:n]
    return repo_files_df


REPO_SUMMARIZER_QUESTION_TEMPLATE = "For a repository named '{}' describe the functionalities of the following code:"

class RepoCodeSummarizer(dspy.Module):
    def __init__(self, fetch_code_fn, prompt_template=REPO_SUMMARIZER_QUESTION_TEMPLATE, verbose=True):
        super().__init__()
        self.fetch_code = fetch_code
        self.summarize_file = dspy.ChainOfThought(FileSummary)
        self.summarize_repo = dspy.ChainOfThought(RepoSummary)
        self.prompt_template = prompt_template
        self.verbose = verbose


    async def _summarize_file(self, lm, repo_name, code, i):
        if self.verbose:
            print(f"running lm no. {i}")
        with dspy.context(lm=lm):
            file_summary = self.summarize_file(question=self.prompt_template.format(repo_name), code=code)
        return file_summary

    async def _summarize_files_async(self, lms, repo_name, code_file_contents):
        lms = [
            lm 
            for lm_list in itertools.repeat(lms, len(code_file_contents))
            for lm in lm_list
        ]
        async_tasks = [
            self._summarize_file(lm, repo_name, code, i) for (i, (lm, code)) in enumerate(zip(lms, code_file_contents))
        ]
                               
        return await asyncio.gather(*async_tasks)
    
    def _summarize_files(self, lms, repo_name, code_file_contents):

        loop = asyncio.new_event_loop()

        return loop.run_until_complete(self._summarize_files_async(lms, repo_name, code_file_contents))
    
    def forward(self, repo_name, lms):
        code_files = self.fetch_code(repo_name)
        summaries = []
        #for code in code_files["content"].to_list():
        #    file_summary = self.summarize_file(question=self.prompt_template.format(repo_name), code=code)
        #    summaries.append(file_summary)
        #
        summaries = self._summarize_files(lms, repo_name, code_files["content"].to_list())
        
        summaries_context = "\n".join([f"{filename} summary:\n {summary}" for filename, summary in zip(code_files["path"], summaries)])

        repo_summary_question = f"""
        Given the following summaries of '{repo_name}' files write repository README.
        Focus on the functionalities and features.
        There is no need to describe the dependencies and setup.
        The README should provide answers to the following questions:
        - what machine learning problem does this repository tackle?
        - what kind of data does it use?
        Base your answer only on the information from context.
        """
        repo_summary = self.summarize_repo(
            question=repo_summary_question.strip(),
            context=summaries
        )
            
        return dspy.Prediction(**repo_summary, context_history=summaries)

In [230]:
python_files_df["repo_name"].isin(readme_df["repo"]).mean()

1.0

In [None]:
lms = [
    dspy.OllamaLocal(model="codellama",model_type='text',
                                max_tokens=512,
                                temperature=0,
                                base_url=f'http://localhost:1143{i}',
                                top_p=0.9, frequency_penalty=1.17)
    for i in [0, 1, 4]
]

In [None]:
#dspy.configure(lm=codellama)

#example_code = example_repo_files_df["content"].iloc[0]

repo_summarizer = RepoCodeSummarizer(fetch_code)
#code_summarizer(code=[example_code])

In [None]:
example_repo = python_files_df["repo_name"].unique()[1]

print(example_repo)
fetch_code(example_repo)

In [None]:
print(readme_df[readme_df["repo"] == example_repo].iloc[0]["tasks"])

In [None]:
print(readme_df[readme_df["repo"] == example_repo].iloc[0]["readme"])

In [None]:
%%time
repo_summarizer_answer = repo_summarizer(example_repo, lms)

In [156]:
repo_summarizer_answer

<Future at 0x75bfab03ed70 state=finished raised NameError>

In [95]:
print(repo_summarizer_answer["answer"])

The repository "semantic-foreground-inpainting" tackles the problem of semantic segmentation with foreground inpainting using PyTorch library. It uses pretrained weights from VGG16 network that was trained on ImageNet dataset to perform this task.


In [100]:
0.5 * len(python_files_df["repo_name"].unique()) / 60

316.98333333333335