In [None]:
# https://github.com/lakshmanok/lakblogs/tree/main/bridge_bidding_advisor
# in order to execute the scripts, the supporting indexes 
# are required which in turn requires installation of 
# torch and transformers. Chromadb install will lead to pulling 
# all the supporting libraries. 

In [None]:
!pip install -r requirements.txt

In [1]:
# building the vector index
import chromadb
import bs4
import requests
import shutil
import os
import re
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

In [2]:
URL = "https://www.bridgeworld.com/pages/readingroom/bws/bwscompletesystem.html"
URL_LOCAL = URL.split('/')[-1]
CHROMA_COLLECTION_NAME = "bridge_world_system"
CHROMADB_DIR = "db/"

In [3]:
def download_file(url: str) -> str:
    local_filename = URL_LOCAL
    if not os.path.exists(local_filename):
        print(f"Downloading {URL} to {local_filename}.")
        with requests.get(url, stream=True) as r:
            with open(local_filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    else:
        print(f"Using already downloaded {local_filename}.")
    return local_filename

In [4]:
chroma_client = chromadb.PersistentClient(path=CHROMADB_DIR)
collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
text_splitter = SentenceTransformersTokenTextSplitter()


  from tqdm.autonotebook import tqdm, trange


In [5]:
# update paragraphs into chromadb collection
download_file(URL)

Using already downloaded bwscompletesystem.html.


'bwscompletesystem.html'

In [8]:
# initiate building the index
with open(URL_LOCAL, 'r') as f:
    soup = bs4.BeautifulSoup(f.read(), 'html.parser')
    last_header = ""
    paragraphs = soup.find_all("p")
    print(len(paragraphs))

194


In [6]:
# initiate building the index
with open(URL_LOCAL, 'r') as f:
    soup = bs4.BeautifulSoup(f.read(), 'html.parser')
    last_header = ""
    paragraphs = soup.find_all("p")
    for n, paragraph in enumerate(paragraphs[:97]):
        paragraph_id = f"{URL_LOCAL}_{n}"
        text = paragraph.text.strip()
        # find the previous header
        header = paragraph.find_all(re.compile("^h[1-5]$"))
        if header:
            header = header[0].text.strip()
            last_header = header
        else:
            header = last_header
        # print(paragraph_id, "->", header, "->", len(text), "->", text[:30])
        # split the text into chunks and insert into chromadb
        ids = []
        documents = []
        metadatas = []
        chunks = text_splitter.create_documents([text]) # takes array of documents
        for chunk_no, chunk in enumerate(chunks):
            ids.append(f"{paragraph_id}#{chunk_no}")
            documents.append(chunk.page_content)
            metadatas.append({"title": header, "source": URL})
        if ids:
            collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
        
        print(f"{int(0.5 + 100.0 * n / len(paragraphs))}% ({collection.count()})", end=" ", flush=True)
        
        if n % 10 == 0:
            print()

0% (1) 
1% (53) 1% (105) 2% (156) 2% (207) 3% (258) 3% (309) 4% (359) 4% (409) 5% (459) 5% (509) 
6% (559) 6% (609) 7% (657) 7% (705) 8% (753) 8% (801) 9% (848) 9% (895) 10% (942) 10% (989) 
11% (1035) 11% (1081) 12% (1127) 12% (1173) 13% (1218) 13% (1263) 14% (1308) 14% (1353) 15% (1398) 15% (1443) 
16% (1488) 16% (1532) 17% (1576) 18% (1620) 18% (1664) 19% (1707) 19% (1750) 20% (1793) 20% (1835) 21% (1877) 
21% (1918) 22% (1959) 22% (2000) 23% (2040) 23% (2080) 24% (2120) 24% (2160) 25% (2200) 25% (2240) 26% (2280) 
26% (2320) 27% (2360) 27% (2399) 28% (2438) 28% (2477) 29% (2516) 29% (2555) 30% (2594) 30% (2633) 31% (2671) 
31% (2709) 32% (2747) 32% (2785) 33% (2823) 34% (2860) 34% (2897) 35% (2934) 35% (2971) 36% (3007) 36% (3043) 
37% (3078) 37% (3111) 38% (3144) 38% (3176) 39% (3207) 39% (3237) 40% (3267) 40% (3297) 41% (3326) 41% (3353) 
42% (3380) 42% (3407) 43% (3433) 43% (3459) 44% (3485) 44% (3510) 45% (3535) 45% (3560) 46% (3585) 46% (3609) 
47% (3633) 47% (3657) 48% (3681)

In [None]:
import os
import dotenv
import dspy

def init_gemini_pro(temperature: float = 0.0):
    """
    Initializes dspy to use Gemini as the language model.
    """
    dotenv.load_dotenv("D:\\gitFolders\\python_de_learners_data\\.env")
    api_key = os.getenv("GOOGLE_API_KEY")
    gemini = dspy.Google("models/gemini-1.0-pro",
                         api_key=api_key,
                         temperature=temperature)
    dspy.settings.configure(lm=gemini, max_tokens=1024)


def init_gpt35(temperature: float = 0.0):
    """
    Initializes dspy to use OpenAI GPT 3.5 as the language model.
    """
    dotenv.load_dotenv("D:\\gitFolders\\python_de_learners_data\\.env")
    api_key = os.getenv("OPENAI_API_KEY")
    gpt35 = dspy.OpenAI(model="gpt-3.5-turbo",
                        api_key=api_key,
                        temperature=temperature)
    dspy.settings.configure(lm=gpt35, max_tokens=1024)

In [None]:
from dspy import teleprompt
from dspy.retrieve.chromadb_rm import ChromadbRM
import json

In [None]:
class ZeroShot(dspy.Module):
    """
    Provide answer to question
    """
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict("question -> answer")

    def forward(self, question):
        return self.prog(question="In the game of bridge, " + question)


class Definitions(dspy.Module):
    """
    Retrieve the definition from Wikipedia (2017 version)
    """
    def __init__(self):
        super().__init__()
        self.retriever = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

    def forward(self, term):
        result = self.retriever(f"In the game of bridge, what does {term} mean?", k=1)
        if result:
            return result[0].long_text
        return ""


class FindTerms(dspy.Module):
    """
    Extract bridge terms from a question
    """
    def __init__(self):
        super().__init__()
        self.entity_extractor = dspy.Predict("question -> terms")

    def forward(self, question):
        max_num_terms = max(1, len(question.split())//4)
        prompt = f"Identify up to {max_num_terms} terms in the following question that are jargon in the card game bridge."
        prediction = self.entity_extractor(
            question=f"{prompt}\n{question}"
        )
        answer = prediction.terms
        if "Terms: " in answer:
            start = answer.rindex("Terms: ") + len("Terms: ")
            answer = answer[start:]
        return [a.strip() for a in answer.split(',')]


def BiddingSystem():
    """
    Retreives rules for bidding in bridge.
    This is just a retriever and does not have any language model.
    """
    from chromadb.utils import embedding_functions
    default_ef = embedding_functions.DefaultEmbeddingFunction()
    return ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, default_ef, k=3)


class AdvisorSignature(dspy.Signature):
    definitions = dspy.InputField(format=str)  # function to call on input to make it a string
    bidding_system = dspy.InputField(format=str) # function to call on input to make it a string
    question = dspy.InputField()
    answer = dspy.OutputField()

def shorten_list(response):
    if type(response) == list:
        return [ f"{r['long_text'][:25]} ... {len(r['long_text'])}" for r in response]
    else:
        return response   
    

class BridgeBiddingAdvisor(dspy.Module):
    """
    Functions as the orchestrator. All questions are sent to this module.
    """
    def __init__(self):
        super().__init__()
        self.find_terms = FindTerms()
        self.definitions = Definitions()
        # self.bidding_system = BiddingSystem()
        self.prog = dspy.ChainOfThought(AdvisorSignature,
                                        n=3)

    def forward(self, question):
        print("a:", question)
        terms = self.find_terms(question)
        print("b:", terms)
        definitions = [self.definitions(term) for term in terms]
        print("c:", definitions)
        bidding_system = BiddingSystem()(question)
        print("d:", shorten_list(bidding_system))
        prediction = self.prog(definitions=definitions,
                               bidding_system=bidding_system,
                               question="In the game of bridge, " + question,
                               max_tokens=-1024)
        return prediction.answer

In [None]:
def run(name: str, module: dspy.Module, queries: [str], shorten: bool = False):
    print(f"**{name}**")
    for query in queries:
        response = module(query)
        if shorten:
            response = shorten_list(response)
        print(response)
    print()

In [None]:
questions = [
        "What is Stayman?",
        "When do you use Jacoby Transfers?",
        "Playing Stayman and Transfers, what do you bid with 5-4 in the majors?"
    ]

In [None]:
run("Zeroshot", ZeroShot(), questions)
# exit(0)

In [None]:
run("definitions", Definitions(), ["Stayman", "Jacoby Transfers", "Strong 1NT", "majors"])

In [None]:
run("find_terms", FindTerms(), questions)

In [None]:
run("bidding_system", BiddingSystem(), questions, shorten=True)

In [None]:
run("bidding_advisor", BridgeBiddingAdvisor(), questions)

In [None]:
# create labeled training dataset
traindata = json.load(open("trainingdata.json", "r"))['examples']
trainset = [dspy.Example(question=e['question'], answer=e['answer']) for e in traindata]

In [None]:
# train
teleprompter = teleprompt.LabeledFewShot()
optimized_advisor = teleprompter.compile(student=BridgeBiddingAdvisor(),
                                         trainset=trainset)
run("optimized", optimized_advisor, questions)