# Extract `Organizations` involved, `Contract date`, `Address`, `Termination date` and `Price Increase` information from text

In [None]:
!pip3 install datefinder

In [None]:
import ast
import copy
import io
import re

import datefinder
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [None]:
def BERT_NER(model_name="dslim/bert-base-NER"):
    """
    Build a Hugging Face token-classification ("ner") pipeline for a checkpoint.

    The tokenizer and the model are both loaded from ``model_name``. No
    aggregation strategy is passed to ``pipeline``.

    Args:
        model_name (str, optional): Checkpoint name or path handed to
            ``from_pretrained``. Defaults to "dslim/bert-base-NER".

    Returns:
        pipeline: The NER pipeline; call it with a string to get the tagged tokens.
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is loaded first
    return pipeline(
        "ner",
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        model=AutoModelForTokenClassification.from_pretrained(model_name),
    )

def get_date_expressions(input_text, debug=False):
    """
    Find date mentions in a text and map each formatted date to its source snippet.

    Args:
        input_text (str): The text to scan for date expressions.
        debug (bool, optional): If True, the raw list of matches is printed
            before the result is built. Defaults to False.

    Returns:
        dict: Keys are dates formatted as "%m-%d-%Y"; values are the text
        snippets those dates were parsed from. When several snippets resolve
        to the same formatted date, only the last one is kept.
    """
    # Materialise the matches up front so they can be printed and then iterated again
    found_matches = list(datefinder.find_dates(input_text, source=True))

    if debug:
        print(f"Resulting Matches : \n{found_matches}\n\n{'--'*30}")

    # Later matches overwrite earlier ones that format to the same key
    date_to_text = {}
    for parsed_date, source_text in found_matches:
        date_to_text[parsed_date.strftime("%m-%d-%Y")] = source_text
    return date_to_text

In [None]:
input_text_1 = "December 15th 2022 ......................................| 15 Dec 2022 and 1st of February, 2024"
extracted_dates_1 = get_date_expressions(input_text_1, debug=True)
# extracted_dates_1 = datefinder.find_dates(input_text_1, source=True)

input_text_2 = """
THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st of February, 2029 (“Effective Date”) by and between Krabciuyu Fabi, LLC (Krabciuyu Fabi) a Arizona limited
liability company having offices located at Three Corporate Drive, Lake and Indiana University Health, Inc. with offices located at 340 W 10th Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”).
RECITALS WHEREAS, Krabciuyu Fabi is in the business of developing, manufacturing and marketing pharmaceutical products; WHEREAS, Customer is a Hospital System that is authorized to negotiate pricing for pharmaceutical products and
contract on behalf of its affiliated healthcare facilities and providers that are listed on Exhibit A, which is attached hereto and incorporated by
reference (“Affiliates”); WHEREAS, the parties acknowledge and agree that the intent of this Agreement is to provide access to Products at Sub-WAC prices to assist Customer with its compliance
obligations under applicable laws, including without limitation, the 340B statute (42 U.S.C.§
256(b)) and its implementing regulations. Purchases under this Agreement are not subject to the terms and conditions of any agreement that Krabciuyu Fabi has with a GPO that
Customer is associated with. NOW THEREFORE, in consideration of the mutual covenants and agreements and other good and valuable consideration hereinafter set forth, the Parties hereto agree as follows:
AGREEMENT 1. Products and Sub-WAC Contract Pricing; Offer. Customer and its Affiliates may, subject to the terms of this Agreement, purchase from Krabciuyu Fabi the products set forth on Exhibit B, which is attached hereto and incorporated by reference (“Products”), at the
prices set forth on such exhibit (“Sub-WAC Contract Pricing”). For the avoidance of doubt, this Agreement constitutes an offer to sell Products on the terms described herein. Sub-WAC Contract Pricing is subject to change at any time at Krabciuyu Fabi’s sole discretion upon thirty (30) days’ prior written notice to Customer. In the event of an anticipated price change, and to the extent Krabciuyu Fabi has sufficient inventory to do so, Krabciuyu Fabi shall satisfy all purchase orders made by Customer and its Affiliates during the fifteen (15) day notice period so long as such purchase orders do not exceed the purchasing entity’s orders for such Product in the forty-five (45) days prior to Customer’s receipt of such notice of price change. Customer and its Affiliates are under no obligation to purchase Products under this
Agreement or otherwise. For the avoidance of doubt, this Agreement does not cover Products purchased under the Drug Pricing Program that is administered by the Health Resources & Services Administration (“HRSA”); such purchases may be covered under a
separate agreement. 2. Addition and Removal of Products. The Parties may mutually agree to add additional products under this Agreement. Each additional product shall be deemed a “Product” under this Agreement and shall become subject to this Agreement upon execution
"""

extracted_dates_2 = get_date_expressions(input_text_2, debug=True)
# extracted_dates_2 = datefinder.find_dates(string_with_dates, source=True)

In [None]:
nlp = BERT_NER()
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [None]:
# example= """THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st
# of February, 2024 (“Effective Date”) by and between Fresenius Kabi, LLC (“Fresenius Kabi”)
# a Delaware limited liability company having offices located at Three Corporate Drive, Lake
# Zurich, Illinois 60047 and Indiana University Health, Inc. with offices located at 340 W 10th
# Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”)."""

example= """
THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st
of February, 2024 (“Effective Date”) by and between Fresenius Kabi, LLC (“Fresenius Kabi”)
a Delaware limited liability company having offices located at Three Corporate Drive, Lake
Zurich, Illinois 60047 and Indiana University Health, Inc. with offices located at 340 W 10th
Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”).
RECITALS
WHEREAS, Fresenius Kabi is in the business of developing, manufacturing and
marketing pharmaceutical products;
WHEREAS, Customer is a Hospital System that is authorized to negotiate pricing for
pharmaceutical products and contract on behalf of its affiliated healthcare facilities and
providers that are listed on Exhibit A, which is attached hereto and incorporated by
reference (“Affiliates”);
WHEREAS, the parties acknowledge and agree that the intent of this Agreement is to
provide access to Products at Sub-WAC prices to assist Customer with its compliance
obligations under applicable laws, including without limitation, the 340B statute (42 U.S.C.§
256(b)) and its implementing regulations. Purchases under this Agreement are not subject
to the terms and conditions of any agreement that Fresenius Kabi has with a GPO that
Customer is associated with.
NOW THEREFORE, in consideration of the mutual covenants and agreements and other
good and valuable consideration hereinafter set forth, the Parties hereto agree as follows:
AGREEMENT
1. Products and Sub-WAC Contract Pricing; Offer. Customer and its Affiliates may,
subject to the terms of this Agreement, purchase from Fresenius Kabi the products set forth
on Exhibit B, which is attached hereto and incorporated by reference (“Products”), at the
prices set forth on such exhibit (“Sub-WAC Contract Pricing”). For the avoidance of doubt,
this Agreement constitutes an offer to sell Products on the terms described herein. Sub-
WAC Contract Pricing is subject to change at any time at Fresenius Kabi’s sole discretion
upon thirty (30) days’ prior written notice to Customer. In the event of an anticipated price
change, and to the extent Fresenius Kabi has sufficient inventory to do so, Fresenius Kabi
shall satisfy all purchase orders made by Customer and its Affiliates during the fifteen (15)
day notice period so long as such purchase orders do not exceed the purchasing entity’s
orders for such Product in the forty-five (45) days prior to Customer’s receipt of such notice
of price change.
Customer and its Affiliates are under no obligation to purchase Products under this
Agreement or otherwise. For the avoidance of doubt, this Agreement does not cover
Products purchased under the 340B Drug Pricing Program that is administered by the Health
Resources & Services Administration (“HRSA”); such purchases may be covered under a
separate agreement.
2. Addition and Removal of Products. The Parties may mutually agree to add
additional products under this Agreement. Each additional product shall be deemed a
“Product” under this Agreement and shall become subject to this Agreement upon execution
"""
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.69567513, 'index': 8, 'word': 'WA', 'start': 10, 'end': 12}, {'entity': 'B-ORG', 'score': 0.99836797, 'index': 46, 'word': 'Fr', 'start': 128, 'end': 130}, {'entity': 'I-ORG', 'score': 0.4912532, 'index': 47, 'word': '##ese', 'start': 130, 'end': 133}, {'entity': 'I-ORG', 'score': 0.9918276, 'index': 48, 'word': '##nius', 'start': 133, 'end': 137}, {'entity': 'I-ORG', 'score': 0.99881965, 'index': 49, 'word': 'Ka', 'start': 138, 'end': 140}, {'entity': 'I-ORG', 'score': 0.99601537, 'index': 50, 'word': '##bi', 'start': 140, 'end': 142}, {'entity': 'I-ORG', 'score': 0.9927342, 'index': 51, 'word': ',', 'start': 142, 'end': 143}, {'entity': 'I-ORG', 'score': 0.96143985, 'index': 52, 'word': 'LLC', 'start': 144, 'end': 147}, {'entity': 'B-ORG', 'score': 0.99590486, 'index': 55, 'word': 'Fr', 'start': 150, 'end': 152}, {'entity': 'I-ORG', 'score': 0.48954338, 'index': 56, 'word': '##ese', 'start': 152, 'end': 155}, {'entity': 'I-ORG', 'score': 0.9812398, 'in

In [None]:
# li=[row.get("word") for row in ner_results if "ORG" in row.get("entity")]
li= [row for row in ner_results if "ORG" in row.get("entity")]
print(li)

[{'entity': 'B-ORG', 'score': 0.69567513, 'index': 8, 'word': 'WA', 'start': 10, 'end': 12}, {'entity': 'B-ORG', 'score': 0.99836797, 'index': 46, 'word': 'Fr', 'start': 128, 'end': 130}, {'entity': 'I-ORG', 'score': 0.4912532, 'index': 47, 'word': '##ese', 'start': 130, 'end': 133}, {'entity': 'I-ORG', 'score': 0.9918276, 'index': 48, 'word': '##nius', 'start': 133, 'end': 137}, {'entity': 'I-ORG', 'score': 0.99881965, 'index': 49, 'word': 'Ka', 'start': 138, 'end': 140}, {'entity': 'I-ORG', 'score': 0.99601537, 'index': 50, 'word': '##bi', 'start': 140, 'end': 142}, {'entity': 'I-ORG', 'score': 0.9927342, 'index': 51, 'word': ',', 'start': 142, 'end': 143}, {'entity': 'I-ORG', 'score': 0.96143985, 'index': 52, 'word': 'LLC', 'start': 144, 'end': 147}, {'entity': 'B-ORG', 'score': 0.99590486, 'index': 55, 'word': 'Fr', 'start': 150, 'end': 152}, {'entity': 'I-ORG', 'score': 0.48954338, 'index': 56, 'word': '##ese', 'start': 152, 'end': 155}, {'entity': 'I-ORG', 'score': 0.9812398, 'in

In [None]:
def clean_word(part):
    """
    Prepare one tokenizer piece for concatenation onto a partially built entity.

    * A word-piece continuation (starts with '##') loses that marker so it
      attaches directly to the preceding piece.
    * A piece that begins a new word (first character is a letter or a digit)
      gets a leading space.
    * Anything else (punctuation, an empty string, a lone '#') is returned
      unchanged, so punctuation is not preceded by a space.

    Args:
        part (str): The token text, e.g. "##bi", "LLC", "3M" or ",".

    Returns:
        str: The piece, ready to be appended to the entity string.
    """
    # Word-piece continuation: drop the '##' marker, add no space
    if part.startswith('##'):
        return part[2:]

    # New word: letters AND digits need a separating space (the old check covered ASCII letters only)
    if part[:1].isalnum():
        return ' ' + part

    return part

In [None]:
def _continues_org_entity(previous_row, row):
    """Return True when ``row`` extends the ORG entity that ``previous_row`` belongs to."""
    label = row.get("entity")
    if "I-ORG" in label:
        return True

    # A word-piece ("##...") directly after the previous token can never start a
    # new word, even when it is tagged B-ORG; treat it as a continuation.
    previous_index = previous_row.get("index")
    return (
        "B-ORG" in label
        and row.get("word").startswith("##")
        and previous_index is not None
        and row.get("index") == previous_index + 1
    )


def get_org_entities(input_text, debug=False, nlp=None):
    """
    Extract the 'ORG' named entities from a text.

    Each entity starts at a token tagged "B-ORG" and absorbs the tokens that
    follow it in the NER output while they are tagged "I-ORG" (or are a
    mis-tagged word-piece continuation, see ``_continues_org_entity``). Pieces
    are joined with ``clean_word``. "I-ORG" tokens with no preceding "B-ORG"
    are ignored.

    Args:
        input_text (str): The text to extract 'ORG' entities from.
        debug (bool, optional): Whether to print the tokens as they are
            collected. Defaults to False.
        nlp (callable, optional): An already initialised NER pipeline (anything
            that maps a string to a list of dicts with "entity" and "word"
            keys). When omitted, a new pipeline is built with ``BERT_NER()``,
            which reloads the model on every call.

    Returns:
        list: The extracted entity strings, in order of appearance.
    """
    # Only load the model when the caller did not hand in a pipeline
    if nlp is None:
        nlp = BERT_NER()

    # The pipeline returns every tagged token, not only the 'ORG' ones
    ner_results = nlp(input_text)
    entities = []

    total = len(ner_results)
    idx = 0
    while idx < total:
        row = ner_results[idx]
        if "B-ORG" not in row.get("entity"):
            idx += 1
            continue

        if debug:
            print("Starting idx: ", idx, "| entity: ", row.get("entity"), "| word: ", row.get("word"))

        # First piece of the entity; drop a stray word-piece marker if present
        word = row.get("word")
        if word.startswith("##"):
            word = word[2:]
        idx += 1

        # Absorb the following pieces that belong to the same entity
        while idx < total and _continues_org_entity(ner_results[idx - 1], ner_results[idx]):
            part_word = clean_word(ner_results[idx].get("word"))
            word += part_word
            if debug:
                print("index: ", idx, "| entity: ", ner_results[idx].get("entity"), "| word: ", part_word, " | collective word: ", word)
            idx += 1

        entities.append(word.strip())

    if debug:
        print("\nExtracted entities: ", entities)

    return entities

In [None]:
example= """THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st
of February, 2024 (“Effective Date”) by and between Fresenius Kabi, LLC (“Fresenius Kabi”)
a Delaware limited liability company having offices located at Three Corporate Drive, Lake
Zurich, Illinois 60047 and Indiana University Health, Inc. with offices located at 340 W 10th
Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”)."""

In [None]:
example= """
THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st of February, 2029 (“Effective Date”) by and between Krabciuyu Fabi, LLC (Krabciuyu Fabi) a Arizona limited
liability company having offices located at Three Corporate Drive, Lake and Indiana University Health, Inc. with offices located at 340 W 10th Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”).
RECITALS WHEREAS, Krabciuyu Fabi is in the business of developing, manufacturing and marketing pharmaceutical products; WHEREAS, Customer is a Hospital System that is authorized to negotiate pricing for pharmaceutical products and
contract on behalf of its affiliated healthcare facilities and providers that are listed on Exhibit A, which is attached hereto and incorporated by
reference (“Affiliates”); WHEREAS, the parties acknowledge and agree that the intent of this Agreement is to provide access to Products at Sub-WAC prices to assist Customer with its compliance
obligations under applicable laws, including without limitation, the 340B statute (42 U.S.C.§
256(b)) and its implementing regulations. Purchases under this Agreement are not subject to the terms and conditions of any agreement that Krabciuyu Fabi has with a GPO that
Customer is associated with. NOW THEREFORE, in consideration of the mutual covenants and agreements and other good and valuable consideration hereinafter set forth, the Parties hereto agree as follows:
AGREEMENT 1. Products and Sub-WAC Contract Pricing; Offer. Customer and its Affiliates may, subject to the terms of this Agreement, purchase from Krabciuyu Fabi the products set forth on Exhibit B, which is attached hereto and incorporated by reference (“Products”), at the
prices set forth on such exhibit (“Sub-WAC Contract Pricing”). For the avoidance of doubt, this Agreement constitutes an offer to sell Products on the terms described herein. Sub-WAC Contract Pricing is subject to change at any time at Krabciuyu Fabi’s sole discretion upon thirty (30) days’ prior written notice to Customer. In the event of an anticipated price change, and to the extent Krabciuyu Fabi has sufficient inventory to do so, Krabciuyu Fabi shall satisfy all purchase orders made by Customer and its Affiliates during the fifteen (15) day notice period so long as such purchase orders do not exceed the purchasing entity’s orders for such Product in the forty-five (45) days prior to Customer’s receipt of such notice of price change. Customer and its Affiliates are under no obligation to purchase Products under this
Agreement or otherwise. For the avoidance of doubt, this Agreement does not cover Products purchased under the Drug Pricing Program that is administered by the Health Resources & Services Administration (“HRSA”); such purchases may be covered under a
separate agreement. 2. Addition and Removal of Products. The Parties may mutually agree to add additional products under this Agreement. Each additional product shall be deemed a “Product” under this Agreement and shall become subject to this Agreement upon execution
"""
entities = get_org_entities(example, debug=True)
entities

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Starting idx:  0 | entity:  B-ORG | word:  WA
Starting idx:  1 | entity:  B-ORG | word:  K
index:  2 | entity:  I-ORG | word:  rab  | collective word:  Krab
index:  3 | entity:  I-ORG | word:  ci  | collective word:  Krabci
index:  4 | entity:  I-ORG | word:  uy  | collective word:  Krabciuy
index:  5 | entity:  I-ORG | word:  u  | collective word:  Krabciuyu
index:  6 | entity:  I-ORG | word:   F  | collective word:  Krabciuyu F
index:  7 | entity:  I-ORG | word:  abi  | collective word:  Krabciuyu Fabi
index:  8 | entity:  I-ORG | word:  ,  | collective word:  Krabciuyu Fabi,
index:  9 | entity:  I-ORG | word:   LLC  | collective word:  Krabciuyu Fabi, LLC
Starting idx:  10 | entity:  B-ORG | word:  K
index:  11 | entity:  I-ORG | word:  rab  | collective word:  Krab
index:  12 | entity:  I-ORG | word:  ci  | collective word:  Krabci
index:  13 | entity:  I-ORG | word:  uy  | collective word:  Krabciuy
index:  14 | entity:  I-ORG | word:  u  | collective word:  Krabciuyu
index:  15 |

['WA',
 'Krabciuyu Fabi, LLC',
 'Krabciuyu Fabi',
 'Indiana University Health, Inc',
 'Sub',
 'GPO']

## TODO:
#### 1. A more systematic way to pass the processed text to get_org_entities()
#### 2. Use a semantic search model/method as a sanity check to further narrow down the list of entities returned by `get_org_entities` for a given example text, since the list still contains gibberish/truncated words alongside the desired ORG entities.

#### TODO: Extract & Detect address like entities


#### TODO: Extract & Detect 'Term' like entities for Termination Dates / Usually under the section titled 'Termination'


#### TODO: Price Increase % Possible sections containing this info: Products and pricing / Prices


#### TODO: Minimum Purchase requirement - fetch the minimum purchase order amount to avoid the penalty
#### Manufacturer

#### * Use 'microsoft/udop-large' for DocVQA to extract entities


In [None]:
def filter_exact_entities(text, entities):
    """
    Keep the entities that occur in ``text`` as a run of whole words.

    An entity matches when its words appear consecutively in the text,
    separated by any whitespace, and the run is bounded by whitespace or the
    text edges. Punctuation attached to the outside of the run is tolerated,
    so "Health, Inc" matches "Health, Inc." and "Krabciuyu Fabi" matches
    "(Krabciuyu Fabi)". Fragments of a longer word (e.g. "Sub" or "WA" inside
    "Sub-WAC") do not match.

    Args:
        text (str): The text the entities were extracted from.
        entities (list): Candidate entity strings.

    Returns:
        list: The matching entities, unchanged and in their original order.
        Empty or whitespace-only entities are dropped.
    """
    filtered_entities = []
    for entity in entities:
        entity_words = entity.split()
        if not entity_words:
            continue

        # whitespace/edge, optional punctuation, the words, optional punctuation, whitespace/edge
        pattern = (
            r'(?<!\S)[^\w\s]*'
            + r'\s+'.join(re.escape(word) for word in entity_words)
            + r'[^\w\s]*(?!\S)'
        )
        if re.search(pattern, text):
            filtered_entities.append(entity)

    return filtered_entities

In [None]:
# Filter limited entities that exactly match in the text
filtered_entities = filter_exact_entities(example, entities)
print(filtered_entities)# This is missing 'Indiana University Health, Inc', need to debug

['Krabciuyu Fabi, LLC', 'Krabciuyu Fabi', 'GPO']


In [None]:
# Inline version of the loop in get_org_entities(), run against the
# `ner_results` already present in the kernel instead of re-running the model.
entities= []
for idx, row in enumerate(ner_results):
  # Every "B-ORG" row opens a new entity; `idx < len(ner_results)` is always true here
  if idx < len(ner_results) and "B-ORG" in row.get("entity"):
    print("Starting idx: ", idx, "| entity: ", ner_results[idx].get("entity"), "| word: ", ner_results[idx].get("word"))
    # The first piece is stored verbatim, so a "B-ORG" word-piece keeps its '##' prefix
    word= row.get("word")
    # Local look-ahead cursor: reassigning idx does not change what enumerate yields next
    idx+=1
    # Append the "I-ORG" rows that directly follow in the result list
    while idx < len(ner_results) and "I-ORG" in ner_results[idx].get("entity"):
      part_word = ner_results[idx].get("word")
      # Strip the word-piece marker / add a separating space before concatenating
      part_word = clean_word(part_word)
      word+= part_word
      print("index: ", idx, "| entity: ",ner_results[idx].get("entity"), "| word: ", part_word, " | collective word: ", word)
      idx+=1

    entities.append(word.strip())
entities

Starting idx:  0 | entity:  B-ORG | word:  WA
Starting idx:  1 | entity:  B-ORG | word:  Fr
index:  2 | entity:  I-ORG | word:  ese  | collective word:  Frese
index:  3 | entity:  I-ORG | word:  nius  | collective word:  Fresenius
index:  4 | entity:  I-ORG | word:   Ka  | collective word:  Fresenius Ka
index:  5 | entity:  I-ORG | word:  bi  | collective word:  Fresenius Kabi
index:  6 | entity:  I-ORG | word:  ,  | collective word:  Fresenius Kabi,
index:  7 | entity:  I-ORG | word:   LLC  | collective word:  Fresenius Kabi, LLC
Starting idx:  8 | entity:  B-ORG | word:  Fr
index:  9 | entity:  I-ORG | word:  ese  | collective word:  Frese
index:  10 | entity:  I-ORG | word:  nius  | collective word:  Fresenius
index:  11 | entity:  I-ORG | word:   Ka  | collective word:  Fresenius Ka
index:  12 | entity:  I-ORG | word:  bi  | collective word:  Fresenius Kabi
index:  13 | entity:  I-ORG | word:  ”  | collective word:  Fresenius Kabi”
Starting idx:  14 | entity:  B-ORG | word:  Delawa

['WA',
 'Fresenius Kabi, LLC',
 'Fresenius Kabi”',
 'Delaware',
 'Indiana University Health, Inc',
 'Fr',
 '##esenius Kabi',
 'Hospital System',
 'Sub',
 'Fr',
 '##esenius Kabi',
 'GPO']

In [None]:
from termcolor import colored
# Function to highlight entities in the text
def highlight_entities(text, entities):
    """
    Return ``text`` with every entity span rendered in bold green.

    Args:
        text (str): The text the entity offsets refer to.
        entities (list): Dicts with integer 'start' and 'end' character
            offsets into ``text``, processed in the order given.

    Returns:
        str: The text with each span wrapped by ``termcolor.colored``.
    """
    pieces = []
    cursor = 0

    for span in entities:
        begin, finish = span['start'], span['end']
        marked = text[begin:finish]
        # Untouched text between the previous span and this one
        pieces.append(text[cursor:begin])
        pieces.append(colored(marked, 'green', attrs=['bold']))
        cursor = finish

    # Whatever follows the last span
    pieces.append(text[cursor:])
    return "".join(pieces)

# Highlight entities in the example text
highlighted_text = highlight_entities(example, ner_results)

# Print the highlighted text
print(highlighted_text)

THIS SUB-WAC PURCHASE AGREEMENT (“Agreement”) is made and entered this 1st
of February, 2024 (“Effective Date”) by and between Fresenius Kabi, LLC (“Fresenius Kabi”)
a Delaware limited liability company having offices located at Three Corporate Drive, Lake
Zurich, Illinois 60047 and Indiana University Health, Inc. with offices located at 340 W 10th
Street, Indianapolis, IN 46202 (“Customer”) (each a “Party” and collectively the “Parties”).


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# displaCy cannot render raw Hugging Face token dicts: it needs a spaCy Doc or,
# with manual=True, a dict holding the text plus its entity spans.
from spacy import displacy

# NOTE(review): assumes `example` is the text `ner_results` was computed from — confirm
displacy_input = {
    "text": example,
    "ents": [
        {"start": row["start"], "end": row["end"], "label": row["entity"]}
        for row in sorted(ner_results, key=lambda row: row["start"])
    ],
    "title": None,
}
displacy.render(displacy_input, style="ent", manual=True, jupyter=True)

### Parsing HTML text to dataframe / Python Dict / Parquet formats

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
def parse_html_table_to_df(html):
    """
    Convert the first ``<table>`` in an HTML string into a DataFrame of strings.

    Column names come from the table's ``<th>`` cells; if there are none, the
    ``<td>`` cells of the first row are used. The first ``<tr>`` is always
    treated as the header row and skipped. '|' characters are removed from
    ``<td>`` text. Rows with fewer cells than headers are padded with None,
    extra cells are dropped, and rows without any ``<td>`` are skipped.
    ``rowspan``/``colspan`` attributes are not interpreted.

    Args:
        html (str): HTML markup containing at least one ``<table>``.

    Returns:
        pandas.DataFrame: One row per data ``<tr>``; empty if the table has no rows.

    Raises:
        ValueError: If the markup contains no ``<table>`` element.
    """
    def cell_text(cell):
        return cell.get_text(strip=True).replace('|', '').strip()

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError("No <table> element found in the supplied HTML")

    table_rows = table.find_all('tr')
    headers = [header.get_text(strip=True) for header in table.find_all('th')]

    if not headers:
        if not table_rows:
            return pd.DataFrame()
        headers = [cell_text(cell) for cell in table_rows[0].find_all('td')]

    # Build row records rather than a dict keyed by header, so duplicate header
    # names stay separate columns and short rows cannot unbalance the columns.
    records = []
    for row in table_rows[1:]:
        cells = row.find_all('td')
        if not cells:
            continue
        values = [cell_text(cell) for cell in cells[:len(headers)]]
        values.extend([None] * (len(headers) - len(values)))
        records.append(values)

    return pd.DataFrame(records, columns=headers)

In [None]:
contract_dir= "pharmacy_contracts"
text_filepath= os.path.join(contract_dir, "tables_textfiles_html.txt")
text_filepath#, os.listdir(text_filepath)

'pharmacy_contracts/tables_textfiles_html.txt'

In [None]:
# Only the first line of the file is used. It is parsed with ast.literal_eval,
# which accepts plain Python literals and cannot execute code the way eval() can.
# NOTE(review): presumably a list of (name, html_table) pairs — confirm against the writer
with open(text_filepath) as fname:
  first_line = fname.readline()
li= ast.literal_eval(first_line)
li[1][1]

'<table><tr><td rowspan="2">Product Code 260929 |</td><td>NDC Number</td><td></td><td>se Product Description</td><td>Contract Affected</td><td>PK | Size</td><td>Price/ | | Each</td><td>Price / PK</td></tr><tr><td></td><td>63323026929</td><td>Diprivan®</td><td>10mg/mL 1% 20mL SDV_ | 10 Pack</td><td>*New* Indiana University Health Sub WAC</td><td>10</td><td>$2.25 |</td><td>$22.50</td></tr><tr><td>260950 |</td><td>63323026950 |</td><td>a Diprivan®</td><td>9 10mg/mL 1% 50mL SDV</td><td>*New* Indiana University Health Sub WAC</td><td>20</td><td>$5.61 |</td><td>$112.20</td></tr><tr><td>260965 |</td><td>63323026965 |</td><td>a Diprivan®</td><td>9 10mg/mL 1% 100mL SDV</td><td>*New* Indiana University Health Sub WAC</td><td>10</td><td>$11.21</td><td>| $112.10</td></tr></table>'

In [None]:
# Passing literal HTML strings to read_html is deprecated; wrap them in a file-like object
df1= pd.read_html(io.StringIO(li[0][1]))[0]
df1

Unnamed: 0,FK Account #,Unnamed: 1,Facility,Name,Address,DEA,HIN
0,600929,I,Health,. BallMem oral,"2401 Unwersity Drive, Muncie, IN 47303",AB2645464,42110086
1,600066,IU,Health Bbom,ngton,"2651 E Discovery Pkwy, Blbom ngton, IN | 47408",AB2687284 |,0GJ71Q X00
2,70385001,,TU Health,. Methodist,"1701 N Senate Bld, Indanapols, IN 46206",BC5175535 |,FED2VTW F3
3,403914,,WU Health,Saxony,"13000 136th Street, Fishers, IN 46037",FI2888115 |,98G43HLOO
4,400528016 |,@,Heath Shared,. Serves y,"390 Ajirtech Pkwy Suite | “10968, Phnfel, IN 4...",FI2501751 |,D874BGN00
5,66279601,,. IU Health,. Unwversity,"550 N Unwersity Bld, Ind‘anapols, IN 46202",BC5175561,| BERMLH600
6,736370,. Riky,. Hospitalfor at IU,. Chiliren Health,"705 Riky Hospital | 5 ve, indianapolis, IN | 4...",BC5175511,
7,40052807,IU,Health Hem Pham,Onc - East acy,"6845 Rama Drwe, Ind‘anapols, IN 46219",BA3876438,L2ARED FF2
8,40052808,,TU Health Hem Fishers,Onc - Pharm acy,"10212 Lantem Drwe, Fishers, IN 46037",BW 8232477,| E4FLICDF1
9,748974,,IU Health,Morgan,"2209 John R W ooden Drive, Martnsvilk, IN 46151",FI5116682 |,422H2KBF1


In [None]:
# Passing literal HTML strings to read_html is deprecated; wrap them in a file-like object
df2= pd.read_html(io.StringIO(li[1][1]))[0]
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Product Code 260929 |,NDC Number,,se Product Description,Contract Affected,PK | Size,Price/ | | Each,Price / PK,
1,Product Code 260929 |,,63323026929,Diprivan®,10mg/mL 1% 20mL SDV_ | 10 Pack,*New* Indiana University Health Sub WAC,10,$2.25 |,$22.50
2,260950 |,63323026950 |,a Diprivan®,9 10mg/mL 1% 50mL SDV,*New* Indiana University Health Sub WAC,20,$5.61 |,$112.20,
3,260965 |,63323026965 |,a Diprivan®,9 10mg/mL 1% 100mL SDV,*New* Indiana University Health Sub WAC,10,$11.21,| $112.10,


In [None]:
df1.columns, df2.columns

(Index(['FK Account #', 'Unnamed: 1', 'Facility', 'Name', 'Address', 'DEA',
        'HIN'],
       dtype='object'),
 Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64'))

In [None]:
df2.to_dict()

{0: {0: 'Product Code 260929 |',
  1: 'Product Code 260929 |',
  2: '260950 |',
  3: '260965 |'},
 1: {0: 'NDC Number', 1: nan, 2: '63323026950 |', 3: '63323026965 |'},
 2: {0: nan, 1: '63323026929', 2: 'a Diprivan®', 3: 'a Diprivan®'},
 3: {0: 'se Product Description',
  1: 'Diprivan®',
  2: '9 10mg/mL 1% 50mL SDV',
  3: '9 10mg/mL 1% 100mL SDV'},
 4: {0: 'Contract Affected',
  1: '10mg/mL 1% 20mL SDV_ | 10 Pack',
  2: '*New* Indiana University Health Sub WAC',
  3: '*New* Indiana University Health Sub WAC'},
 5: {0: 'PK | Size',
  1: '*New* Indiana University Health Sub WAC',
  2: '20',
  3: '10'},
 6: {0: 'Price/ | | Each', 1: '10', 2: '$5.61 |', 3: '$11.21'},
 7: {0: 'Price / PK', 1: '$2.25 |', 2: '$112.20', 3: '| $112.10'},
 8: {0: nan, 1: '$22.50', 2: nan, 3: nan}}

In [None]:
html_table= li[1][1]
html_table

'<table><tr><td rowspan="2">Product Code 260929 |</td><td>NDC Number</td><td></td><td>se Product Description</td><td>Contract Affected</td><td>PK | Size</td><td>Price/ | | Each</td><td>Price / PK</td></tr><tr><td></td><td>63323026929</td><td>Diprivan®</td><td>10mg/mL 1% 20mL SDV_ | 10 Pack</td><td>*New* Indiana University Health Sub WAC</td><td>10</td><td>$2.25 |</td><td>$22.50</td></tr><tr><td>260950 |</td><td>63323026950 |</td><td>a Diprivan®</td><td>9 10mg/mL 1% 50mL SDV</td><td>*New* Indiana University Health Sub WAC</td><td>20</td><td>$5.61 |</td><td>$112.20</td></tr><tr><td>260965 |</td><td>63323026965 |</td><td>a Diprivan®</td><td>9 10mg/mL 1% 100mL SDV</td><td>*New* Indiana University Health Sub WAC</td><td>10</td><td>$11.21</td><td>| $112.10</td></tr></table>'

In [None]:
# Parse HTML table to DataFrame
df = parse_html_table_to_df(html_table)
df

Unnamed: 0,Product Code 260929,NDC Number,Unnamed: 3,se Product Description,Contract Affected,PK Size,Price/ Each,Price / PK
0,,63323026929,Diprivan®,10mg/mL 1% 20mL SDV_ 10 Pack,*New* Indiana University Health Sub WAC,10,$2.25,$22.50
1,260950.0,63323026950,a Diprivan®,9 10mg/mL 1% 50mL SDV,*New* Indiana University Health Sub WAC,20,$5.61,$112.20
2,260965.0,63323026965,a Diprivan®,9 10mg/mL 1% 100mL SDV,*New* Indiana University Health Sub WAC,10,$11.21,$112.10


In [None]:
table = pa.Table.from_pandas(df)
table

pyarrow.Table
Product Code 260929: string
NDC Number: string
: string
se Product Description: string
Contract Affected: string
PK  Size: string
Price/   Each: string
Price / PK: string
----
Product Code 260929: [["","260950","260965"]]
NDC Number: [["63323026929","63323026950","63323026965"]]
: [["Diprivan®","a Diprivan®","a Diprivan®"]]
se Product Description: [["10mg/mL 1% 20mL SDV_  10 Pack","9 10mg/mL 1% 50mL SDV","9 10mg/mL 1% 100mL SDV"]]
Contract Affected: [["*New* Indiana University Health Sub WAC","*New* Indiana University Health Sub WAC","*New* Indiana University Health Sub WAC"]]
PK  Size: [["10","20","10"]]
Price/   Each: [["$2.25","$5.61","$11.21"]]
Price / PK: [["$22.50","$112.20","$112.10"]]

In [None]:
pq.write_table(table, 'output.parquet')

In [None]:
 reloaded_table= pq.read_table('output.parquet')
 reloaded_table

pyarrow.Table
Product Code 260929: string
NDC Number: string
: string
se Product Description: string
Contract Affected: string
PK  Size: string
Price/   Each: string
Price / PK: string
----
Product Code 260929: [["","260950","260965"]]
NDC Number: [["63323026929","63323026950","63323026965"]]
: [["Diprivan®","a Diprivan®","a Diprivan®"]]
se Product Description: [["10mg/mL 1% 20mL SDV_  10 Pack","9 10mg/mL 1% 50mL SDV","9 10mg/mL 1% 100mL SDV"]]
Contract Affected: [["*New* Indiana University Health Sub WAC","*New* Indiana University Health Sub WAC","*New* Indiana University Health Sub WAC"]]
PK  Size: [["10","20","10"]]
Price/   Each: [["$2.25","$5.61","$11.21"]]
Price / PK: [["$22.50","$112.20","$112.10"]]

In [None]:
reloaded_table.to_pandas()

Unnamed: 0,Product Code 260929,NDC Number,Unnamed: 3,se Product Description,Contract Affected,PK Size,Price/ Each,Price / PK
0,,63323026929,Diprivan®,10mg/mL 1% 20mL SDV_ 10 Pack,*New* Indiana University Health Sub WAC,10,$2.25,$22.50
1,260950.0,63323026950,a Diprivan®,9 10mg/mL 1% 50mL SDV,*New* Indiana University Health Sub WAC,20,$5.61,$112.20
2,260965.0,63323026965,a Diprivan®,9 10mg/mL 1% 100mL SDV,*New* Indiana University Health Sub WAC,10,$11.21,$112.10


In [None]:
!pip freeze >>requirements.txt
!python -V

Python 3.10.12


In [None]:
!pip install gradio
!pip install langchain
!pip install langchain-community
!pip install faiss-gpu
!pip install accelerate
!pip install sentence-transformers
!pip install unstructured

!pip install transformers -U
!pip install  llama-index==0.10.12
!pip install einops
!pip install accelerate
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-fastembed
!pip install fastembed
!pip install pdfplumber

In [None]:
!pip install -U tokenizers

Collecting tokenizers
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastembed 0.2.7 requires tokenizers<0.16,>=0.15, but you have tokenizers 0.19.1 which is incompatible.
transformers 4.41.2 requires huggingface-hub<1.0,>=0.23.0, but you have huggingface-hub 0.20.3 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.19.1


In [None]:
!pip install -U huggingface-hub
!pip show huggingface-hub

Name: huggingface-hub
Version: 0.23.3
Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
Home-page: https://github.com/huggingface/huggingface_hub
Author: Hugging Face, Inc.
Author-email: julien@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, packaging, pyyaml, requests, tqdm, typing-extensions
Required-by: accelerate, fastembed, gradio, gradio_client, llama-index-llms-huggingface, sentence-transformers, text-generation, tokenizers, transformers


In [None]:
import torch
import time
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [None]:
# Phi-3 chat markup: every turn is "<|role|>\n...<|end|>\n" and generation is
# cued by a trailing "<|assistant|>\n". The role tags are lowercase special
# tokens; an upper-case "<|SYSTEM|>" is not one of them and would be tokenized
# as ordinary text.
system_prompt = (
    "<|system|>\n"
    "You are a Q&A assistant Expert in analyzing legal contracts.<|end|>\n"
)

# HuggingFaceLLM prepends `system_prompt` to the wrapped query on its own, so
# the wrapper carries only the user turn and the assistant cue. Repeating the
# system turn here would send it to the model twice.
query_wrapper_prompt = (
    "<|user|>\n"
    "{query_str}<|end|>\n"
    "<|assistant|>\n"
)

# Load Phi-3-mini (4k context) through llama-index's HuggingFaceLLM wrapper.
llm = HuggingFaceLLM(
    context_window=4096,
    # Prompt and completion share the context window, so this has to stay
    # well below `context_window`; raise it only as far as the prompts allow.
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="microsoft/Phi-3-mini-4k-instruct",
    model_name="microsoft/Phi-3-mini-4k-instruct",
    device_map="auto",
    tokenizer_kwargs={"max_length": 4096},
    # Half precision to reduce GPU memory usage.
    model_kwargs={"torch_dtype": torch.float16},
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Settings.llm = llm
# Settings.chunk_size = 512
# embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = embed_model
# Settings.chunk_size = 512
# documents = SimpleDirectoryReader(text_filepath).load_data()
# index = VectorStoreIndex.from_documents(documents, show_progress=True)
# query_engine = index.as_query_engine()

In [None]:
!pip install flashlight-text

Collecting flashlight-text
  Downloading flashlight_text-0.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flashlight-text
Successfully installed flashlight-text-0.0.7


In [None]:
!git clone https://github.com/flashlight/sequence && cd sequence

Cloning into 'sequence'...
remote: Enumerating objects: 274, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (102/102), done.[K
remote: Total 274 (delta 106), reused 177 (delta 80), pack-reused 43[K
Receiving objects: 100% (274/274), 87.62 KiB | 801.00 KiB/s, done.
Resolving deltas: 100% (108/108), done.


In [None]:
%cd sequence/
!ls

bindings  CMakeLists.txt      CONTRIBUTING.md  MANIFEST.in     setup.py
CITATION  codecov.yml	      flashlight       pyproject.toml
cmake	  CODE_OF_CONDUCT.md  LICENSE	       README.md


In [None]:
!pip install .

Processing /content/sequence
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: flashlight-sequence
  Building wheel for flashlight-sequence (pyproject.toml) ... [?25l[?25hdone
  Created wheel for flashlight-sequence: filename=flashlight_sequence-0.0.0+158f467.d20240612-cp310-cp310-linux_x86_64.whl size=164521 sha256=f80bfd4c6dd34637f624126a9c33c7d563f63de5ff2afced2ed511ef755f2c62
  Stored in directory: /tmp/pip-ephem-wheel-cache-sllql8us/wheels/d4/06/01/6da666625d66b1e79ee67de3859d94f197166d3681b5e4694c
Successfully built flashlight-sequence
Installing collected packages: flashlight-sequence
Successfully installed flashlight-sequence-0.0.0+158f467.d20240612


In [None]:
!pip install flash-attn

Collecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.5.9.post1-cp310-cp310-linux_x86_64.whl size=120889689 sha256=5022ba11d48bf74926da9c16260f4ea2b9bb7f4e29bdb4bd6e1383ad1c55d16f
  Stored in directory: /root/.cache/pip/wheels/cc/ad/f6/7ccf0238790d6346e9fe622923a76ec218e890d356b9a2754a
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.5.9.post1


In [None]:
# Loading Model and Tokeniser using 'HuggingFaceLLM'
llm = HuggingFaceLLM(
    context_window=8192,
    max_new_tokens=512,# Increase this to 4096
    # attn_implementation="eager",
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    # generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="microsoft/Phi-3-small-8k-instruct",
    model_name="microsoft/Phi-3-small-8k-instruct",
    device_map="auto",
    tokenizer_kwargs={"max_length": 8192},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16})

The repository for microsoft/Phi-3-small-8k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-small-8k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for microsoft/Phi-3-small-8k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-small-8k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


ConnectTimeout: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7d1549764100>, 'Connection to openaipublic.blob.core.windows.net timed out. (connect timeout=None)'))

In [None]:
# Loading Model and Tokeniser using 'AutoModelForCausalLM' and 'AutoTokenizer'

# torch.random.manual_seed(0)
# model_id = "microsoft/Phi-3-small-8k-instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype="auto",
#     trust_remote_code=True,
# )
# assert torch.cuda.is_available(), "This model needs a GPU to run ..."
# device = torch.cuda.current_device()
# model = model.to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_id)


A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- tokenization_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- configuration_phi3_small.py
- tokenization_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- triton_flash_blocksparse_attn.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mi

positional_embedding.py:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- positional_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- modeling_phi3_small.py
- triton_blocksparse_attention_layer.py
- positional_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

ConnectTimeout: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7d1378076650>, 'Connection to openaipublic.blob.core.windows.net timed out. (connect timeout=None)'))

### To resolve the errors above when loading the 'microsoft/Phi-3-small-8k-instruct' model -
* `MaxRetryError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7d1378076650>, 'Connection to openaipublic.blob.core.windows.net timed out. (connect timeout=None)'))`.

* `ConnectTimeout: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7d1378076650>, 'Connection to openaipublic.blob.core.windows.net timed out. (connect timeout=None)'))`.


We manually set up the 'tiktoken' encodings on our local machine. This allowed the model load to start, but it eventually got stuck partway through loading due to GPU memory constraints.



In [None]:
import tiktoken_ext.openai_public
import inspect

# List the names exposed by the tiktoken_ext.openai_public module.
print(dir(tiktoken_ext.openai_public))
# Print the source of the cl100k_base constructor to see which URL it loads
# its BPE ranks from (the URL that timed out in the errors above).
print(inspect.getsource(tiktoken_ext.openai_public.cl100k_base))

['ENCODING_CONSTRUCTORS', 'ENDOFPROMPT', 'ENDOFTEXT', 'FIM_MIDDLE', 'FIM_PREFIX', 'FIM_SUFFIX', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'cl100k_base', 'data_gym_to_mergeable_bpe_ranks', 'gpt2', 'load_tiktoken_bpe', 'o200k_base', 'p50k_base', 'p50k_edit', 'r50k_base']
def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": speci

In [None]:
!pwd
!ls

/content
encodings  sequence


In [None]:
import hashlib

# URL that tiktoken fetches the cl100k_base BPE ranks from.
blobpath = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# SHA-1 hex digest of the URL; a later cell expects a file with this name
# inside the tiktoken cache directory.
cache_key = hashlib.new("sha1", blobpath.encode()).hexdigest()
print(cache_key)

9b5ad71b2ce5302211f9c61530b329a4922fc6a4


In [None]:
!ls

9b5ad71b2ce5302211f9c61530b329a4922fc6a4  encodings  sequence


In [None]:
# !mv 9b5ad71b2ce5302211f9c61530b329a4922fc6a4 encodings/
!ls encodings

9b5ad71b2ce5302211f9c61530b329a4922fc6a4


In [None]:
import os
import tiktoken
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Point tiktoken at the local directory that holds the pre-downloaded encoding
# file. An absolute path is used because the notebook changes the working
# directory with `%cd`; a relative cache dir would then resolve elsewhere when
# tiktoken later looks the file up.
tiktoken_cache_dir = os.path.abspath("encodings")
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir

# Validate that the cached file, named by `cache_key` (computed in an earlier
# cell), is actually present before relying on it.
cached_encoding_path = os.path.join(tiktoken_cache_dir, cache_key)
assert os.path.exists(cached_encoding_path), (
    f"tiktoken cache file not found: {cached_encoding_path}"
)

In [None]:
# Smoke test: build the cl100k_base encoding and encode a short string to
# confirm the encoding can be constructed and used.
encoding = tiktoken.get_encoding("cl100k_base")
encoding.encode("Hello, world")

[9906, 11, 1917]

In [None]:
# Fail fast: check for a GPU before the multi-shard checkpoint is loaded,
# rather than after the expensive load has already happened.
assert torch.cuda.is_available(), "This model needs a GPU to run ..."

torch.random.manual_seed(0)
model_id = "microsoft/Phi-3-small-8k-instruct"

# The repository ships custom modeling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
device = torch.cuda.current_device()
model = model.to(device)

# The tokenizer is custom code from the same repository; without
# trust_remote_code=True the load stops at an interactive "[y/N]" prompt.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-small-8k-instruct/snapshots/f196467b67c13127747a03c142e09aa6841447b8/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-small-8k-instruct/snapshots/f196467b67c13127747a03c142e09aa6841447b8/config.json
Model config Phi3SmallConfig {
  "_name_or_path": "microsoft/Phi-3-small-8k-instruct",
  "architectures": [
    "Phi3SmallForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-small-8k-instruct--configuration_phi3_small.Phi3SmallConfig",
    "AutoModelForCausalLM": "microsoft/Phi-3-small-8k-instruct--modeling_phi3_small.Phi3SmallForCausalLM",
    "AutoTokenizer": "microsoft/Phi-3-small-8k-instruct--tokenization_phi3_small.Phi3SmallTokenizer"
  },
  "blocksparse_block_size": 64,
  "blocksparse_homo_head_pattern": false,
  "blocksparse_num_local

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:

# Short multi-turn chat used as a smoke test that the loaded model generates.
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

# Text-generation pipeline around the `model` and `tokenizer` loaded in the
# previous cell, running on the same CUDA device.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device
)

# Greedy decoding. `temperature` is deliberately not set: it only applies when
# sampling, and passing it alongside do_sample=False is contradictory and
# triggers a generation-config warning without changing the output.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])
