In [1]:
from datasets import load_dataset

# Load the `glaiveai/glaive-function-calling` dataset via `datasets.load_dataset`.
dataset = load_dataset("glaiveai/glaive-function-calling")

In [2]:
# Keep only the training rows whose raw "sample" text contains the phrase
# "you can use the functions if needed".
function_dataset = dataset["train"].filter(lambda x: "you can use the functions if needed" in x["sample"])

In [3]:
# Print the raw text of the first remaining row to inspect its format.
sample = function_dataset[0]["sample"]
print(sample)

SYSTEM: You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed-
{
    "name": "generate_anagram",
    "description": "Generate an anagram of a given word",
    "parameters": {
        "type": "object",
        "properties": {
            "word": {
                "type": "string",
                "description": "The word to generate an anagram of"
            }
        },
        "required": [
            "word"
        ]
    }
}
USER: Can you help me generate an anagram of the word "listen"?
ASSISTANT: <functioncall> {"name":"generate_anagram", "arguments": {"word": "listen"}}
FUNCTION RESPONSE: {"anagram": "silent"}
ASSISTANT: The anagram of the word "listen" is "silent".
USER: That's amazing! Can you generate an anagram for the word "race"?
ASSISTANT: <functioncall> {"name":"generate_anagram", "arguments": {"word": "race"}}
FUNCTION RESPONSE: {"anagram": "care"}
ASSISTANT: The anagram of the word "race" is "care".

In [4]:
def make_chatml(name, role, content):
    """Build one chat message dict with the keys `name`, `role` and `content`."""
    return {"name": name, "role": role, "content": content}


def system(name, content):
    """Build a message whose role is "system" and whose name is `name`."""
    return make_chatml(name, "system", content)


def situation(content):
    """Build a system message named "situation"."""
    return system("situation", content)


def thought(content):
    """Build a system message named "thought"."""
    return system("thought", content)


def information(content):
    """Build a system message named "information"."""
    return system("information", content)


def me(content, name=None):
    """Build a message whose role is "assistant"; `name` is optional."""
    return make_chatml(name, "assistant", content)


def person(content, name=None):
    """Build a message whose role is "user"; `name` is optional."""
    return make_chatml(name, "user", content)

In [5]:
import json, re

def normalize_json(s, indent=None):
    """Re-serialize a JSON string through `json.loads` / `json.dumps`.

    Args:
        s: Text that must parse as JSON.
        indent: Forwarded to `json.dumps` (None gives the compact one-line form).

    Returns:
        The re-serialized JSON string.

    Raises:
        json.JSONDecodeError: If `s` is not valid JSON.
    """
    return json.dumps(json.loads(s), indent=indent)


def to_chatml_message(role, content):
    """Convert one `ROLE: content` turn of a raw sample into a message dict.

    Args:
        role: Speaker label taken from the sample (for example "SYSTEM",
            "USER", "ASSISTANT" or "FUNCTION RESPONSE"); surrounding
            whitespace is ignored.
        content: Text of the turn; surrounding whitespace is ignored.

    Returns:
        * content starting with "<functioncall>": a dict with role
          "function_call" whose content is the normalized JSON payload;
        * role "FUNCTION RESPONSE": the result of `information(...)` wrapping
          the normalized JSON result;
        * role "SYSTEM": a system message named "functions" holding the
          function schemas inside a fenced block;
        * anything else: a dict with the lower-cased role and the content.

    Raises:
        json.JSONDecodeError: If a function call or function response payload
            is still not valid JSON after the clean-ups below.
        ValueError: If a SYSTEM turn contains no "{" character.
    """
    content = content.strip()
    role = role.strip()

    # Parse function calls
    call_prefix = "<functioncall>"

    if content.startswith(call_prefix):
        # Raw string: the backslash classes are regex escapes, not string
        # escapes (a non-raw literal triggers a SyntaxWarning on Python 3.12+).
        _, *payload = re.split(call_prefix + r"[_\w]*\s", content)
        payload = "".join(payload)

        # NOTE(review): these replacements look like dataset-specific patches
        # for payloads that are not valid JSON as written (quoted argument
        # objects, a trailing period, unquoted decade tokens) -- confirm.
        payload = payload\
            .replace("'{", '{')\
            .replace("}'", '}')\
            .replace('}.', '}')\
            .replace('1990s', '"1990s"')\
            .replace('1980s', '"1980s"')

        payload = normalize_json(payload)

        return dict(role="function_call", content=payload)

    # Parse information
    if role == "FUNCTION RESPONSE":
        # Strip /* ... */ comments and a few literal fragments before parsing.
        # NOTE(review): the literal replacements appear to target specific
        # rows whose responses are otherwise invalid JSON -- confirm.
        content = re.sub(r"/\*.+\*/", "", content)\
            .replace('15.6"', "15.6in")\
            .replace("123,456", "123456")\
            .replace("(QR code image data)", "")\
            .replace("(example response)", "")\
            .replace('" by', " by")

        payload = normalize_json(content)

        return information(f"Result:\n{payload}")

    # Parse functions
    if role == "SYSTEM":
        # Drop the prose before the first schema and separate consecutive
        # schemas with a blank line.
        content = content[content.index('{'):]
        content = content.replace("}\n{", "}\n\n{")

        return dict(
            role="system",
            name="functions",
            content=f"Available functions and their signatures as JSON schema:\n\n```\n{content}\n```",
        )

    # else, return as is
    return dict(role=role.lower(), content=content)

def to_chatml(row):
    """Convert one raw dataset row into a list of chat messages.

    The row's "sample" text is cleaned, split into `ROLE: content` turns and
    each turn is converted with `to_chatml_message`. A fixed "situation"
    system message is placed first.

    Args:
        row: Mapping with a "sample" key holding the raw conversation text.

    Returns:
        `dict(chatml=[...])` with the converted messages, or
        `dict(chatml=[])` when the sample cannot be converted or contains no
        function call (the reason is printed).
    """
    sample = row["sample"]
    # Remove ", ..." ellipsis fragments, unescape \' and normalize one phrasing
    # of the call marker.
    sample = re.sub(r",\s*\.\.\.", "", sample)\
        .replace("\\\'", "'")\
        .replace("Calling the <functioncall> function.", "<functioncall>")

    # Split on "<newline>ROLE: " headers; the capture group keeps the role, so
    # the result alternates role, content, role, content, ...
    # Raw string so the backslash classes reach the regex engine untouched.
    splits = re.split(r"\n([A-Z]+\s*[A-Z]+):\s", '\n'+sample)[1:]

    chatml = [
        situation(
            "A user is talking to their helpful AI Assistant that can help them accomplish different tasks."
            " In order to do so, the AI has access to various functions that it can call as described below."
            "\n\nIn order to call a function, AI just needs to specify the name of the function to call and its arguments as a valid JSON string."
        ),
    ]

    try:
        chatml.extend([
            to_chatml_message(
                role=splits[i],
                content=splits[i+1],
            )
            for i in range(0, len(splits), 2)
        ])

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not any(k["role"] == "function_call" for k in chatml):
            raise ValueError("sample contains no function call")

    # `Exception`, not `BaseException`: KeyboardInterrupt / SystemExit must
    # still be able to stop a long-running `map`.
    except Exception as e:
        print(e)
        return dict(chatml=[])

    return dict(chatml=chatml)

In [6]:
# Keep only rows that contain a "<functioncall>" marker and exclude rows
# matching the text patterns below.
# NOTE(review): the excluded patterns look like rows whose JSON is malformed
# or truncated -- confirm against the data.
function_dataset = function_dataset.filter(
    lambda x: (
        "<functioncall>" in x["sample"]
        and '"lyrics": "' not in x["sample"]
        and '["artist":' not in x["sample"]
        and not x["sample"].endswith(': "')
    )
)

# Convert every remaining row; `to_chatml` returns a dict with a "chatml" key.
function_dataset = function_dataset.map(to_chatml)

In [7]:
# Drop rows that failed conversion (`to_chatml` returns an empty "chatml" list
# for those), then remove the original columns so only "chatml" is kept.
# `remove_columns` is documented to take a str or a list of str, so pass a
# sorted list rather than a set.
function_dataset = function_dataset.filter(lambda x: len(x["chatml"]) > 0).remove_columns(
    sorted(set(dataset["train"].column_names) - {"chatml"})
)

In [8]:
# Upload the converted dataset to the "diwank/glaive-chatml" repo (private).
function_dataset.push_to_hub("diwank/glaive-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/476 [00:00<?, ?B/s]

In [9]:
def extract_names(row):
    """List the function names declared in a row's "functions" message.

    The second message of `row["chatml"]` is expected to hold the function
    schemas inside a fenced block, separated from each other by blank lines.

    Returns:
        `dict(names=[...])` with one name per declared function, in order.
    """
    functions_text = row["chatml"][1]["content"]
    # The schemas are whatever sits between the first pair of fences.
    fenced_block = functions_text.split("```")[1]
    schemas = list(map(json.loads, fenced_block.strip().split("\n\n")))
    return {"names": [schema["name"] for schema in schemas]}

# Add a "names" column listing the functions declared in each row.
ds_names = function_dataset.map(extract_names)

Map:   0%|          | 0/41767 [00:00<?, ? examples/s]

In [10]:
# Every function name declared by at least one row of `ds_names`.
all_fns = {name for names in ds_names["names"] for name in names}

In [11]:
# Candidate function names used later as the "unknown function" in negative
# examples (see `add_neg_example`).
# NOTE(review): the provenance of this list is not shown in the notebook -- confirm.
other_names = ['weather_forecaster',
 'data_filter',
 'get_articles',
 'email_notification',
 'weather_data_analysis',
 'resize_image',
 'file_converter',
 'search_articles',
 'data_cleaning',
 'blog_post_analysis',
 'email_notifications',
 'csv_parser',
 'calculator',
 'document_analysis',
 'word_count',
 'image_search',
 'image_manipulation',
 'topic_modeling',
 'multi_item_purchase_analysis',
 'sales_report_analysis',
 'blog_filter',
 'article_search',
 'age_filter',
 'image_classification',
 'trending_analysis',
 'server_monitoring',
 'employee_lookup',
 'image_recognition',
 'web_search',
 'term_count',
 'email_response',
 'text_parser',
 'stock_data',
 'employee_distribution',
 'email_filter',
 'video_filter',
 'json_parser',
 'inventory_management',
 'sentiment_analysis',
 'email_analytics',
 'blog_search',
 'get_posts',
 'data_extraction',
 'color_adjuster',
 'file_reader',
 'file_upload',
 'movie_recommendation',
 'sort',
 'document_filter',
 'data_visualization',
 'csv_analytics',
 'weather_prediction',
 'blog_post_reader',
 'content_filtering',
 'analyze_text',
 'file_conversion',
 'word_counter',
 'text_analytics',
 'translator',
 'folder_management',
 'academic_search',
 'product_review',
 'analyze_sentiment',
 'text_extraction',
 'weather_data',
 'data_retrieval',
 'sales_analytics',
 'email_send',
 'pdf_search',
 'time_period_analysis',
 'weather_analysis',
 'keyword_extraction',
 'file_manager',
 'country_info',
 'document_converter',
 'product_sales_analysis',
 'file_downloader',
 'search_events',
 'sort_data',
 'email_analysis',
 'sales_forecast',
 'keyword_analysis',
 'review_analyzer',
 'tweet_search',
 'data_aggregation',
 'document_finder',
 'date_filtering',
 'pdf_to_text',
 'document_analytics',
 'data_analysis',
 'music_recommendation',
 'location_search',
 'sales_report_generation',
 'inventory_check',
 'document_summarizer',
 'email_manager',
 'spreadsheet_analytics',
 'social_media_search',
 'file_management',
 'email_sentiment_analysis',
 'website_visit_counter',
 'send_email',
 'key_phrase_extraction',
 'dataset_filter',
 'filter_articles',
 'text_to_speech',
 'email_sending',
 'employee_info',
 'image_analysis',
 'product_lookup',
 'email_finder',
 'entity_extraction',
 'file_retrieval',
 'data_extractor',
 'web_scraping',
 'data_filtering',
 'database_filter',
 'image_resizer',
 'detect_objects',
 'text_translation',
 'file_search',
 'generate_word_cloud',
 'currency_converter',
 'email_alerts',
 'file_uploader',
 'data_export',
 'generate_graph',
 'database_download',
 'file_date_filter',
 'search_documents',
 'text_search',
 'log_monitoring',
 'get_blog_posts',
 'image_classifier',
 'content_analysis',
 'term_finder',
 'purchase_history',
 'city_information',
 'file_operations',
 'filter_by_keywords',
 'csv_sorting',
 'sort_by_date',
 'pdf_generation',
 'text_summarization',
 'customer_details',
 'search',
 'city_info',
 'text_generator',
 'text_analyzer',
 'weather_report',
 'csv_data_analysis',
 'database_query',
 'image_processor',
 'phrase_finder',
 'filter_by_date_range',
 'csv_reader',
 'data_encryption',
 'search_engine_query',
 'product_sales_count',
 'temperature_filter',
 'generate_report',
 'social_media_monitoring',
 'task_reminder',
 'web_scraper',
 'document_summary',
 'monthly_sales_report',
 'image_resizing',
 'reminder_setter',
 'calendar_manager',
 'pdf_extractor',
 'twitter_data_extraction',
 'document_editor',
 'social_media_analytics',
 'speech_to_text',
 'product_review_analysis',
 'search_engine',
 'translate_text',
 'text_analysis',
 'document_search',
 'image_enhancement',
 'image_processing',
 'file_sorter',
 'data_sorting',
 'sales_search',
 'calculate_average',
 'stock_price_check',
 'file_filter',
 'pdf_conversion',
 'document_conversion',
 'excel_analytics',
 'email_notifier',
 'review_parser',
 'article_analysis',
 'get_related_posts',
 'news_search',
 'doc_translate',
 'average_temperature',
 'user_profile_update',
 'analyze_hashtag',
 'filter_data',
 'pdf_analytics',
 'travel_planner',
 'dataset_statistics',
 'email_search',
 'tweet_filter',
 'text_classification',
 'pdf_to_word',
 'product_sales',
 'date_calculator',
 'team_management',
 'movie_database',
 'email_retrieval',
 'article_finder',
 'article_summary',
 'text_extractor',
 'text_summarizer',
 'create_playlist',
 'document_management',
 'doc_search',
 'file_compressor',
 'post_filter',
 'book_recommender',
 'book_review_analytics',
 'twitter_search',
 'file_download',
 'fetch_tweets',
 'book_search',
 'file_transfer',
 'customer_segmentation',
 'customer_feedback',
 'xml_parser',
 'pdf_parser',
 'product_counter',
 'data_scraper',
 'csv_export',
 'order_history',
 'table_operations',
 'entity_recognition',
 'csv_filter',
 'text_processing',
 'product_recommender',
 'document_translation',
 'movie_recommender',
 'blog_post_analytics',
 'synonym_finder',
 'time_zone_info',
 'product_review_analytics',
 'sales_report',
 'video_editing',
 'weather_forecast',
 'filter_content',
 'pdf_converter',
 'email_sender',
 'fetch_news',
 'keyword_search',
 'website_scraper',
 'research_paper_search',
 'project_filter',
 'pdf_reader',
 'dataset_analytics',
 'content_filter',
 'review_analysis',
 'product_search',
 'post_searcher',
 'date_range_filter',
 'database_search',
 'time_series_analysis',
 'document_classification',
 'location_filter',
 'audio_processing',
 'find_articles',
 'recipe_search',
 'image_downloader',
 'file_analytics',
 'get_weather_forecast',
 'data_visualizer',
 'html_parser',
 'sales_csv_analytics',
 'date_filter']

In [12]:
# Keep only the candidate names that no row of the dataset declares.
# Sorted so the order -- and therefore the name `add_neg_example` picks for each
# row index -- is the same on every run; the iteration order of a set of
# strings changes between interpreter runs because of hash randomization.
other_names = sorted(set(other_names).difference(all_fns))

In [13]:
# Number of candidate names left after removing those the dataset declares.
len(other_names)

269

In [14]:
# Aim for about five negative examples per candidate name.
num_to_keep = len(other_names) * 5
# Taking one shard out of `num_shards` keeps roughly `num_to_keep` rows.
# NOTE(review): this is 0 when the dataset has fewer rows than `num_to_keep`
# (and divides by zero when `other_names` is empty) -- confirm that cannot happen.
num_shards = len(function_dataset) // num_to_keep

subset_ds = function_dataset.shard(num_shards=num_shards, index=0)
len(subset_ds)

1348

In [15]:
def add_neg_example(row, idx):
    """Turn a row into a "function not found" negative example.

    Only the first two messages of the row are kept. A user message asking to
    run a function taken from the module-level `other_names` list is appended,
    followed by a `function_call` message holding a `not_found` error JSON and
    an assistant apology. The first message's content gets an extra sentence
    describing the `not_found` convention.

    Args:
        row: Mapping with a "chatml" list of message dicts.
        idx: Row index; selects the name as `other_names[idx % len(other_names)]`.

    Returns:
        `dict(chatml=[...])` with the new conversation. The input row is not
        modified.
    """
    # Copy the leading messages so appending to the first message's content
    # below does not leak into the caller's row.
    chatml = [dict(message) for message in row["chatml"][:2]]
    neg_function_name = other_names[idx % len(other_names)]

    neg_function_content = dict(error=f"`{neg_function_name}` is not a valid function.", error_code="not_found")
    neg_function_call = dict(role="function_call", content=json.dumps(neg_function_content))

    chatml.append(person(f"Can you please run the `{neg_function_name}` function?"))
    chatml.append(neg_function_call)
    chatml.append(me("Sorry but I don't know how to do that!"))

    # Add neg declaration
    chatml[0]["content"] += " If a function with the given name is not found, please respond with an error (code: not_found) json in the function call."

    return dict(chatml=chatml)

# Build the negatives; `with_indices=True` passes the row index that
# `add_neg_example` uses to pick a name.
neg_ds = subset_ds.map(add_neg_example, with_indices=True)

Map:   0%|          | 0/1348 [00:00<?, ? examples/s]

In [16]:
# Inspect the first generated negative example.
neg_ds[0]

{'chatml': [{'content': 'A user is talking to their helpful AI Assistant that can help them accomplish different tasks. In order to do so, the AI has access to various functions that it can call as described below.\n\nIn order to call a function, AI just needs to specify the name of the function to call and its arguments as a valid JSON string. If a function with the given name is not found, please respond with an error (code: not_found) json in the function call.',
   'name': 'situation',
   'role': 'system'},
  {'content': 'Available functions and their signatures as JSON schema:\n\n```\n{\n    "name": "generate_anagram",\n    "description": "Generate an anagram of a given word",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "word": {\n                "type": "string",\n                "description": "The word to generate an anagram of"\n            }\n        },\n        "required": [\n            "word"\n        ]\n    }\n}\n```',
   'name': 

In [17]:
# Upload the "function not found" negatives to the "diwank/glaive-neg-chatml" repo (private).
neg_ds.push_to_hub("diwank/glaive-neg-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/547 [00:00<?, ?B/s]

In [18]:
def intermingle_negative(row):
    """Turn a row into a "no suitable function" negative example.

    The row's function list is replaced with the list of another row (from the
    module-level `ds_names`) that does not declare the function this row calls
    first. The conversation is cut right after the first user message and a
    `function_call` message holding a `need_function` error JSON is appended.
    The first message's content gets an extra sentence describing the
    `need_function` convention.

    Args:
        row: Mapping with a "chatml" list of message dicts.

    Returns:
        `dict(chatml=[...])` with the new conversation. The input row is not
        modified.

    Raises:
        ValueError: If the row has no parseable function call, no user
            message, or no row in `ds_names` lacks the called function.
    """
    # Work on copies so the edits below do not leak into the caller's row.
    chatml = [dict(message) for message in row["chatml"]]

    # find first function call
    try:
        first_call = next(k for k in chatml if k["role"] == "function_call")
        ffc_name = json.loads(first_call["content"])["name"]
    except Exception as exc:
        # Fail with a clear error instead of dropping into a debugger, which
        # would hang any non-interactive run.
        raise ValueError("row has no function_call message with a parseable name") from exc

    # Find another row that doesnt have this
    # NOTE(review): this always takes the FIRST matching row of `ds_names`, so
    # most outputs receive the same replacement function list -- confirm that
    # is intended.
    other_row = next((k for k in ds_names if ffc_name not in k["names"]), None)
    if other_row is None:
        raise ValueError(f"every row in ds_names declares `{ffc_name}`")

    # Replace functions
    chatml[1]["content"] = other_row["chatml"][1]["content"]

    # New chatml: keep everything up to and including the first user message
    i_user = next((i for i, k in enumerate(chatml) if k["role"] == "user"), None)
    if i_user is None:
        raise ValueError("row has no user message")
    chatml = chatml[:i_user + 1]

    # Add neg declaration
    chatml[0]["content"] += " If no appropriate function is available to carry out the user's request, please respond with an error (code: need_function) json in the function call."

    # Add neg response
    neg_function_content = dict(error="No function available that can help me perform this.", error_code="need_function")

    chatml.append(dict(
        role="function_call",
        content=json.dumps(neg_function_content),
    ))

    return dict(chatml=chatml)

In [19]:
# Get one tenth dataset for neg neg
# Get one tenth dataset for neg neg.
# Fixed seed so the same tenth is selected on every run.
tenth_ds = function_dataset.shuffle(seed=42).shard(num_shards=10, index=0)

neg_neg_dataset = tenth_ds.map(intermingle_negative)

Map:   0%|          | 0/4177 [00:00<?, ? examples/s]

In [20]:
# Inspect the first "no suitable function" negative example.
neg_neg_dataset[0]

{'chatml': [{'content': "A user is talking to their helpful AI Assistant that can help them accomplish different tasks. In order to do so, the AI has access to various functions that it can call as described below.\n\nIn order to call a function, AI just needs to specify the name of the function to call and its arguments as a valid JSON string. If no appropriate function is available to carry out the user's request, please respond with an error (code: need_function) json in the function call.",
   'name': 'situation',
   'role': 'system'},
  {'content': 'Available functions and their signatures as JSON schema:\n\n```\n{\n    "name": "generate_anagram",\n    "description": "Generate an anagram of a given word",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "word": {\n                "type": "string",\n                "description": "The word to generate an anagram of"\n            }\n        },\n        "required": [\n            "word"\n        ]

In [21]:
# Upload the "no suitable function" negatives. This must be `neg_neg_dataset`;
# `neg_ds` holds the "function not found" negatives that go to a different repo.
neg_neg_dataset.push_to_hub("diwank/glaive-neg-neg-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/551 [00:00<?, ?B/s]