In [0]:

# Load the section metadatas
import json
import pandas as pd

with open("/Volumes/main/casml/raw_data/book-metadata.json", "r") as f:
    book_metadata = json.load(f)

# Map page-nums to sections
def map_page_to_section(metadata):
    page_map = {}
    for section, sec_data in metadata.items():
        sec_start = sec_data.get("page_start")
        sec_end = sec_data.get("page_end")
        subsections = sec_data.get("subsections", {})
        for page in range(sec_start, sec_end + 1):
            page_map[page] = {"section": section, "subsection": None}
        for subsec, subsec_data in subsections.items():
            sub_start = subsec_data.get("page_start")
            sub_end = subsec_data.get("page_end")
            for page in range(sub_start, sub_end + 1):
                page_map[page] = {"section": section, "subsection": subsec}
    return page_map

page_to_section = map_page_to_section(book_metadata)

# Convert page_to_section dict to a DataFrame
page_section_df = pd.DataFrame([
    {"page_num": page, "section": info["section"], "subsection": info["subsection"]}
    for page, info in page_to_section.items()
])

# Create Spark DataFrame
page_section_sdf = spark.createDataFrame(page_section_df)

# Create a temporary SQL table
page_section_sdf.createOrReplaceTempView("pag_to_subsection_temp")



In [0]:
%sql
USE main.casml;
CREATE TABLE IF NOT EXISTS knowledge_base (
  id BIGINT GENERATED ALWAYS AS IDENTITY,
  page_num INT,
  content STRING,
  subsection STRING
)
TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
%sql
INSERT OVERWRITE TABLE knowledge_base (page_num, content)
SELECT
first_value(e.value:bbox[0]:page_id) AS page_num,
string_agg(e.value:content) AS content
FROM (
  SELECT ai_parse_document(content) AS parsed_document, path
  FROM READ_FILES('/Volumes/main/casml/raw_data/CoT.pdf', format => 'binaryFile')
) AS f,
LATERAL variant_explode(f.parsed_document:document:elements) AS e
GROUP BY e.value:bbox[0]:page_id::int, f.path


num_affected_rows,num_inserted_rows
43,43


In [0]:
%sql
MERGE INTO knowledge_base AS kb
USING pag_to_subsection_temp AS pts
ON kb.page_num = pts.page_num
WHEN MATCHED THEN
  UPDATE SET kb.subsection = pts.subsection;

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
36,36,0,0


In [0]:
%sql
SELECT * from knowledge_base

id,page_num,content,subsection,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Unnamed: 0_level_1,Commonsense,Commonsense,Commonsense,Symbolic,Symbolic,Symbolic,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,,,,,,,,,,,,
9,0,"Chain-of-Thought Prompting Elicits Reasoning in Large Language ModelsJason Wei Xuezhi Wang Dale Schuurmans Maarten BosmaBrian Ichter Fei Xia Ed H. Chi Quoc V. Le Denny ZhouY21 1905-6 [CS.5]Google Research, Brain Team {jasonwei,dennyzhou}@google.comAbstractWe explore how generating a chain of thought—a series of intermediate reasoning steps—significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain-of-thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting.Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a PaLM 540B with just eight chain-of-thought exemplars achieves state-of-the-art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.Figure 1: Chain-of-thought prompting enables large language models to tackle complex arithmetic, commonsense, and symbolic reasoning tasks. Chain-of-thought reasoning processes are highlighted.36th Conference on Neural Information Processing Systems (NeurIPS 2022).",,,,,,,,,,
11,4,"Second, chain-of-thought prompting has larger performance gains for more-complicated problems. For instance, for GSM8K (the dataset with the lowest baseline performance), performance more than doubled for the largest GPT and PaLM models. On the other hand, for Sin-gleOp, the easiest subset of MAWPS which only requires a single step to solve, performance improvements were either negative or very small (see Appendix Table 3).Third, chain-of-thought prompting via GPT-3 175B and PaLM 540B compares favorably to prior state of the art, which typically finetunes a task-specific model on a labeled training dataset. Figure 4 shows how PaLM 540B uses chain-of-thought prompting to achieve new state of the art on GSM8K, SVAMP, and MAWPS (though note that standard prompting already passed the prior best for SVAMP). On the other two datasets, AQUA and ASDiv, PaLM with chain-of-thought prompting reaches within 2% of the state of the art (Appendix Table 2).To better understand why chain-of-thought prompting works, we manually examined model-generated chains of thought by LaMDA 137B for GSM8K. Of 50 random examples where the model returned the correct final answer, all of the generated chains of thought were also logically and mathematically correct except two that coincidentally arrived at the correct answer (see Appendix D.1, and Table 8 for examples of correct model-generated chains of thought). We also randomly examined 50 random samples for which the model gave the wrong answer. The summary of this analysis is that 46% of the chains of thought were almost correct, barring minor mistakes (calculator error, symbol mapping error, or one reasoning step missing), and the errors in semantic understanding or coherence why scaling improves chain-of-thought reasoning made by PaLM 62B and whether those errors is that scaling PaLM to 540B fixes a large portion3.3 Ablation StudyThe observed benefits of using chain-of-thought prompting raises the natural question of whether the same performance improvements can be conferred via other types of prompting. Figure 5 shows an ablation study with three variations of chain of thought described below.Equation only. One reason for why chain-of-thought prompting might help is that it produces the mathematical equation to be evaluated, and so we test a variation where the model is prompted to output only a mathematical equation before giving the answer. Figure 5 shows that equation only prompting does not help much for GSM8K, which implies that the semantics of the questions in GSM8K are too challenging to directly translate into an equation without the natural language reasoning steps in chain of thought. For datasets of one-step or two-step problems, however, we find that equation only prompting does improve performance, since the equation can be easily derived from the question (see Appendix Table 6).Model scale (# parameters in billions)Figure 4: Chain-of-thought prompting enables large language models to solve challenging math problems. Notably, chain-of-thought reasoning is an emergent ability of increasing model scale. Prior best numbers are from Cobbe et al. (2021) for GSM8K, Jie et al. (2022) for SVAMP, and Lan et al. (2021) for MAWPS.at the other 54% of the chains of thought had major errors (see Appendix D.2). To provide a small insight into missing ability, we performed a similar analysis of errors that were fixed by scaling to PaLM 540B. The summary5",,,,,,,,,,
23,2,"language models can generate chains of thought if demonstrations of chain-of-thought reasoning are provided in the exemplars for few-shot prompting.Figure 1 shows an example of a model producing a chain of thought to solve a math word problem that it would have otherwise gotten incorrect. The chain of thought in this case resembles a solution and can interpreted as one, but we still opt to call it a chain of thought to better capture the idea that it mimics a step-by-step thought process for arriving at the answer (and also, solutions/explanations typically come after the final answer (Narang et al., 2020; Wiegreffe et al., 2022; Lampinen et al., 2022, inter alia)).Chain-of-thought prompting has several attractive properties as an approach for facilitating reasoning in language models.1. First, chain of thought, in principle, allows models to decompose multi-step problems into intermediate steps, which means that additional computation can be allocated to problems that require more reasoning steps.2. Second, a chain of thought provides an interpretable window into the behavior of the model, suggesting how it might have arrived at a particular answer and providing opportunities to debug where the reasoning path went wrong (although fully characterizing a model's computations that support an answer remains an open question).3. Third, chain-of-thought reasoning can be used for tasks such as math word problems, commonsense reasoning, and symbolic manipulation, and is potentially applicable (at least in principle) to any task that humans can solve via language.4. Finally, chain-of-thought reasoning can be readily elicited in sufficiently large off-the-shelf language models simply by including examples of chain of thought sequences into the exemplars of few-shot prompting.In empirical experiments, we will observe the utility of chain-of-thought prompting for arithmetic reasoning (Section 3), commonsense reasoning (Section 4), and symbolic reasoning (Section 5).3 Arithmetic ReasoningWe begin by considering math word problems of the form in Figure 1, which measure the arithmetic reasoning ability of language models. Though simple for humans, arithmetic reasoning is a task where language models often struggle (Hendrycks et al., 2021; Patel et al., 2021, inter alia). Strikingly, chain-of-thought prompting when used with the 540B parameter language model performs comparably with task-specific finetuned models on several tasks, even achieving new state of the art on the challenging GSM8K benchmark (Cobbe et al., 2021).3.1 Experimental SetupWe explore chain-of-thought prompting for various language models on multiple benchmarks.Benchmarks. We consider the following five math word problem benchmarks: (1) the GSM8K benchmark of math word problems (Cobbe et al., 2021), (2) the SVAMP dataset of math word problems with varying structures (Patel et al., 2021), (3) the ASDiv dataset of diverse math word problems (Miao et al., 2020), (4) the AQuA dataset of algebraic word problems, and (5) the MAWPS benchmark (Koncel-Kedziorski et al., 2016). Example problems are given in Appendix Table 12.Standard prompting. For the baseline, we consider standard few-shot prompting, popularized by Brown et al. (2020), in which a language model is given in-context exemplars of input-output pairs before outputting a prediction for a test-time example. Exemplars are formatted as questions and answers. The model gives the answer directly, as shown in Figure 1 (left).Chain-of-thought prompting. Our proposed approach is to augment each exemplar in few-shot prompting with a chain of thought for an associated answer, as illustrated in Figure 1 (right). As most of the datasets only have an evaluation split, we manually composed a set of eight few-shot exemplars with chains of thought for prompting—Figure 1 (right) shows one chain of thought exemplar, and the full set of exemplars is given in Appendix Table 20. (These particular exemplars did not undergo prompt-engineering; robustness is studied in Section 3.4 and Appendix A.2.) To investigate whether chain-of-thought prompting in this form can successfully elicit successful reasoning across a range of3",,,,,,,,,,
28,1,"1 IntroductionThe NLP landscape has recently been revolutionized by language models (Peters et al., 2018; Devlin et al., 2019; Brown et al., 2020, inter alia). Scaling up the size of lan- guage models has been shown to confer a range of benefits, such as improved performance and sample efficiency (Ka- plan et al., 2020; Brown et al., 2020, inter alia). However, scaling up model size alone has not proved sufficient for achieving high performance on challenging tasks such as arithmetic, commonsense, and symbolic reasoning (Rae et al., 2021).This work explores how the reasoning ability of large language models can be unlocked by a simple method motivated by two ideas. First, techniques for arithmetic reasoning can benefit from generating natural language rationales that lead to the final answer. Prior work has given models the ability to generate natural language intermediate steps by training from scratch (Ling et al., 2017) or finetuning a pretrained model (Cobb et al., 2021), in addition to neuro-symbolic methods that use formal lan-guages instead of natural language (Roy and Roth, 2015; Chiang and Chen, 2019; Amini et al., 2019; Chen et al., 2019). Second, large language models offer the exciting prospect of in-context few-shot learning via prompting. language model checkpoint for each new task, one can input–output exemplars demonstrating the task. Remarkable simple question-answering tasksBoth of the above ideas, however, have key limitations. For rationale-augmented training and finetuning methods, it is costly to create a large set of high quality rationales, which is much more complicated than simple input-output pairs used in normal machine learning. For the traditional few-shot prompting method used in Brown et al. (2020), it works poorly on tasks that require reasoning abilities, and often does not improve substantially with increasing language model scale (Rae et al., 2021). In this paper, we combine the strengths of these two ideas in a way that avoids their limitations Specifically, we explore the ability of language models to perform few-shot prompting for reasoning tasks, given a prompt that consists of triples: (input, chain of thought, output). A chain of thought isWe present empirical evaluations on arithmetic, commonsense, and symbolic reasoning benchmarks, showing that chain-of-thought prompting outperforms standard prompting, sometimes to a striking degree. Figure 2 illustrates one such result—on the GSM8K benchmark of math word problems (Cobb et al., 2021), chain-of-thought prompting with PaLM 540B outperforms standard prompting by a large margin and achieves new state-of-the-art performance. A prompting only approach is important because it does not require a large training dataset and because a single model checkpoint can perform many tasks without loss of generality. This work underscores how large language models can learn via a few examples with natural language data about the task (c.f. automatically learning the patterns underlying inputs and outputs via a large training dataset).2 Chain-of-Thought PromptingConsider one's own thought process when solving a complicated reasoning task such as a multi-step math word problem. It is typical to decompose the problem into intermediate steps and solve each before giving the final answer: “After Jane gives 2 flowers to her mom she has 10 ... then after she gives 3 to her dad she will have 7 ... so the answer is 7.” The goal of this paper is to endow language models with the ability to generate a similar chain of thought—a coherent series of intermediate reasoning steps that lead to the final answer for a problem.Finetuned GPT-3 175EPrior besPaLM 540B: standard promptingPaLM 540B: chain-of-thought promptingFigure 2: PaLM 540B uses chain-ofthought prompting to achieve new state-of-the-art performance on the GSM8K benchmark of math word problems. Finetuned GPT-3 and prior best are from Cobbe et al. (2021).2",,,,,,,,,,
34,6,"source (examples in this dataset already included reasoning steps like a chain of thought). 2 Figure 6 shows that these prompts performed comparably with our manually written exemplars, also substantially outperforming standard prompting.In addition to robustness to annotators, independently-written chains of thought, different exemplar, and various language models, we also find that chain-of-thought prompting for arithmetic reasoning is robust to different exemplar orders and varying numbers of exemplars (see Appendix A.2).4 Commonsense ReasoningAlthough chain of thought is particularly suitable for math word problems, the language-based nature of chain of thought actually makes it applicable to a broad class of commonsense reasoning problems, which involve reasoning about physical and human interactions under the presumption of general background knowledge. Commonsense reasoning is key for interacting with the world and is still beyond the reach of current natural language understanding systems (Talmor et al., 2021).Benchmarks. We consider five datasets covering a diverse range of commonsense reasoning types. The popular CSQA (Talmor et al., 2019) asks commonsense questions about the world involving complex semantics that often require prior knowledge. StrategyQA (Geva et al., 2021) requires models to infer a multi-hop strategy to answer questions. We choose two specialized evaluation sets from the BIG-bench effort (BIG-bench collaboration, 2021); Date Understanding, which involves inferring a date from a given context, and Sports Understanding, which involves determining whether a sentence relating to sports is plausible or implausible. Finally, the SayCan dataset (Ahn et al., 2022) involves mapping a natural language instruction to a sequence of robot actions from a discrete set. Figure 3 shows examples with chain of thought annotations for all datasets.Prompts. We follow the same experimental setup as the prior section. For CSQA and StrategyQA, we randomly selected examples from the training set and manually composed chains of thought for them to use as few-shot exemplars. The two BIG-bench tasks do not have training sets, so we selected the first ten examples as exemplars in the evaluation set as few-shot exemplars and report numbers on the rest of the evaluation set. For SayCan, we use six examples from the training set used in Ahn et al (2022) and also manually composed chains of thought.Results. Figure 7 highlights these results for PaLM (full results for LaMDA, GPT-3, and different model scales are shown in Table 4). For all tasks, scaling up model size improved the performance of standard prompting; chain-of-thought prompting led to further gains, with improvements appearing to be largest for PaLM 540B. With chain-of-thought prompting, PaLM 540B achieved strong performance relative to baselines, outperforming the prior state of the art on StrategyQA (75.6% vs 69.4%) and outperforming an unaided sports enthusiast on sports understanding (95.4% vs 84%). These results demonstrate that chain-of-thought prompting can also improve performance on tasks requiring a range of commonsense reasoning abilities (though note that gain was minimal on CSQA)Figure 7: Chain-of-thought prompting also improves the commonsense reasoning abilities of language models. The language model shown here is PaLM. Prior best numbers are from the leaderboards of CSQA (Talmor et al., 2019) and StrategyQA (Geva et al., 2021) (single-model only, as of May 5, 2022). Additional results using various sizes of LaMDA, GPT-3, and PaLM are shown in Table 4.2We sample examples < 60 tokens to fit into our input context window, and also limit the examples to < 2 steps to solve for a fair comparison with the eight exemplars that we composed.",,,,,,,,,,
40,3,"Figure 3: Examples of (input, chain of thought, output) triples for arithmetic, commonsense, and symbolic reasoning benchmarks. Chains of thought are highlighted. Full prompts in Appendix G.math word problems, we used this single set of eight chain of thought exemplars for all benchmarks except AQUA, which is multiple choice instead of free response. For AQUA, we used four exemplars and solutions from the training set, as given in Appendix Table 21.Language models. We evaluate five large language models. The first is GPT-3 (Brown et al., 2020), for which we use text-ada-001, text-babbage-001, text-curie-001, and text-davinci-002, which presumably correspond to InstructGPT models of 350M, 1.3B, 6.7B, and 175B parameters (Ouyang et al., 2022). The second is LaMDA (Thoppilan et al., 2022), which has models of 422M, 2B, 8B, 68B, and 137B parameters. The third is PaLM, which has models of 8B, 62B, and 540B parameters. The fourth is UL2 20B (Tay et al., 2022), and the fifth is Codex (Chen et al., 2021, code-davinci-002 in the OpenAI API). We sample from the models via greedy decoding (though follow-up work shows chain-of-thought prompting can be improved by taking the majority final answer over many sampled generations (Wang et al., 2022a)). For LaMDA, we report averaged results over five random seeds, where each seed had a different randomly shuffled order of exemplars. As LaMDA experiments did not show large variance among different seeds, to save compute we report results for a single exemplar order for all other models.3.2 ResultsThe strongest results of chain-of-thought prompting are summarized in Figure 4, with all experimental outputs for each model collection, model size, and benchmark shown in Table 2 in the Appendix. There are three key takeaways. First, Figure 4 shows that chain-of-thought prompting is an emergent ability of model scale (Wei et al., 2022b). That is, chain-of-thought prompting does not positively impact performance for small models, and only yields performance gains when used with models of $\sim$100B parameters. We qualitatively found that models of smaller scale produced fluent but illogical chains of thought, leading to lower performance than standard prompting.4",,,,,,,,,,
41,5,"Variable compute only. Another intuition is that chain of thought allows the model to spend more computation (i.e., intermediate tokens) on harder problems. To isolate the effect of variable computation from chain-of-thought reasoning, we test a configuration where the model is prompted to output a only sequence of dots (...) equal to the number of characters in the equation needed to solve the problem. This variant performs about the same as the baseline, which suggests that variable computation by itself is not the reason for the success of chain-of-thought prompting, and that there appears to be utility from expressing intermediate steps via natural language.Chain of thought after answer. Another potential benefit of chain-of-thought prompting could simply be that such prompts allow the model to better access relevant knowledge acquired during pretraining. Therefore, we test an alternative configuration where the chain of thought prompt is only given after the answer, isolating whether the model actually depends on the produced chain of thought to give the final answer. This variant performs about the same as the baseline, which suggests that the sequential reasoning embodied in the chain of thought is useful for reasons beyond just activating knowledge.3.4 Robustness of Chain of ThoughtSensitivity to exemplars is a key consideration of prompting approaches—for instance, varying the permutation of few-shot exemplars can cause the accuracy of GPT-3 on SST-2 to range from near chance (54.3%) to near state of the art (93.4%) (Zhao et al., 2021). In this final subsection, we evaluate robustness to chains of thought written by different annotators. In addition to the results above, which used chains of thought written by an Annotator A, two other co-authors of this paper (Annotators B and C) independently wrote chains of thought for the same few-shot exemplars (shown in Appendix H). Annotator A also wrote another chain of thought that was more concise than the original, following the style of solutions given in Cobbe et al. (2021).1Figure 6 shows these results for LaMDA 137B on GSM8K and MAWPS (ablation results for other datasets are given in Appendix Table 6 / Table 7). Although there is variance among different chain of thought annotations, as would be expected when using exemplar-based prompting (Le Scao and Rush, 2021; Reynolds and McDonell, 2021; Zhao et al., 2021), all sets of chain of thought prompts outperform the standard baseline by a large margin. This result implies that successful use of chain of thought does not depend on a particular linguistic style.To confirm that successful chain-of-thought prompting works for other sets of exemplars, we also run experiments with three sets of eight exemplars randomly sampled fromFor instance, whereas original chain of thought uses several short sentences ( “There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29” ), the concise chain of thought would read “5 * 4 = 20 new computers were added. So there are 9 + 20 = 29 new computers in the server room now”.Standard promptingEquation onlyVariable compute onlyReasoning after answerFigure 5: Ablation study for dif- ferent variations of prompting us- ing LaMDA 137B and PaLM 540B Results for other datasets are given in Appendix Table 6 and Table 7.Figure 6: Chain-of-thought prompting has variance for different prompt examples (as expected) but outperforms standard prompting for various annotators as well as for different exemplars.6",,,,,,,,,,
43,29,"E Additional DetailsVersion ControlV5 → V6. Fixed minor typo in Figure 3.V4 → V5. Added Codex and UL2 results. Small changes to writing and style of paper.V3 → V4. Fixed typo in Figure 3 and added a couple citations.V2 → V3. Added GPT-3 results. Added SVAMP and AQUA eval datasets for math. Added SayCan eval for commonsense. Added Extended Related Work section (Appendix C). Added ablations for Commonsense and Symbolic Reasoning (Table 7). Added FAQ section (Appendix A). Added raw results in Appendix B.V1 → V2. Added PaLM results (V1 only had LaMDA).E.1 Reproducibility StatementAs our results make use of two sets of large language models that is not publicly available, we take the following actions to facilitate reproducibility. First, we provide the exact input prompts for all tasks in Table 20–Table 27 in Appendix G (and emphasize that we do not perform any finetuning and only apply prompting to off-the-shelf language models). Second, we conduct experiments using the publicly available GPT-3 API for four model scales text-ada-001, text-babbage-001, text-curie-001, text-davinci-002). Finally, we make exact inputs, targets, and predictions for LaMDA 137B for each task available as a zip file in the supplementary material.E.2 Computational ResourcesFor all three language models we evaluated, we did prompting-based inference only. No finetuning was done for this paper. For inference on LaMDA 137B we use TPU v3 (8x8 configuration, 64 chips / 128 cores), and for inference on PaLM 540B we use TPU v4 (4x4x12 configuration, 192 chips / 384 cores). GPT-3 experiments were done using the public API.5E.3 Dataset Details and LicensesWe list the details and licenses for all arithmetic and commonsense datasets used in this paper. The symbolic reasoning datasets were created synthetically, as described in Section 4.Arithmetic reasoningMath Word Problem Repository (Koncel-Kedziorski et al., 2016); AddSub (Hosseini et al., 2014); https://www.cs.washington.edu/nlp/arithmetic; MultiArith (Roy and Roth, 2015), license: CC BY 4.0.• ASDiv (Miao et al., 2020): https://github.com/chaochun/nlu-asdiv-dataset.• AQuA (Ling et al., 2017): https://github.com/deepmind/AQuA, license: https:// github.com/deepmind/AQuA/blob/master/LICENSE.• GSM8K (Cobbe et al., 2021): https://github.com/openai/grade-school-math, MIT license: https://github.com/openai/grade-school-math/blob/master/ LICENSE.• SVAMP (Patel et al., 2021): https://github.com/arkilpatel/SVAMP , MIT license: https://github.com/arkilpatel/SVAMP/blob/main/LICENSECommonsense reasoning• CSQA (Talmor et al., 2019): https://www.tau-nlp.org/commonsenseqa, https:// github.com/jonathanherzig/commonsenseqa.5 https://beta.openai.com/docs/api-reference/making-requests30",Careers in Psychology,,,,,,,,,
42,18,"multi-step reasoning, (2) a large language model is used, and (3) the scaling curve is relatively flat. Conversely, the benefits are smaller when one or more of these conditions are not met.These intuitions are perhaps supported by the arithmetic reasoning results. The performance gain from chain-of-thought prompting is largest for PaLM 540B on GSM8K (challenging multi-step problems, flat scaling curve), which meets these conditions. The performance gain is small for the subsets of MAWPS that only require one or two steps (SingleOP, SingleEq, and AddSub), for which PaLM 540B already achieves performance of 90% or higher (and it is also generally true that there is less headroom for improvement when performance is already strong).Although in this paper we focused on multi-step reasoning tasks (arithmetic, commonsense, and symbolic), chain-of-thought prompting can potentially be applied to any task for which humans use a ""chain of thought"" to solve (at least in principle). We leave the empirical evaluation of chain-of-thought prompting on such diverse tasks (e.g., machine translation, etc.) to future work.A.4 Why is prompting with the equation only not enough for some arithmetic reasoning datasets?Prompting with the equation only as an intermediate step does help on many datasets, especially when the datasets only require a few reasoning steps (SVAMP, ASDiv, MAWPS). For GSM8K, however, using the equation only did not improve performance substantially. Based on qualitative analysis, we believe that these questions are too semantically challenging for the model to directly translate them into a math equation. Consider this example from LaMDA 137B:QUESTION: Mike plays ping pong for 40 minutes. In the first 20 minutes, he scores 4 points. In the second 20 minutes, he scores 25% more points. How many total points did he score?LOCATION ONLY (WRONG ANSWER): (4 + 20 * 0.25) = 6. The answer is 6.CHAIN OF THOUGHT (CORRECT): Mike played ping pong for 40 minutes. In the first 20 minutes, he scored 4 points. In the second 20 minutes, he scored 25% more points. So he scored 25% more in the second 20 minutes. 4 x 1.25 = 5. So he scored 5 points in the second 20 minutes. So he scored 9 points in total. The answer is 9.It is hard for the model to directly translate all of the semantics into a single equation, but chain of thought allows it to better reason about each part of the question via intermediate steps in natural language.19",Contemporary Psychology,,,,,,,,,

Dataset,N,Example problem
GSM8K,1319,"Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?"
SVAMP,1000,Each pack of dvds costs 76 dollars. If there is a discount of 25 dollars on each pack. How much do you have to pay to buy each pack?
ASDiv,2096,Ellen has six more balls than Marin. Marin has nine balls. How many balls does Ellen have?
AQuA,254,"A car is being driven, in a straight line and at a uniform speed, towards the base of a vertical tower. The top of the tower is observed from the car and, in the process, it takes 10 minutes for the angle of elevation to change from 45° to 60°. After how much more time will this car reach the base of the tower? Answer Choices: (a) 5√3 + 1 (b) 6√3 + √2 (c) 7√3 - 1 (d) 8√3 - 2 (e) None of these"
MAWPS: SingleOp,562,"If there are 7 bottle caps in a box and Linda puts 7 more bottle caps inside, how many bottle caps are in the box?"
MAWPS: SingleEq,508,Benny bought a soft drink for 2 dollars and 5 candy bars. He spent a total of 27 dollars. How much did each candy bar cost?
MAWPS: AddSub,395,There were 6 roses in the vase. Mary cut some roses from her flower garden. There are now 16 roses in the vase. How many roses did she cut?
MAWPS: MultiArith,600,"The school cafeteria ordered 42 red apples and 7 green apples for students lunches. But, if only 9 students wanted fruit, how many extra did the cafeteria end up with?"

0,1
PROMPT For CSQA,
Q: What do people use to absorb extra ink from a fountain pen? Answer Choices: (a) shirt pocket (b) calligrapher's hand (c) inkwell (d) desk drawer (c) blotter,
"A: The answer must be an item that can absorb ink. Of the above choices, only blotters are used to absorb ink. So the answer is (c).",
Q: What home entertainment equipment requires cable?,Q: What home entertainment equipment requires cable?
Answer Choices: (a) radio shack (b) substation (c) television (d) cabinet,
"A: The answer must require cable. Of the above choices, only television requires cable. So the answer is (c).",
"Q: The fox walked from the city into the forest, what was it looking for? Answer Choices: (a) pretty flowers (b) hen house (c) natural habitat (d) storybook",
"A: The answer must be something in the forest. Of the above choices, only natural habitat is in the forest. So the answer is (b).",
Q: Sammy wanted to go to where the people were. Where might he go? Answer Choices: (a) populated areas (b) race track (c) desert (d) apartment (e) roadblock,
"A: The answer must be a place with a lot of people. Of the above choices, only populated areas have a lot of people. So the answer is (a).",

Unnamed: 0,GSM8K,SVAMP,ASDiv,MAWPS
Standard prompting,6.5 ±0.4,29.5 ±0.6,40.1 ±0.6,43.2 ±0.9
Chain of thought prompting,14.3 ±0.4,36.7 ±0.4,46.6 ±0.7,57.9 ±1.5
Ablations,Ablations,Ablations,Ablations,Ablations
· equation only,5.4 ±0.2,35.1 ±0.4,45.9 ±0.6,50.1 ±1.0
· variable compute only,6.4 ±0.3,28.0 ±0.6,39.4 ±0.4,41.3 ±1.1
· reasoning after answer,6.1 ±0.4,30.7 ±0.9,38.6 ±0.6,43.6 ±1.0
Robustness,Robustness,Robustness,Robustness,Robustness
· different annotator (B),15.5 ±0.6,35.2 ±0.4,46.5 ±0.4,58.2 ±1.0
· different annotator (C),17.6 ±1.0,37.5 ±2.0,48.7 ±0.7,60.1 ±2.0
· intentionally concise style,11.1 ±0.3,38.7 ±0.8,48.0 ±0.3,59.6 ±0.7

Unnamed: 0,Commonsense,Commonsense.1,Commonsense.2,Symbolic,Symbolic.1,Symbolic.2
,,,,,,
,Date,Sports,SayCan,Concat,Coin,
Standard prompting,21.5 ±0.6,59.5 ±3.0,80.8 ±1.8,5.8 ±0.6,49.0 ±2.1,
Chain of thought prompting,26.8 ±2.1,85.8 ±1.8,91.7 ±1.4,77.5 ±3.8,99.6 ±0.3,
Ablations,,,,,,
· variable compute only,21.3 ±0.7,61.6 ±2.2,74.2 ±2.3,7.2 ±1.6,50.7 ±0.7,
· reasoning after answer,20.9 ±1.0,63.0 ±2.0,83.3 ±0.6,0.0 ±0.0,50.2 ±0.5,
Robustness,,,,,,
· different annotator (B),27.4 ±1.7,75.4 ±2.7,88.3 ±1.4,76.0 ±1.9,77.5 ±7.9,
· different annotator (C),25.5 ±2.5,81.1 ±3.6,85.0 ±1.8,68.1 ±2.2,71.4 ±11.1,

0
PROMPT FOR STRATEGYQA
Q: Do hamsters provide food for any animals?
"A: Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals. So the answer is yes."
Q: Could Brooke Shields succeed at University of Pennsylvania?
"A: Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania. So the answer is yes."
Q: Yes or no: Hydrogen's atomic number squared exceeds number of Spice Girls?
"A: Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. Thus, Hydrogen's atomic number squared is less than 5. So the answer is no."
Q: Yes or no: Is it common to see frost during some college commencements?
"A: College commencement ceremonies can happen in December, May, and June. December is in the winter, so there can be frost. Thus, there could be frost at some commencements. So the answer is yes."
Q: Yes or no: Could a llama birth twice during War in Vietnam (1945–46)?

0,1
PROMPT FOR DATE UNDERSTANDING,
Q: 2015 is coming in 36 hours. What is the date one week from today in MM/DD/YYYY?,
"A: If 2015 is coming in 36 hours, then it is coming in 2 days. 2 days before 01/01/2015 is 12/30/2014, so today is 12/30/2014. So one week from today will be 01/05/2015. So the answer is 01/05/2015.",
"Q: The first day of 2019 is a Tuesday, and today is the first Monday of 2019. What is the date today in MM/DD/YYYY?",
"A: If the first day of 2019 was Tuesday, then 01/01/2019 was a Tuesday. Today is the first monday, would be six days later. So today is 01/07/2019. So the answer is 01/07/2019.",
"Q: The concert was scheduled to be on 06/01/1943, but was delayed by one day to today. What is the date 10 days ago in MM/DD/YYYY?",
"A: One day after 06/01/1943 is 06/02/1943, so today is 06/02/1943. 10 days before today is 05/23/1943. So the answer is 05/23/1943.",
Q: It is 4/19/1969 today. What is the date 24 hours later in MM/DD/YYYY?,
"A: Today is 04/19/1969. 24 hours later is one day after today, which would be 04/20/1969. So the answer is 04/20/1969.",
"Q: Jane thought today is 3/11/2002, but today is in fact Mar 12, which is 1 day later. What is the date 24 hours later in MM/DD/YYYY?",

Unnamed: 0,Prompting,GSM8K,SVAMP,ASDiv,AQuA,MAWPS
Prior best,N/A (finetuning),55a,57.4b,75.3c,37.9d,88.4e
UL2 20B,Standard,4.1,10.1,16.0,20.5,16.6
UL2 20B,Chain of thought,4.4 (+0.3),12.5 (+2.4),16.9 (+0.9),23.6 (+3.1),19.1 (+2.5)
UL2 20B,+ ext. calc,6.9,28.3,34.3,23.6,42.7
LaMDA 137B,Standard,6.5,29.5,40.1,25.5,43.2
LaMDA 137B,Chain of thought,14.3 (+7.8),37.5 (+8.0),46.6 (+6.5),20.6 (-4.9),57.9 (+14.7)
LaMDA 137B,+ ext. calc,17.8,42.1,53.4,20.6,69.3
GPT-3 175B,GPT-3 175B,GPT-3 175B,GPT-3 175B,GPT-3 175B,GPT-3 175B,GPT-3 175B
(text-davinci-002),Standard,15.6,65.7,70.3,24.8,72.7
(text-davinci-002),Chain of thought,46.9 (+31.3),68.9 (+3.2),71.3 (+1.0),35.8 (+11.0),87.1 (+14.4)

0,1
PROMPT FOR SPORTS UNDERSTANDING,
"Q: Is the following sentence plausible? ""Kyle Palmieri was called for slashing.""",
A: Kyle Palmieri is a hockey player. Being called for slashing is part of hockey. So the answer is yes.,
"Q: Is the following sentence plausible? ""Joao Moutinho caught the screen pass in the NFC championship.""",
"A: Joao Moutinho is a soccer player. The NFC championship is part of American football, not soccer. So the answer is no.",
"Q: Is the following sentence plausible? ""Carson Wentz set the pick and roll.""",
"A: Carson Wentz is an American football player. Pick and roll is part of basketball, not football. So the answer is no.",
"Q: Is the following sentence plausible? ""Jonas Valanciunas beat the buzzer.""",
A: Jonas Valanciunas is a basketball player. Beating the buzzer is part of basketball. So the answer is yes.,
"Q: Is the following sentence plausible? ""Jamel Murray was perfect from the line.""",

0
PROMPT FOR AQUA ALGEBRAIC WORD PROBLEMS
Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is?
Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
"A: If 10 is added to each number, then the mean of the numbers also increases by 10. So the new mean would be 50. The answer is (a)."
"Q: If a / b = 3/4 and 8a + 5b = 22, then find the value of a."
Answer Choices: (a) 1/2 (b) 3/2 (c) 5/2 (d) 4/2 (e) 7/2
"A: If a / b = 3/4, then b = 4a / 3. So 8a + 5(4a / 3) = 22. This simplifies to 8a + 20a / 3 = 22, which means 44a / 3 = 22. So a is equal to 3/2. The answer is (b)."
Q: A person is traveling at 20 km/hr and reached his destiny in 2.5 hr then find the distance?
Answer Choices: (a) 53 km (b) 55 km (c) 52 km (d) 60 km (e) 50 km
A: The distance that the person traveled would have been 20 km/hr * 2.5 hrs = 50 km. The answer is (e).

0
PROMPT FOR LAST LETTER CONCATENATION
"Q: Take the last letters of the words in ""Elon Musk"" and concatenate them."
"A: The last letter of ""Elon"" is ""n"". The last letter of ""Musk"" is ""k"". Concatenating them is ""nk"". The answer is nk."
"Q: Take the last letters of the words in ""Larry Page"" and concatenate them."
"A: The last letter of ""Larry"" is ""y"". The last letter of ""Page"" is ""e"". Concatenating them is ""ye"". The answer is ye."
"Q: Take the last letters of the words in ""Sergey Brin"" and concatenate them."
"A: The last letter of ""Sergey"" is ""y"". The last letter of ""Brin"" is ""n"". Concatenating them is ""yn"". The answer is yn."
"Q: Take the last letters of the words in ""Bill Gates"" and concatenate them."
"A: The last letter of ""Bill"" is ""l"". The last letter of ""Gates"" is ""s"". Concatenating them is ""ls"". The answer is ls."

0,1,2,3,4,5,6,7,8,9,10,11,12
Model,,GSM8K standard,CoT,SVAMP standard,CoT,ASDiv standard,AQuA standard,MAWPS standard,CoT,,,
UL2,20B,4.1,4.4,10.1,12.5,16.0,16.9,20.5,23.6,16.6,19.1,
UL2,LaMDA,420M,2.6,0.4,2.5,1.6,3.2,0.8,23.5,8.3,3.2,
GPT,2B,3.6,1.9,3.3,2.4,4.1,3.8,22.9,17.7,3.9,3.1,
GPT,8B,3.2,1.6,4.3,3.4,5.9,5.0,22.8,18.6,5.3,4.8,
GPT,68B,5.7,8.2,13.6,18.8,21.8,23.1,22.3,20.2,21.6,30.6,
GPT,137B,6.5,14.3,29.5,37.5,40.1,46.6,25.5,20.6,43.2,57.9,
GPT,350M,2.2,0.5,1.4,0.8,2.1,0.8,18.1,8.7,2.4,1.1,
Codex,1.3B,2.4,0.5,1.5,1.7,2.6,1.4,12.6,4.3,3.1,1.7,
Codex,6.7B,4.0,2.4,6.1,3.1,8.6,3.6,15.4,13.4,8.8,3.5,

Model,Unnamed: 1_level_0,SingleOp,SingleOp,SingleEq,SingleEq,AddSub,AddSub,MultiArith,MultiArith
Model,Unnamed: 1_level_1,standard,CoT,standard,CoT,standard,CoT,standard,CoT
UL2,20B,24.9,27.2,18.0,20.2,18.5,18.2,5.0,10.7
LaMDA,420M,2.8,1.0,2.4,0.4,1.9,0.7,5.8,1.5
LaMDA,2B,4.6,4.1,2.4,3.3,2.7,3.2,5.8,1.8
LaMDA,8B,8.0,7.0,4.5,4.4,3.4,5.2,5.2,2.4
LaMDA,68B,36.5,40.8,23.9,26.0,17.3,23.2,8.7,32.4
LaMDA,137B,73.2,76.2,48.8,58.7,43.0,51.9,7.6,44.9
GPT,350M,3.2,1.8,2.0,0.2,2.0,1.5,2.3,0.8
GPT,1.3B,5.3,3.0,2.4,1.6,2.3,1.5,2.2,0.5
GPT,6.7B,13.5,3.9,8.7,4.9,8.6,2.5,4.5,2.8
GPT,175B,90.9,88.8,82.7,86.6,83.3,81.3,33.8,91.7

Model,CSQA,CSQA,StrategyQA,StrategyQA,Date,Date,Sports,Sports,SayCan,SayCan,Unnamed: 11_level_0
Model,standard,CoT,standard,CoT,standard,CoT,standard,CoT,standard,CoT,Unnamed: 11_level_1
UL2,20B,34.2,51.4,59.0,53.3,13.5,14.0,57.9,65.3,20.0,41.7
LaMDA,420M,20.1,19.2,46.4,24.9,1.9,1.6,50.0,49.7,7.5,7.5
LaMDA,2B,20.2,19.6,52.6,45.2,8.0,6.8,49.3,57.5,8.3,8.3
LaMDA,8B,19.0,20.3,54.1,46.8,9.5,5.4,50.0,52.1,28.3,33.3
LaMDA,68B,37.0,44.1,59.6,62.2,15.5,18.6,55.2,77.5,35.0,42.5
LaMDA,137B,53.6,57.9,62.4,65.4,21.5,26.8,59.5,85.8,43.3,46.6
GPT,350M,14.7,15.2,20.6,0.9,4.3,0.9,33.8,41.6,12.5,0.8
GPT,1.3B,12.0,19.2,45.8,35.7,4.0,1.4,0.0,26.9,20.8,9.2
GPT,6.7B,19.0,24.0,53.6,50.0,8.9,4.9,0.0,4.4,17.5,35.0
GPT,175B,79.5,73.5,65.9,65.4,43.8,52.1,69.6,82.4,81.7,87.5

Model,Last Letter Concatenation,Last Letter Concatenation,Last Letter Concatenation,Last Letter Concatenation,Last Letter Concatenation,Last Letter Concatenation,Coin Flip (state tracking),Coin Flip (state tracking),Coin Flip (state tracking),Unnamed: 10_level_0
Model,2,2,2,OOD: 3,OOD: 3,2,OOD: 3,OOD: 3,OOD: 4,Unnamed: 10_level_1
Model,standard,CoT,standard.1,CoT standard,CoT standard.1,2,CoT standard,CoT standard.1,OOD: 4,CoT
UL2,UL2,UL2,UL2,UL2,UL2,UL2,UL2,UL2,UL2,
20B,,0.6,18.8,0.0,0.2,0.0,0.0,67.1,51.6,52.2
LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,LaMDA 420M,
2B,,0.3,1.6,0.0,0.0,0.0,0.0,52.9,49.6,50.0
8B,,2.3,6.0,0.0,0.0,0.0,0.0,54.9,55.3,47.4
68B,,1.5,11.5,0.0,0.0,0.0,0.0,52.9,55.5,48.2
137B,,4.4,52.0,0.0,0.8,0.0,2.5,56.2,83.2,50.4
,,5.8,77.5,0.0,34.4,0.0,13.5,49.0,99.6,50.7
PaLM,PaLM,PaLM,PaLM,PaLM,PaLM,PaLM,PaLM,PaLM,PaLM,
8B,,2.6,18.8,0.0,0.0,0.0,0.2,60.0,74.4,47.3


In [0]:
%pip install -U -qqqq mlflow>=3.1.1 langchain langgraph databricks-langchain pydantic databricks-agents unitycatalog-langchain[databricks] uv databricks-feature-engineering==0.12.1
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import time

def endpoint_exists(vsc, vs_endpoint_name):
  try:
    return vs_endpoint_name in [e['name'] for e in vsc.list_endpoints().get('endpoints', [])]
  except Exception as e:
    #Temp fix for potential REQUEST_LIMIT_EXCEEDED issue
    if "REQUEST_LIMIT_EXCEEDED" in str(e):
      print("WARN: couldn't get endpoint status due to REQUEST_LIMIT_EXCEEDED error. The demo will consider it exists")
      return True
    else:
      raise e

def wait_for_vs_endpoint_to_be_ready(vsc, vs_endpoint_name):
  for i in range(180):
    try:
      endpoint = vsc.get_endpoint(vs_endpoint_name)
    except Exception as e:
      #Temp fix for potential REQUEST_LIMIT_EXCEEDED issue
      if "REQUEST_LIMIT_EXCEEDED" in str(e):
        print("WARN: couldn't get endpoint status due to REQUEST_LIMIT_EXCEEDED error. Please manually check your endpoint status")
        return
      else:
        raise e
    status = endpoint.get("endpoint_status", endpoint.get("status"))["state"].upper()
    if "ONLINE" in status:
      return endpoint
    elif "PROVISIONING" in status or i <6:
      if i % 20 == 0: 
        print(f"Waiting for endpoint to be ready, this can take a few min... {endpoint}")
      time.sleep(10)
    else:
      raise Exception(f'''Error with the endpoint {vs_endpoint_name}. - this shouldn't happen: {endpoint}.\n Please delete it and re-run the previous cell: vsc.delete_endpoint("{vs_endpoint_name}")''')
  raise Exception(f"Timeout, your endpoint isn't ready yet: {vsc.get_endpoint(vs_endpoint_name)}")

def index_exists(vsc, endpoint_name, index_full_name):
    try:
        vsc.get_index(endpoint_name, index_full_name).describe()
        return True
    except Exception as e:
        if 'RESOURCE_DOES_NOT_EXIST' not in str(e):
            print(f'Unexpected error describing the index. This could be a permission issue.')
            raise e
    return False
  
def wait_for_index_to_be_ready(vsc, vs_endpoint_name, index_name):
  for i in range(180):
    idx = vsc.get_index(vs_endpoint_name, index_name).describe()
    index_status = idx.get('status', idx.get('index_status', {}))
    status = index_status.get('detailed_state', index_status.get('status', 'UNKNOWN')).upper()
    url = index_status.get('index_url', index_status.get('url', 'UNKNOWN'))
    if "ONLINE" in status:
      return
    if "UNKNOWN" in status:
      print(f"Can't get the status - will assume index is ready {idx} - url: {url}")
      return
    elif "PROVISIONING" in status:
      if i % 40 == 0: print(f"Waiting for index to be ready, this can take a few min... {index_status} - pipeline url:{url}")
      time.sleep(10)
    else:
        raise Exception(f'''Error with the index - this shouldn't happen. DLT pipeline might have been killed.\n Please delete it and re-run the previous cell: vsc.delete_index("{index_name}, {vs_endpoint_name}") \nIndex details: {idx}''')
  raise Exception(f"Timeout, your index isn't ready yet: {vsc.get_index(index_name, vs_endpoint_name)}")

In [0]:
VECTOR_SEARCH_ENDPOINT_NAME = "casml_vs_endpoint"

from databricks.vector_search.client import VectorSearchClient
vsc = VectorSearchClient(disable_notice=True)

if not endpoint_exists(vsc, VECTOR_SEARCH_ENDPOINT_NAME):
    endpoints = vsc.list_endpoints()
    if len(endpoints):
        endpoint_names = [ep['name'] for ep in endpoints['endpoints']]
        for name in endpoint_names:
            vsc.delete_endpoint(name)

    vsc.create_endpoint(name=VECTOR_SEARCH_ENDPOINT_NAME, endpoint_type="STANDARD")

wait_for_vs_endpoint_to_be_ready(vsc, VECTOR_SEARCH_ENDPOINT_NAME)
print(f"Endpoint named {VECTOR_SEARCH_ENDPOINT_NAME} is ready.")

Waiting for endpoint to be ready, this can take a few min... {'name': 'casml_vs_endpoint', 'creator': 'jmendozais@gmail.com', 'creation_timestamp': 1759515491775, 'last_updated_timestamp': 1759515491775, 'endpoint_type': 'STANDARD', 'last_updated_user': 'jmendozais@gmail.com', 'id': '47dc94ca-3f29-465f-9abe-df8a852e75ae', 'endpoint_status': {'state': 'PROVISIONING'}, 'num_indexes': 0}
Waiting for endpoint to be ready, this can take a few min... {'name': 'casml_vs_endpoint', 'creator': 'jmendozais@gmail.com', 'creation_timestamp': 1759515491775, 'last_updated_timestamp': 1759515491775, 'endpoint_type': 'STANDARD', 'last_updated_user': 'jmendozais@gmail.com', 'id': '47dc94ca-3f29-465f-9abe-df8a852e75ae', 'endpoint_status': {'state': 'PROVISIONING'}, 'num_indexes': 0}
Waiting for endpoint to be ready, this can take a few min... {'name': 'casml_vs_endpoint', 'creator': 'jmendozais@gmail.com', 'creation_timestamp': 1759515491775, 'last_updated_timestamp': 1759515491775, 'endpoint_type': 'ST

In [0]:
from databricks.sdk import WorkspaceClient

#The table we'd like to index
catalog = "main"
dbName = "casml"
source_table_fullname = f"{catalog}.{dbName}.knowledge_base"
# Where we want to store our index
vs_index_fullname = f"{catalog}.{dbName}.knowledge_base_vs_index"

if not index_exists(vsc, VECTOR_SEARCH_ENDPOINT_NAME, vs_index_fullname):
  print(f"Creating index {vs_index_fullname} on endpoint {VECTOR_SEARCH_ENDPOINT_NAME}...")
  vsc.create_delta_sync_index(
    endpoint_name=VECTOR_SEARCH_ENDPOINT_NAME,
    index_name=vs_index_fullname,
    source_table_name=source_table_fullname,
    pipeline_type="TRIGGERED",
    primary_key="id",
    embedding_source_column='content', #The column containing our text
    embedding_model_endpoint_name='databricks-gte-large-en' #The embedding endpoint used to create the embeddings
  )
  #Let's wait for the index to be ready and all our embeddings to be created and indexed
  wait_for_index_to_be_ready(vsc, VECTOR_SEARCH_ENDPOINT_NAME, vs_index_fullname)
else:
  #Trigger a sync to update our vs content with the new data saved in the tableoug
  wait_for_index_to_be_ready(vsc, VECTOR_SEARCH_ENDPOINT_NAME, vs_index_fullname)
  vsc.get_index(VECTOR_SEARCH_ENDPOINT_NAME, vs_index_fullname).sync()

print(f"index {vs_index_fullname} on table {source_table_fullname} is ready")

Creating index main.casml.knowledge_base_vs_index on endpoint casml_vs_endpoint...
Waiting for index to be ready, this can take a few min... {'detailed_state': 'PROVISIONING_INDEX', 'message': 'Delta sync Index creation is pending. Check latest status: https://dbc-4f84a5d3-c8b8.cloud.databricks.com/explore/data/main/casml/knowledge_base_vs_index', 'indexed_row_count': 0, 'ready': False, 'index_url': 'dbc-4f84a5d3-c8b8.cloud.databricks.com/api/2.0/vector-search/indexes/main.casml.knowledge_base_vs_index'} - pipeline url:dbc-4f84a5d3-c8b8.cloud.databricks.com/api/2.0/vector-search/indexes/main.casml.knowledge_base_vs_index
index main.casml.knowledge_base_vs_index on table main.casml.knowledge_base is ready


In [0]:
question = "What was the main contribution the 'Chain-of-Thought Prompting Elicits Reasoning in Large Language Models' paper"

results = vsc.get_index(VECTOR_SEARCH_ENDPOINT_NAME, vs_index_fullname).similarity_search(
  query_text=question,
  columns=["id", "content"],
  num_results=1)
  
docs = results.get('result', {}).get('data_array', [])

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


[[9.0,
  'Chain-of-Thought Prompting Elicits Reasoning in Large Language ModelsJason Wei Xuezhi Wang Dale Schuurmans Maarten BosmaBrian Ichter Fei Xia Ed H. Chi Quoc V. Le Denny ZhouY21 1905-6 [CS.5]Google Research, Brain Team\n{jasonwei,dennyzhou}@google.comAbstractWe explore how generating a chain of thought—a series of intermediate reasoning steps—significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain-of-thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting.Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a PaLM 540B with just eight chain-of-thought exemplars achieves state-of-the-art 