This repository has been archived by the owner on Sep 12, 2023. It is now read-only.

✅ Update tests for v0.3
Iacopo Poli committed Jan 19, 2022
1 parent 5552b65 commit f630b0b
Showing 3 changed files with 45 additions and 36 deletions.
31 changes: 17 additions & 14 deletions tests/create.py
@@ -7,25 +7,26 @@
class TestCreateEndpoint(unittest.TestCase):
def test_single_prompt(self):
# check types and single input
- output_keys = {'execution_metadata', 'input_text', 'completions'}
+ output_keys = {'input_text', 'completions'}
creator = lightonmuse.Create("orion-fr")
sentence = "C'est quand même un truc magique, se dit le livreur, que d'avoir toujours"
n_tokens = 16
outputs, cost, rid = creator(sentence, n_tokens=n_tokens, seed=0, return_logprobs=True)
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == 1, f"`len(outputs) = {len(outputs)}` despite single input."
- assert cost == n_tokens, f"`cost={cost}` despite {n_tokens} tokens requested."
+ assert cost['orion-fr@default']['total_tokens_generated'] == n_tokens, \
+     f"`cost={cost['orion-fr@default']['total_tokens_generated']}` despite {n_tokens} tokens requested."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert output_keys == \
outputs[0].keys(), f"Set of keys is different than expected. Expected {output_keys}" \
f"got {outputs[0].keys()} instead."
assert outputs[0]["input_text"] == sentence, f"`text` field in `outputs` does not match the" \
f" input sentence."
f" input sentence."

# check scores logic
completion = outputs[0]["completions"][0]
- score, normalized_score = completion["score"], completion["normalized_score"]
- token_scores = [list(element.values())[0] for element in completion["token_scores"]]
+ score, normalized_score = completion["score"]["logprob"], completion["score"]["normalized_logprob"]
+ token_scores = [list(element.values())[0] for element in completion["score"]["token_logprobs"]]
n_tokens = len(token_scores)
assert len(token_scores) == n_tokens, f"Generated ({len(token_scores)}) but" \
f"requested {n_tokens} tokens."
@@ -46,8 +47,9 @@ def test_multiple_prompts(self):
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == len(sentence_list), f"`len(outputs) = {len(outputs)}` despite " \
f"{len(sentence_list)} prompts."
- assert cost == n_tokens*len(sentence_list), f"`cost={cost}` despite {n_tokens} tokens for " \
-     f"{len(sentence_list)} prompts requested."
+ assert cost['orion-fr@default']['total_tokens_generated'] == n_tokens*len(sentence_list), \
+     f"`cost={cost['orion-fr@default']['total_tokens_generated']}` despite {n_tokens} tokens for " \
+     f"{len(sentence_list)} prompts requested."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert outputs[1]["input_text"] == sentence_list[1], f"`text` field in `outputs` does not " \
f"match the input sentence."
@@ -58,11 +60,11 @@ def test_multiple_outputs(self):
n_tokens, n_completions, n_best = 16, 4, 2
outputs, cost, rid = creator(sentence, n_tokens=n_tokens, seed=0,
n_completions=n_completions, n_best=n_best)
- assert len(outputs[0]["completions"]) == n_best, f"Returned " \
-     f"{len(outputs[0]['completions'])}" \
-     f"completions instead of {n_best}."
- assert cost == n_tokens * n_completions, f"Cost={cost} despite asking for {n_tokens} " \
-     f"tokens for {n_completions} completions."
+ assert len(outputs[0]["completions"]) == n_best, f"Returned {len(outputs[0]['completions'])}" \
+     f"completions instead of {n_best}."
+ assert cost['orion-fr@default']["total_tokens_generated"] == n_tokens * n_completions, \
+     f"Cost={cost['orion-fr@default']['total_tokens_generated']} despite asking for {n_tokens} " \
+     f"tokens for {n_completions} completions."

def test_control(self):
# word bias
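
The `test_multiple_outputs` hunk encodes the v0.3 accounting rule: the API generates `n_completions` candidates, returns only the `n_best` highest-scoring ones, and bills for every generated token. A short sketch under those assumptions:

```python
import lightonmuse

# n_completions candidates are generated and billed; only n_best are returned.
creator = lightonmuse.Create("orion-fr")
outputs, cost, _ = creator("Il était une fois", n_tokens=16, seed=0,
                           n_completions=4, n_best=2)
assert len(outputs[0]["completions"]) == 2  # the n_best survivors
assert cost["orion-fr@default"]["total_tokens_generated"] == 16 * 4
```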
@@ -92,7 +94,8 @@ def test_control(self):
concat_prompt=True)
assert outputs[0]["completions"][0]["output_text"][-2:] == "et", f"Completion does not " \
f"end with stopword"
- assert cost < n_tokens, f"Cost is higher than expected given generation ended at stopword."
+ assert cost["orion-fr@default"]["total_tokens_generated"] < n_tokens, f"Cost is higher than expected given " \
+     f"that generation ended at stopword."

def test_utilities(self):
# check reproducibility with the seed
@@ -107,7 +110,7 @@ def test_utilities(self):

# check tokens scores
outputs, _, _ = creator(sentence, n_tokens=n_tokens, seed=0, return_logprobs=False)
- token_scores = outputs[0]["completions"][0]["token_scores"]
+ token_scores = outputs[0]["completions"][0]['score']["token_logprobs"]
assert token_scores is None, f"Token scores is not None, despite not asking for logprobs."

# check concat_prompt
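
The last hunk also pins down the v0.3 behaviour when log-probabilities are not requested: the nested `token_logprobs` field is still present but set to `None`. A minimal sketch, mirroring the updated assertion:

```python
import lightonmuse

# Without return_logprobs, v0.3 still nests scores but leaves token_logprobs as None.
creator = lightonmuse.Create("orion-fr")
outputs, _, _ = creator("Il était une fois", n_tokens=16, seed=0,
                        return_logprobs=False)
assert outputs[0]["completions"][0]["score"]["token_logprobs"] is None
```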
3 changes: 2 additions & 1 deletion tests/fail.py
@@ -7,7 +7,7 @@

single_input_endpoints = [lightonmuse.Analyse,
lightonmuse.Create,
- lightonmuse.Represent]
+ lightonmuse.Embed]
double_input_endpoints = [lightonmuse.Compare,
lightonmuse.Select]

@@ -59,5 +59,6 @@ def test_prompt_too_long(self):
f"did not raise message about input" \
f"too long."


if __name__ == '__main__':
unittest.main()
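
The only substantive change in `tests/fail.py` tracks the endpoint rename: `Represent` becomes `Embed` in v0.3. Migrating caller code is a one-line swap:

```python
import lightonmuse

# before v0.3: embedder = lightonmuse.Represent("orion-fr")
embedder = lightonmuse.Embed("orion-fr")
outputs, cost, rid = embedder("Je voudrais un café.")
```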
47 changes: 26 additions & 21 deletions tests/understand.py
@@ -7,21 +7,22 @@
class TestUnderstandEndpoints(unittest.TestCase):
def test_analyse(self):
# check types and single input
- output_keys = {'execution_metadata', 'text', 'score', 'normalized_score', 'token_scores'}
+ output_keys = {'execution_metadata', 'text', 'score'}
analyser = lightonmuse.Analyse("orion-fr")
sentence = "Je voudrais un café et deux croissants, s'il vous plait."
outputs, cost, rid = analyser(sentence)
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == 1, f"`len(outputs) = {len(outputs)}` despite single input."
- assert cost == 1, f"`cost={cost}` despite single Analyse call."
+ assert cost["orion-fr@default"]["batch_size"] == 1, f"`batch_size={cost['orion-fr@default']['batch_size']}` " \
+     f"despite single Analyse call."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert output_keys == \
outputs[0].keys(), f"Set of keys is different than expected. Expected {output_keys}" \
f"got {outputs[0].keys()} instead."
assert outputs[0]["text"] == sentence, f"`text` field in `outputs` does not match the" \
f" input sentence."
- score, normalized_score = outputs[0]["score"], outputs[0]["normalized_score"]
- n_tokens = len(outputs[0]["token_scores"])
+ score, normalized_score = outputs[0]["score"]["logprob"], outputs[0]["score"]["normalized_logprob"]
+ n_tokens = len(outputs[0]["score"]["token_logprobs"])
assert score <= 0., f"Detected score > 0. This would give probability={math.exp(score)} " \
f"that is greater than 1."
assert math.isclose(normalized_score, score/n_tokens), f"Normalized score isn't close to " \
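
For `Analyse`, the updated assertions read the cost as a batch size rather than a token count, and pull all scores from the nested `score` object. A sketch of the v0.3 shape as this test consumes it (field names come from the assertions; the exact payload may differ):

```python
import lightonmuse

# v0.3 Analyse: cost reports batch_size per model@deployment,
# and scores are nested under `score`.
analyser = lightonmuse.Analyse("orion-fr")
outputs, cost, rid = analyser("Je voudrais un café.")
assert cost["orion-fr@default"]["batch_size"] == 1
score = outputs[0]["score"]
logprob = score["logprob"]           # total log-probability, <= 0
per_token = score["token_logprobs"]  # list of {token: logprob}
```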
@@ -34,31 +35,32 @@ def test_analyse(self):
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == len(sentence_list), f"`len(outputs) = {len(outputs)}` despite " \
f"len(input)={len(sentence_list)}"
- assert cost == len(sentence_list), f"`cost={cost}` despite len(input)={len(sentence_list)}"
+ assert cost["orion-fr@default"]["batch_size"] == len(sentence_list), \
+     f"`cost={cost['orion-fr@default']['batch_size']}` despite len(input)={len(sentence_list)}"
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."

# check correct functioning
message = "The unlikely sentence is more likely than the normal one."
assert outputs[0]["normalized_score"] > outputs[1]["normalized_score"], f"{message}"
token_scores = [list(element.values())[0] for element in outputs[1]["token_scores"]]
assert outputs[0]["score"]["normalized_logprob"] > outputs[1]["score"]["normalized_logprob"], f"{message}"
token_scores = [list(element.values())[0] for element in outputs[1]["score"]["token_logprobs"]]
message = f"The most unlikely token is different than expected."
assert math.isclose(min(token_scores), token_scores[-1]), f"{message}"

- def test_represent(self):
+ def test_embed(self):
# check types and single input
- # TODO: update this when `txt`->`text` upstream
- output_keys = {'execution_metadata', 'txt', 'embedding'}
- representer = lightonmuse.Represent("orion-fr")
+ output_keys = {'execution_metadata', 'text', 'embedding'}
+ representer = lightonmuse.Embed("orion-fr")
sentence = "Je voudrais un café et deux croissants, s'il vous plait."
outputs, cost, rid = representer(sentence)
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == 1, f"`len(outputs) = {len(outputs)}` despite single input."
- assert cost == 1, f"`cost={cost}` despite single Represent call."
+ assert cost['orion-fr@default']['batch_size'] == 1, \
+     f"`cost={cost['orion-fr@default']['batch_size']}` despite single Represent call."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert output_keys == \
outputs[0].keys(), f"Set of keys is different than expected. Expected {output_keys}" \
f"got {outputs[0].keys()} instead."
assert outputs[0]["txt"] == sentence, f"`txt` field in `outputs` does not match the" \
assert outputs[0]["text"] == sentence, f"`text` field in `outputs` does not match the" \
f" input sentence."
embedding = outputs[0]["embedding"]
assert isinstance(embedding, list)
@@ -71,7 +73,8 @@ def test_represent(self):
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == len(sentence_list), f"`len(outputs) = {len(outputs)}` despite " \
f"len(input)={len(sentence_list)}"
- assert cost == len(sentence_list), f"`cost={cost}` despite len(input)={len(sentence_list)}"
+ assert cost["orion-fr@default"]["batch_size"] == len(sentence_list), \
+     f"`batch size={cost['orion-fr@default']['batch_size']}` despite len(input)={len(sentence_list)}"
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."

first_embedding, second_embedding = outputs[0]["embedding"], outputs[1]["embedding"]
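
The test only checks that the two embeddings differ; a natural follow-up when working with `Embed` outputs is a similarity measure. The helper below is hypothetical (not part of lightonmuse), shown only to illustrate how the returned lists of floats can be compared:

```python
import math

def cosine_similarity(u, v):
    # Plain-Python cosine similarity between two embedding vectors,
    # e.g. the first_embedding/second_embedding read out above.
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

print(cosine_similarity([1.0, 0.0], [0.7, 0.7]))  # toy vectors; ~0.707
```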
@@ -89,7 +92,8 @@ def test_select(self):
outputs, cost, rid = selecter(reference, candidates)
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == 1, f"`len(outputs) = {len(outputs)}` despite single reference."
- assert cost == len(candidates), f"`cost={cost}` despite {candidates} candidates."
+ assert cost["orion-fr@default"]["batch_size"] == len(candidates), \
+     f"`batch size={cost['orion-fr@default']['batch_size']}` despite {len(candidates)} candidates."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert output_keys == \
outputs[0].keys(), f"Set of keys is different than expected. Expected " \
@@ -103,9 +107,9 @@ def test_select(self):
assert len(rankings) == len(candidates), f"Got {len(rankings)} elements in rankings " \
f"while {len(candidates)} candidates were given."

- scores = [element["score"] for element in rankings]
- normalized_scores = [element["normalized_score"] for element in rankings]
- n_tokens = [len(element["token_scores"]) for element in rankings]
+ scores = [element["score"]["logprob"] for element in rankings]
+ normalized_scores = [element["score"]["normalized_logprob"] for element in rankings]
+ n_tokens = [len(element["score"]["token_logprobs"]) for element in rankings]
message = f"Normalized score isn't close to score divided by number of tokens."
assert all(math.isclose(ns, s/n)
           for s, ns, n in zip(scores, normalized_scores, n_tokens)), message
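
Put together, the updated `Select` assertions imply each ranking entry carries the same nested score object as `Create` and `Analyse`. A self-contained sketch of picking the best candidate under that assumed shape (values are illustrative only):

```python
# Toy rankings in the assumed v0.3 shape.
rankings = [
    {"score": {"logprob": -12.0, "normalized_logprob": -3.0,
               "token_logprobs": [{"Je": -3.0}] * 4}},
    {"score": {"logprob": -6.0, "normalized_logprob": -1.5,
               "token_logprobs": [{"suis": -1.5}] * 4}},
]
best = max(rankings, key=lambda r: r["score"]["normalized_logprob"])
assert best["score"]["normalized_logprob"] == -1.5
```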
@@ -117,21 +121,22 @@ def test_select(self):
conjunction = "est equivalent à"
outputs, cost, rid = selecter(reference, candidates, conjunction=conjunction)
rankings = outputs[0]["rankings"]
normalized_scores = [element["normalized_score"] for element in rankings]
normalized_scores = [element["score"]["normalized_logprob"] for element in rankings]
best_score_with_conj = max(normalized_scores)
assert best_score_with_conj > best_score_no_conj, f"Conjunction `{conjunction}` does not" \
f"improve the score."

def test_compare(self):
output_keys = {"reference", "similarities", "execution_metadata"}
output_keys = {"reference", "similarities", "best", "execution_metadata"}
comparer = lightonmuse.Compare("orion-fr")
reference = "Je suis content"
correct, wrong, out_of_context = "Je suis heureux", "Je suis triste", "Hello world adhsh"
candidates = [wrong, correct, out_of_context]
outputs, cost, rid = comparer(reference, candidates)
assert isinstance(outputs, list), "`outputs` is not list as expected"
assert len(outputs) == 1, f"`len(outputs) = {len(outputs)}` despite single reference."
- assert cost == len(candidates)+1, f"`cost={cost}` different from {candidates} candidates+1."
+ assert cost['orion-fr@default']['batch_size'] == len(candidates)+1, \
+     f"`batch_size={cost['orion-fr@default']['batch_size']}` different from {candidates} candidates+1."
assert isinstance(rid, str), f"Detected type {type(rid)} for `rid`, expected `str` instead."
assert output_keys == \
outputs[0].keys(), f"Set of keys is different than expected. Expected " \
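
The `Compare` hunk shows v0.3 adding a `best` key next to `reference` and `similarities`, and reporting cost as a batch size of len(candidates)+1 (the candidates plus the reference itself). A minimal sketch of reading the new output, with the shape inferred from the updated `output_keys`:

```python
import lightonmuse

comparer = lightonmuse.Compare("orion-fr")
outputs, cost, rid = comparer("Je suis content",
                              ["Je suis heureux", "Je suis triste"])
best = outputs[0]["best"]                  # new in v0.3
similarities = outputs[0]["similarities"]
assert cost["orion-fr@default"]["batch_size"] == 2 + 1  # candidates + reference
```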
