In [1]:
from model_dict import *
from inference import *
import pandas as pd
import random, csv
import guidance
from transformers import AutoTokenizer, pipeline, logging

  from .autonotebook import tqdm as notebook_tqdm


## Instantiate guidance

In [2]:
# Locations of the training TSVs for the two tasks (summarization, en-de MT).
summ_path = '../data/summarization/train_summarization.tsv'
mt_path = '../data/en_de/train_en_de.tsv'

# Both files are tab-separated. csv.QUOTE_NONE tells the parser to treat
# quote characters as ordinary text rather than as field quoting.
summ_df = pd.read_csv(summ_path, sep="\t", quoting=csv.QUOTE_NONE)
mt_df = pd.read_csv(mt_path, sep="\t", quoting=csv.QUOTE_NONE)

In [3]:
# Checkpoint to evaluate; it is looked up under ../models/<model_name>.
model_name = 'WizardLM-13B-V1.1-GPTQ'
model_path = '../models/' + model_name

# load_from_catalogue returns four values: the model, its tokenizer, and the
# two prompt strings (u_prompt, a_prompt) that the templates below substitute
# for {{prompt_placeholder}} and {{response_placeholder}}.
(model, tokenizer, u_prompt, a_prompt) = load_from_catalogue(model_name, model_path)



In [4]:
# Wrap the already-loaded model/tokenizer pair in guidance's Transformers LLM.
g_model = guidance.llms.Transformers(model, tokenizer=tokenizer, trust_remote_code=True)

# Empty the class-level cache, then register the wrapper as guidance's
# default LLM.
guidance.llms.Transformers.cache.clear()
guidance.llm = g_model

In [6]:
# Disabled: plain transformers text-generation pipeline, kept for reference.
# NOTE(review): the leftover output under this cell reports that
# 'LlamaGPTQForCausalLM' "is not supported for text-generation" -- presumably
# the reason this was commented out; confirm, or delete the cell.
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=512,
#     temperature=0.7,
#     top_p=0.95,
#     repetition_penalty=1.15
# )

The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead',

#### Zero-Shot

In [14]:
# Zero-shot summarization-scoring template (guidance handlebars syntax).
#   {{prompt_placeholder}} / {{response_placeholder}} : filled with u_prompt / a_prompt
#   {{gt}} / {{hyp}}                                   : source document / candidate summary
#   {{gen 'score' ...}}                                : generation constrained to an integer 0-100
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line must end in a space before the backslash; otherwise the
# words run together ("documenton", "meansirrelevant"). The two score
# descriptions are quoted as a balanced pair.
input_prompt = '''
{{prompt_placeholder}}
Score the summarization with respect to the summarized document \
on a continuous scale from 0 to 100, where a score of zero means \
"irrelevant, factually incorrect and not readable" and score of one hundred means \
"relevant, factually correct, good readability".
----
Source text: {{gt}}
Summary: {{hyp}}
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [16]:
# Compile the current template into a guidance program bound to g_model,
# with guidance's caching switched off.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# Fill the template from the row labelled 2 of the summarization data and the
# u_prompt / a_prompt strings returned by load_from_catalogue.
zero_shot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    gt=summ_df.loc[2, 'SRC'],
    hyp=summ_df.loc[2, 'HYP'],
)
# NOTE(review): torch is not imported explicitly in this notebook; presumably
# it is re-exported by one of the star imports -- confirm.
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
zero_shot()

In [18]:
# Zero-shot translation-scoring template (guidance handlebars syntax).
#   {{source_lang}} / {{target_lang}} : language names supplied at call time
#   {{gt}} / {{hyp}}                  : source sentence / candidate translation
#   {{gen 'score' ...}}               : generation constrained to an integer 0-100
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line ends in a space before the backslash; without it the
# prompt read "with respect tothe source" and 'means"no meaning preserved"'.
input_prompt = '''
{{prompt_placeholder}}
Score the following translation from {{source_lang}} to {{target_lang}} with respect to \
the source sentence on a continuous scale from 0 to 100, where a score of zero means \
"no meaning preserved" and score of one hundred means "perfect meaning and grammar".
----
{{source_lang}} source: "{{gt}}"
{{target_lang}} translation: "{{hyp}}"
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [19]:
# Compile the current (translation) template into a guidance program bound to
# g_model, with guidance's caching switched off.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# Fill the template from the row labelled 2 of the en-de data; the language
# names are passed in as plain strings.
zero_shot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    source_lang="English",
    target_lang="German",
    gt=mt_df.loc[2, 'SRC'],
    hyp=mt_df.loc[2, 'HYP'],
)
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
zero_shot()

#### Few-Shot

In [20]:
# Few-shot demonstrations: rows labelled 0 and 1 of the summarization data,
# with every field rendered to text so it can be dropped into the template.
examples = [
    {
        'gt': f'{summ_df["SRC"][row]}',
        'hyp': f'{summ_df["HYP"][row]}',
        'score': f'{summ_df["Score"][row]}',
    }
    for row in (0, 1)
]

# Few-shot summarization-scoring template (guidance handlebars syntax).
# The {{#each examples}} loop renders one solved example per entry of
# `examples` (keys: gt, hyp, score); the final block is the item to score,
# with the score generation constrained to an integer 0-100.
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line must end in a space before the backslash; otherwise the
# words run together ("documenton", "meansirrelevant"). The two score
# descriptions are quoted as a balanced pair.
input_prompt = '''
{{prompt_placeholder}}
Score the summarization with respect to the summarized document \
on a continuous scale from 0 to 100, where a score of zero means \
"irrelevant, factually incorrect and not readable" and score of one hundred means \
"relevant, factually correct, good readability".
----
{{~! display the few-shot examples ~}}
{{~#each examples}}
Source text: {{this.gt}}
Summary: {{this.hyp}}
{{response_placeholder}}
Score: {{this.score}}
----
{{~/each}}

{{~! place the real question at the end }}
Source text: {{gt}}
Summary: {{hyp}}
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [21]:
# Compile the few-shot template into a guidance program bound to g_model,
# with guidance's caching switched off.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# `examples` feeds the {{#each}} loop; the row labelled 2 is the item to score.
few_shot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    examples=examples,
    gt=summ_df.loc[2, 'SRC'],
    hyp=summ_df.loc[2, 'HYP'],
)
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
few_shot()

In [24]:
# Few-shot demonstrations: rows labelled 0 and 1 of the en-de data, with every
# field rendered to text so it can be dropped into the template.
# NOTE(review): the prompt asks for scores on a 0-100 scale; confirm the "mqm"
# column is on that scale before using it as the demonstrated score.
examples = [
    {
        'gt': f'{mt_df["SRC"][row]}',
        'hyp': f'{mt_df["HYP"][row]}',
        'score': f'{mt_df["mqm"][row]}',
    }
    for row in (0, 1)
]

# Few-shot translation-scoring template (guidance handlebars syntax).
# The {{#each examples}} loop renders one solved example per entry of
# `examples` (keys: gt, hyp, score); the final block is the item to score,
# with the score generation constrained to an integer 0-100.
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line ends in a space before the backslash; without it the
# prompt read "with respect tothe source" and 'means"no meaning preserved"'.
input_prompt = '''
{{prompt_placeholder}}
Score the following translation from {{source_lang}} to {{target_lang}} with respect to \
the source sentence on a continuous scale from 0 to 100, where a score of zero means \
"no meaning preserved" and score of one hundred means "perfect meaning and grammar".
----
{{~! display the few-shot examples ~}}
{{~#each examples}}
{{source_lang}} source: {{this.gt}}
{{target_lang}} translation: {{this.hyp}}
{{response_placeholder}}
Score: {{this.score}}
----
{{~/each}}

{{~! place the real question at the end }}
{{source_lang}} source: {{gt}}
{{target_lang}} translation: {{hyp}}
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [25]:
# Compile the few-shot translation template into a guidance program bound to
# g_model, with guidance's caching switched off.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# `examples` feeds the {{#each}} loop; the row labelled 2 is the item to score.
few_shot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    source_lang="English",
    target_lang="German",
    examples=examples,
    gt=mt_df.loc[2, 'SRC'],
    hyp=mt_df.loc[2, 'HYP'],
)
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
few_shot()

#### Chain of Thought

In [26]:
# Chain-of-thought summarization-scoring template (guidance handlebars
# syntax): the task description is followed by explicit evaluation steps, then
# the item to score, with the score generation constrained to an integer 0-100.
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line must end in a space before the backslash; otherwise the
# words run together ("documenton", "source text.Check", "text,and"). The two
# score descriptions are quoted as a balanced pair.
# NOTE(review): step 1 says "Read the summary"; it looks like it was meant to
# refer to the source text, since step 2 already reads the summary -- confirm.
input_prompt = '''
{{prompt_placeholder}}
Score the summarization with respect to the summarized document \
on a continuous scale from 0 to 100, where a score of zero means \
"irrelevant, factually incorrect and not readable" and score of one hundred means \
"relevant, factually correct, good readability".
    
Evaluation Steps:
1. Read the summary carefully and identify the main topic and key points.
2. Read the summary and compare it to the source text. \
Check if the summary covers the main topic and key points of source text, \
and if it presents them in a clear and logical order
3. Assign a score.
----
{{~! place the real question at the end }}
Source text: {{gt}}
Summary: {{hyp}}
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [27]:
# Compile the chain-of-thought template into a guidance program bound to
# g_model, with guidance's caching switched off.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# Fill the template from the row labelled 2 of the summarization data.
cot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    gt=summ_df.loc[2, 'SRC'],
    hyp=summ_df.loc[2, 'HYP'],
)
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
cot()

In [31]:
# Chain-of-thought translation-scoring template (guidance handlebars syntax):
# the task description is followed by explicit evaluation steps, then the item
# to score, with the score generation constrained to an integer 0-100.
# A backslash at end of line joins it to the next line with NO separator, so
# each continued line ends in a space before the backslash; without it the
# prompt read "with respect tothe source", 'means"no meaning preserved"' and
# "source,and if".
# NOTE(review): "syntatic" in step 2 is probably meant to be "syntactic"; the
# prompt text is left as-is here.
input_prompt = '''
{{prompt_placeholder}}
Score the following translation from {{source_lang}} to {{target_lang}} with respect to \
the source sentence on a continuous scale from 0 to 100, where a score of zero means \
"no meaning preserved" and score of one hundred means "perfect meaning and grammar".
    
Evaluation Steps:
1. Read source and translation independently.
2. Read translation and compare it to the source. \
Check if the translation covers all the information and fluency of source, \
and if it doesn't have any syntatic errors.
3. Assign a score.
----
{{source_lang}} source: "{{gt}}"
{{target_lang}} translation: "{{hyp}}"
{{response_placeholder}}
Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}
'''

In [30]:
# Compile the chain-of-thought translation template into a guidance program
# bound to g_model, with guidance's caching switched off.
# NOTE(review): this cell's execution count (30) is lower than that of the
# prompt cell above it (31), so the saved run may have used an older
# input_prompt -- re-run top to bottom to confirm.
structure_program = guidance(input_prompt, llm=g_model, caching=False)

# Fill the template from the row labelled 2 of the en-de data.
cot = structure_program(
    prompt_placeholder=u_prompt,
    response_placeholder=a_prompt,
    source_lang="English",
    target_lang="German",
    gt=mt_df.loc[2, 'SRC'],
    hyp=mt_df.loc[2, 'HYP'],
)
torch.cuda.empty_cache()

# Last expression of the cell: the call's return value becomes the cell output.
cot()

#### Fine-Grained

#### Generation Probability

#### Majority Vote

#### Self-Refinement