### Imports

In [2]:
# One-time environment setup: uncomment these lines to install the dependencies.
# mkultra is installed straight from its GitHub repository, and pip's log is
# written to the file PIP_LOG.
#!pip install torch
#!pip install transformers
#!pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra --log PIP_LOG

In [3]:
import torch

from transformers.pipelines import pipeline
from mkultra.inference import GPT2SoftPromptLM
from mkultra.tokenizers import GPT2SPTokenizerFast
from mkultra.soft_prompt import SoftPrompt

2023-03-10 14:45:11.637107: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-10 14:45:11.637138: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Soft prompt

In [4]:
# Load the "gpt2" checkpoint through mkultra's soft-prompt-aware model and
# tokenizer classes, then wrap both in a text-generation pipeline.
checkpoint = "gpt2"
model = GPT2SoftPromptLM.from_pretrained(checkpoint)
tokenizer = GPT2SPTokenizerFast.from_pretrained(checkpoint)
generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# Here the article AND the summarisation cue both go into the soft prompt.
article_with_cue = """The artist Jeff Koons' famous sculptures might look like they're made from balloons -- but the works are actually fragile, as one art fair attendee found out when she knocked over a $42,000 Koons piece Thursday, causing it to shatter.

A blue balloon dog sculpture created by Koons broke into tiny shards when a visitor accidentally kicked its podium, according to the gallery hosting the piece.
Bel-Air Fine Art was displaying the piece at its booth at Art Wynwood, a contemporary art fair in Miami.

Here is a summary of this article :"""
sp = SoftPrompt.from_string(article_with_cue, model=model, tokenizer=tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'GPT2SPTokenizerFast'.
The model 'GPT2SoftPromptLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTJForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWi

In [5]:
# The prompt is only the soft prompt's tag string: the summarisation cue is
# already part of the text that was passed to SoftPrompt.from_string.
prompt = sp.get_tag_str()

In [6]:
# Token counts: the soft-prompt tags on their own, then the full prompt that
# is handed to the generator (identical here, since the prompt is tags only).
soft_prompt_len = len(tokenizer.encode(sp.get_tag_str()))
prompt_len = len(tokenizer.encode(prompt))
print(f"Length of soft prompt: {soft_prompt_len}")
print(f"Length of full prompt: {prompt_len}")

Length of soft prompt: 117
Length of full prompt: 117


In [7]:
# Sample a continuation of the prompt. min_length and max_length are set to the
# same value, so the length bounds are both prompt_len + 100 tokens.
output = generator(prompt,
                    do_sample=True,
                    min_length=prompt_len+100,
                    max_length=prompt_len+100,
                    repetition_penalty=1.7,
                    top_p=0.8,
                    temperature=0.4,
                    use_cache=True,
                    return_full_text=True,
                    # Passed explicitly (as in the later question-answering
                    # cells) so generation does not have to fall back to the
                    # EOS id and emit the "Setting `pad_token_id`" warning.
                    pad_token_id=tokenizer.eos_token_id)

# return_full_text=True means the printed text starts with the prompt itself.
print(output[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<FromString-7675bfab-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@>
"The blue balloon dog sculpture is a very simple piece of art, but it's really fragile. It breaks into tiny pieces and then explodes when you hit the ground." - Jeff Koons "It was just like an earthquake in my head that I didn't know what to do with myself because there were so many people around me who thought this would be fun for them," said artist Jeffrey Pascual-Paz, whose work has been featured on CNN. He says he found out about


In [8]:
# # It is also fine to use generate() instead of a pipeline.
# NOTE(review): this cell is disabled. `return_full_text` is an option of the
# text-generation pipeline rather than of `generate()`, so it will probably
# have to be removed before this snippet can run — confirm against the
# installed transformers version.
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# output = model.generate(input_ids,
#                         do_sample=True,
#                         min_length=prompt_len+100,
#                         max_length=prompt_len+100,
#                         repetition_penalty=1.7,
#                         top_p=0.8,
#                         temperature=0.7,
#                         use_cache=True,
#                         return_full_text=True,
#                         num_return_sequences=3)

# print(tokenizer.decode(output[0]))
# print(tokenizer.decode(output[1]))
# print(tokenizer.decode(output[2]))

### Soft prompt + hard prompt

In [9]:
# Load the "gpt2" checkpoint again and rebuild the text-generation pipeline.
model = GPT2SoftPromptLM.from_pretrained("gpt2")
tokenizer = GPT2SPTokenizerFast.from_pretrained("gpt2")
components = dict(model=model, tokenizer=tokenizer)
generator = pipeline('text-generation', **components)

# This time only the article goes into the soft prompt; the summarisation cue
# is supplied separately as plain text.
article = """The artist Jeff Koons' famous sculptures might look like they're made from balloons -- but the works are actually fragile, as one art fair attendee found out when she knocked over a $42,000 Koons piece Thursday, causing it to shatter.

A blue balloon dog sculpture created by Koons broke into tiny shards when a visitor accidentally kicked its podium, according to the gallery hosting the piece.
Bel-Air Fine Art was displaying the piece at its booth at Art Wynwood, a contemporary art fair in Miami."""
sp = SoftPrompt.from_string(article, **components)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'GPT2SPTokenizerFast'.
The model 'GPT2SoftPromptLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTJForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWi

In [10]:
# Hard prompt: the summarisation cue is appended as ordinary text right after
# the soft-prompt tags.
summary_cue = "Here is a summary of this article :"
prompt = sp.get_tag_str() + summary_cue

In [11]:
# Compare the token count of the soft-prompt tags with that of the full prompt
# (tags followed by the hard-prompt text).
prompt_len = len(tokenizer.encode(prompt))
soft_len = len(tokenizer.encode(sp.get_tag_str()))
print(f"Length of soft prompt: {soft_len}\nLength of full prompt: {prompt_len}")

Length of soft prompt: 107
Length of full prompt: 115


In [12]:
# Sample a continuation of the soft + hard prompt. min_length and max_length
# are set to the same value, so both length bounds are prompt_len + 100 tokens.
output = generator(prompt,
                    do_sample=True,
                    min_length=prompt_len+100,
                    max_length=prompt_len+100,
                    repetition_penalty=1.7,
                    top_p=0.4,
                    temperature=0.4,
                    use_cache=True,
                    return_full_text=True,
                    # Passed explicitly (as in the later question-answering
                    # cells) so generation does not have to fall back to the
                    # EOS id and emit the "Setting `pad_token_id`" warning.
                    pad_token_id=tokenizer.eos_token_id)

# return_full_text=True means the printed text starts with the prompt itself.
print(output[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<FromString-7db84666-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@>Here is a summary of this article :
The artist Jeff Koons' famous sculptures might look like they're made from balloons -- but the works are actually fragile, as one art fair attendee found out when she knocked over an $42.5 million piece Thursday night. The sculpture was created by kitty-dog owner and illustrator Jeffrey Loomis, who said he had been working on his own balloon artwork for years before it broke into pieces in front at Art Wynwood last week (see below). "I'm not sure


### Soft + hard prompt (with a long prompt)

In [13]:
# Load the "gpt2" checkpoint through mkultra's soft-prompt classes and build
# the text-generation pipeline around it.
checkpoint = "gpt2"
model = GPT2SoftPromptLM.from_pretrained(checkpoint)
tokenizer = GPT2SPTokenizerFast.from_pretrained(checkpoint)
generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# A much longer article, used to see how the approach copes with a long soft prompt.
string = """A sugar replacement called erythritol – used to add bulk or sweeten stevia, monkfruit and keto reduced-sugar products – has been linked to blood clotting, stroke, heart attack and death, according to a new study.

“The degree of risk was not modest,” said lead study author Dr. Stanley Hazen, director of the Center for Cardiovascular Diagnostics and Prevention at the Cleveland Clinic Lerner Research Institute.

People with existing risk factors for heart disease, such as diabetes, were twice as likely to experience a heart attack or stroke if they had the highest levels of erythritol in their blood, according to the study, published Monday in the journal Nature Medicine.

 “If your blood level of erythritol was in the top 25% compared to the bottom 25%, there was about a two-fold higher risk for heart attack and stroke. It’s on par with the strongest of cardiac risk factors, like diabetes,” Hazen said.

Additional lab and animal research presented in the paper revealed that erythritol appeared to be causing blood platelets to clot more readily. Clots can break off and travel to the heart, triggering a heart attack, or to the brain, triggering a stroke.

“This certainly sounds an alarm,” said Dr. Andrew Freeman, director of cardiovascular prevention and wellness at National Jewish Health, a hospital in Denver, who was not involved in the research.

“There appears to be a clotting risk from using erythritol,” Freeman said. “Obviously, more research is needed, but in an abundance of caution, it might make sense to limit erythritol in your diet for now.”

In response to the study, the Calorie Control Council, an industry association, told CNN that “the results of this study are contrary to decades of scientific research showing reduced-calorie sweeteners like erythritol are safe, as evidenced by global regulatory permissions for their use in foods and beverages,” said Robert Rankin, the council’s executive director, in an email.

The results “should not be extrapolated to the general population, as the participants in the intervention were already at increased risk for cardiovascular events,” Rankin said.

The European Association of Polyol Producers declined to comment, saying it had not reviewed the study."""

# Only the article is turned into the soft prompt.
sp = SoftPrompt.from_string(string, tokenizer=tokenizer, model=model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'GPT2SPTokenizerFast'.
The model 'GPT2SoftPromptLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTJForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWi

In [14]:
# Soft-prompt tags first, then the plain-text summarisation cue.
prompt_parts = (sp.get_tag_str(), "Here is a summary of this article :")
prompt = prompt_parts[0] + prompt_parts[1]

In [15]:
# Report how many tokens the soft-prompt tags take and how long the complete
# prompt (tags + hard prompt) is.
tag_token_ids = tokenizer.encode(sp.get_tag_str())
prompt_len = len(tokenizer.encode(prompt))
print(f"Length of soft prompt: {len(tag_token_ids)}")
print(f"Length of full prompt: {prompt_len}")

Length of soft prompt: 514
Length of full prompt: 522


In [16]:
# Sample a continuation of the long soft + hard prompt. min_length and
# max_length are set to the same value, so both length bounds are
# prompt_len + 100 tokens.
output = generator(prompt,
                    do_sample=True,
                    min_length=prompt_len+100,
                    max_length=prompt_len+100,
                    repetition_penalty=1.7,
                    top_p=0.4,
                    temperature=0.4,
                    use_cache=True,
                    return_full_text=True,
                    # Passed explicitly (as in the later question-answering
                    # cells) so generation does not have to fall back to the
                    # EOS id and emit the "Setting `pad_token_id`" warning.
                    pad_token_id=tokenizer.eos_token_id)

# return_full_text=True means the printed text starts with the prompt itself.
print(output[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<FromString-e57fd799-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@

In [58]:
# Load the "gpt2" checkpoint through mkultra's soft-prompt classes and build
# the text-generation pipeline around it.
model = GPT2SoftPromptLM.from_pretrained("gpt2")
tokenizer = GPT2SPTokenizerFast.from_pretrained("gpt2")
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# The article contains non-ASCII characters (e.g. "É", "€"), so decode it
# explicitly instead of relying on the platform's default encoding.
# Assumes the file is stored as UTF-8 — TODO confirm.
# Only the first 3800 characters are kept for the soft prompt.
with open('datasets/edf-wiki.txt', 'r', encoding='utf-8') as infile:
    string = infile.read()[:3800]

sp = SoftPrompt.from_string(string, model=model, tokenizer=tokenizer)

# Show the exact text that was turned into the soft prompt.
print(string)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'GPT2SPTokenizerFast'.
The model 'GPT2SoftPromptLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTJForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWi

Électricité de France

Électricité de France S.A. (literally Electricity of France), commonly known as EDF, is a French multinational electric utility company, largely owned by the French state. Headquartered in Paris, with €71.2 billion in revenues in 2016, EDF operates a diverse portfolio of at least 120 gigawatts of generation capacity in Europe, South America, North America, Asia, the Middle East, and Africa.

In 2009, EDF was the world's largest producer of electricity. Its 56 active nuclear reactors (in France) are spread out over 18 sites (nuclear power plants). They comprise 32 reactors of 900 MWe, 20 reactors of 1,300 MWe, and 4 reactors of 1,450 MWe, all PWRs.

EDF was created on 8 April 1946 by the 1945 parliament, from the merging of various divided actors. EDF led France's post-war energy growth, with a unique focus on civil nuclear energy, through reconstruction and further industrialization within the Trente Glorieuse, being a fleuron of France's new industrial landscape

In [59]:
# Question-answering hard prompt appended after the soft-prompt tags.
question = "In what city are EDF's headquarters located ?"
prompt = sp.get_tag_str() + f"Answer the following question.\nQuestion : {question}\nAnswer : "

In [60]:
# Token counts for the soft-prompt tags alone and for the full prompt
# (tags + question).
soft_prompt_len = len(tokenizer.encode(sp.get_tag_str()))
prompt_len = len(tokenizer.encode(prompt))
print(f"Length of soft prompt: {soft_prompt_len}")
print(f"Length of full prompt: {prompt_len}")

Length of soft prompt: 892
Length of full prompt: 914


In [61]:
# Sampling settings. min_length and max_length share the same value, so both
# length bounds are prompt_len + 100 tokens.
generation_kwargs = dict(
    do_sample=True,
    min_length=prompt_len+100,
    max_length=prompt_len+100,
    repetition_penalty=1.7,
    top_p=0.4,
    temperature=0.4,
    use_cache=True,
    return_full_text=True,
    pad_token_id=tokenizer.eos_token_id,
)
output = generator(prompt, **generation_kwargs)

# Only the first returned sequence is shown.
generated_text = output[0]['generated_text']
print(generated_text)

<FromString-e83e6b47-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@

In [64]:
# Same soft prompt, new question.
question = "How many people does EDF employ worldwide ?"
prompt = sp.get_tag_str() + f"Answer the following question.\nQuestion : {question}\nAnswer : "

In [65]:
# The question (and therefore the prompt) changed after prompt_len was last
# computed, so measure it again here; otherwise the length bounds below would
# be based on the previous prompt's token count.
prompt_len = len(tokenizer.encode(prompt))

# Sample a continuation. min_length and max_length share the same value, so
# both length bounds are prompt_len + 100 tokens.
output = generator(prompt,
                    do_sample=True,
                    min_length=prompt_len+100,
                    max_length=prompt_len+100,
                    repetition_penalty=1.7,
                    top_p=0.4,
                    temperature=0.4,
                    use_cache=True,
                    return_full_text=True,
                    pad_token_id=tokenizer.eos_token_id)

# return_full_text=True means the printed text starts with the prompt itself.
print(output[0]['generated_text'])

<FromString-e83e6b47-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@

In [66]:
# Same soft prompt, new question.
question = "What does the acronym 'EDF' stand for ?"
prompt = sp.get_tag_str() + f"Answer the following question.\nQuestion : {question}\nAnswer : "

In [69]:
# The question (and therefore the prompt) changed after prompt_len was last
# computed, so measure it again here; otherwise the length bounds below would
# be based on an earlier prompt's token count.
prompt_len = len(tokenizer.encode(prompt))

# Sample a continuation (this run uses temperature 0.6). min_length and
# max_length share the same value, so both length bounds are
# prompt_len + 100 tokens.
output = generator(prompt,
                    do_sample=True,
                    min_length=prompt_len+100,
                    max_length=prompt_len+100,
                    repetition_penalty=1.7,
                    top_p=0.4,
                    temperature=0.6,
                    use_cache=True,
                    return_full_text=True,
                    pad_token_id=tokenizer.eos_token_id)

# return_full_text=True means the printed text starts with the prompt itself.
print(output[0]['generated_text'])

<FromString-e83e6b47-@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@><@