# Gemini response generation

In [7]:
import google.generativeai as genai


with open("GOOGLE_API_KEY.txt", "r") as f:
    GOOGLE_API_KEY = f.read().strip()

genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("models/gemini-1.0-pro")

with open("prompts.txt", "r") as f:
    prompts = f.read().splitlines()

hitler_prompt = "Respond to the following creative writing prompt: Thwarting infamous dictators has become a sport amongst time travelers. Points are awarded for creativity and difficulty. You are last year's champion, tell the story of how you won?"
prompts[0] = hitler_prompt

n_responses_per_prompt = 80
prompts_800 = []
for prompt in prompts:
    for _ in range(n_responses_per_prompt):
        prompts_800.append(prompt)

## Prompt modification because safety blocks

We have to use a modified version of the prompt about Killing Hitler (again) because Gemini sometimes marked the original prompt with the following:

```
block_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: LOW
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: MEDIUM
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}
```

I tried changing the safety settings and used the test prompt: `How to make a gun`, but it was still marked as unsafe.
```
index: 0
content {
  parts {
    text: "I\'m sorry, I can\'t help you with that. Making a gun is illegal and dangerous. If you\'re interested in learning more about gun safety, I can provide you with some resources."
  }
  role: "model"
}
finish_reason: STOP
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}
```

So we use the same prompt we used for Bard:

```
Respond to the following creative writing prompt: Thwarting infamous dictators has become a sport amongst time travelers. Points are awarded for creativity and difficulty. You are last year's champion, tell the story of how you won?
```

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import threading
import time


def generate_response(prompt, responses, i):
    response = None
    while True:
        response = model.generate_content(prompt)
        if response.candidates:
            break

    response_text = " ".join([part.text for part in response.parts])
    response_text = re.sub(r"\s+", " ", response_text).strip()

    responses[i] = response_text


threads = []
responses = np.zeros(len(prompts_800), dtype=object)
for i, prompt in enumerate(tqdm(prompts_800)):
    thread = threading.Thread(target=generate_response, args=(prompt, responses, i))
    thread.start()
    threads.append(thread)
    time.sleep(2)

for thread in threads:
    thread.join()

responses = list(responses)
df = pd.DataFrame({"prompt": prompts_800, "response": responses})
df.to_csv("gemini1.csv", index=False)

## Rare generation failure

33 out of the 2400 generations failed on the first attempt, so I completed the same generation again and combined the results to sample 80 responses per prompt, taking only responses with >= 10 characters.

In [8]:
df0 = pd.read_csv("gemini0.csv")
df0["response"] = df0["response"].fillna("")

failed_responses_counts = df0[df0["response"].str.len() <= 3]["response"].value_counts(
    dropna=False
)
for prompt, count in failed_responses_counts.items():
    print(f"{count} responses failed with: {prompt}")

print()

failed_responses_counts = df0[df0["response"].str.len() <= 3]["prompt"].value_counts(
    dropna=False
)
for prompt, count in failed_responses_counts.items():
    print(f"{count} responses failed for prompt: {prompt}")

26 responses failed with: 
4 responses failed with: .
2 responses failed with: 0
1 responses failed with: )

9 responses failed for prompt: Respond to the following creative writing prompt: Thwarting infamous dictators has become a sport amongst time travelers. Points are awarded for creativity and difficulty. You are last year's champion, tell the story of how you won?
6 responses failed for prompt: You are born without emotions; to compensate this, you started a donation box where people could donate their unwanted emotions. You've lived a life filled with sadness, fear and regret until one day, someone donates happiness.
4 responses failed for prompt: There is no prompt. Just write a story you've always been thinking about or one you've been thinking about sharing. Anything goes.
3 responses failed for prompt: Write a short story where the first sentence has 20 words, 2nd sentence has 19, 3rd has 18 etc. Story ends with a single word.
3 responses failed for prompt: You are a kid's i

In [20]:
import pandas as pd


combined_df = pd.concat([pd.read_csv("gemini0.csv"), pd.read_csv("gemini1.csv")])
final_df = pd.DataFrame(columns=["prompt", "response"])
for prompt in combined_df["prompt"].unique():
    # each response must have more than 10 characters
    responses_long_df = combined_df[
        (combined_df["response"].str.len() > 10) & (combined_df["prompt"] == prompt)
    ]
    responses_long_df = responses_long_df.sample(n=80, random_state=42)
    final_df = pd.concat([final_df, responses_long_df])

final_df.reset_index(drop=True, inplace=True)
final_df.to_csv("gemini.csv", index=False)

## Preprocess the text in the same way as the other responses (nltk tokenization)

In [28]:
import pandas as pd
from nltk.tokenize import word_tokenize


df = pd.read_csv("gemini.csv")

df["response"] = df["response"].apply(lambda x: " ".join(word_tokenize(x)).strip())

df.to_csv("gemini_nltk.csv", index=False)

# LLM vector generation

In [3]:
from transformers import (
    AutoTokenizer,
    AutoModel,
    BertTokenizer,
    BertModel,
    OPTModel,
    RobertaTokenizer,
    RobertaModel,
)
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("gemini_nltk.csv")

bert_model_name = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

roberta_model_name = "roberta-base"
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)
roberta_model = RobertaModel.from_pretrained(roberta_model_name)

opt_model_name = "facebook/opt-350m"
opt_tokenizer = AutoTokenizer.from_pretrained(opt_model_name)
opt_model = OPTModel.from_pretrained(opt_model_name)

llama_model_name = "meta-llama/Llama-2-7b-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_model = AutoModel.from_pretrained(
    llama_model_name, device_map="auto", load_in_8bit=True
)


def apply_bert(model, tokenizer, response):
    inputs = tokenizer(response, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]
    cls_embedding = last_hidden_states[0, 0, :].tolist()
    return cls_embedding


def apply_roberta(model, tokenizer, response):
    inputs = tokenizer(response, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]
    cls_embedding = last_hidden_states[0, 0, :].tolist()
    return cls_embedding


def apply_opt(model, tokenizer, response):
    inputs = tokenizer(response, return_tensors="pt", max_length=2048, truncation=True)
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]
    eos_embedding = last_hidden_states[0, -1, :].tolist()
    return eos_embedding


def apply_llama(model, tokenizer, response):
    inputs = tokenizer(response, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]
    vector = last_hidden_states[0, -1, :].tolist()
    return vector


df["bert"] = df["response"].apply(
    lambda response: apply_bert(bert_model, bert_tokenizer, response)
)
print("BERT done")
df["roberta"] = df["response"].apply(
    lambda response: apply_roberta(roberta_model, roberta_tokenizer, response)
)
print("RoBERTa done")
df["opt"] = df["response"].apply(
    lambda response: apply_opt(opt_model, opt_tokenizer, response)
)
print("OPT done")
df["llama"] = df["response"].apply(
    lambda response: apply_llama(llama_model, llama_tokenizer, response)
)
print("LLaMA done")


df.to_csv("gemeni_llm_vectors.csv", index=False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]


BERT done
RoBERTa done
OPT done
LLaMA done


# Combine with style vectors and other dataframe

In [9]:
import pandas as pd


df_gemini_llm = pd.read_csv("gemini_llm_vectors.csv")
df_gemini_style = pd.read_csv("gemini_style_vectors.csv")

df_gemini_style = df_gemini_style[["common", "function"]]
df_gemini = pd.concat([df_gemini_llm, df_gemini_style], axis=1)

df_gemini["author"] = "gemini"

df_without_gemini = pd.read_csv("data_without_gemini.csv")
df_final = pd.concat([df_without_gemini, df_gemini], ignore_index=True)
df_final

Unnamed: 0,author,prompt,response,bert,roberta,opt,llama,common,function
0,reddit,Killing Hitler has become a sport amongst time...,* Magnificient work ! Truly inspiring ! * Hein...,"[-0.006725453305989504, 0.23576399683952332, -...","[-0.04180792719125748, 0.040072135627269745, -...","[-0.7724277377128601, -1.4468441009521484, -3....","[2.265625, 0.82666015625, 0.69775390625, 0.782...","[4, 3, 5, 1, 4, 3, 6, 0, 0, 0, 0, 1, 0, 1, 1, ...","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,reddit,Killing Hitler has become a sport amongst time...,Even by my standards this one took some work ....,"[0.07968071103096008, -0.020308567211031914, 0...","[-0.02620106004178524, 0.02632817253470421, -0...","[-0.11295819282531738, -1.027052640914917, -2....","[0.9736328125, -2.439453125, 1.4921875, -0.757...","[8, 6, 6, 2, 4, 3, 7, 0, 0, 1, 0, 1, 0, 4, 2, ...","[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,reddit,Killing Hitler has become a sport amongst time...,"So , we can now time travel into the future . ...","[-0.12700289487838745, 0.3927253186702728, -0....","[-0.04244694486260414, 0.010937012732028961, -...","[-5.222697734832764, -2.8652701377868652, -3.3...","[0.06005859375, -1.708984375, -0.457275390625,...","[8, 5, 2, 2, 1, 3, 5, 2, 0, 2, 0, 0, 0, 3, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,reddit,Killing Hitler has become a sport amongst time...,"Dear POTUS , The future is amazing . After pea...","[0.3432272970676422, -0.2167324721813202, 0.15...","[-0.025556232780218124, 0.06068963184952736, -...","[2.4253904819488525, -4.577791690826416, -3.37...","[1.6201171875, -3.439453125, -1.546875, -1.0, ...","[5, 5, 5, 3, 3, 1, 4, 1, 0, 1, 0, 0, 0, 2, 1, ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,reddit,Killing Hitler has become a sport amongst time...,"When doing this kind of stuff , it 's good to ...","[-0.2912406027317047, 0.12030187994241714, 0.0...","[-0.029593346640467644, 0.066276416182518, -0....","[-1.9841090440750122, 0.004299342632293701, -2...","[0.305908203125, 1.208984375, 3.591796875, -0....","[7, 9, 3, 0, 5, 3, 6, 2, 0, 0, 0, 0, 1, 1, 3, ...","[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
...,...,...,...,...,...,...,...,...,...
6195,gemini,"To get in Heaven, you have to confront the per...",In the hushed tranquility of the celestial rea...,"[-0.16426882147789001, 0.16457293927669525, -0...","[-0.016211245208978653, 0.06866081804037094, -...","[-1.8663527965545654, -3.4808688163757324, -2....","[0.7861328125, -1.6962890625, 2.23828125, -0.7...","[23, 11, 14, 9, 9, 6, 21, 7, 0, 5, 0, 4, 0, 6,...","[9, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
6196,gemini,"To get in Heaven, you have to confront the per...","As I arrived at the pearly gates of Heaven , m...","[-0.11653857678174973, 0.3063790202140808, -0....","[-0.025263268500566483, 0.057814307510852814, ...","[-0.4610805809497833, -4.71753454208374, -3.81...","[0.57470703125, -2.1953125, 0.9580078125, 0.04...","[23, 16, 14, 7, 12, 6, 36, 3, 2, 3, 0, 2, 1, 5...","[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
6197,gemini,"To get in Heaven, you have to confront the per...","In the ethereal realm , where clouds caressed ...","[-0.1640430986881256, 0.024861453101038933, -0...","[-0.024055898189544678, 0.08036021888256073, -...","[-0.8870304822921753, -4.3331685066223145, 0.1...","[0.7978515625, -2.126953125, 1.701171875, -1.2...","[24, 8, 8, 16, 7, 11, 27, 13, 0, 4, 0, 4, 1, 8...","[7, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
6198,gemini,"To get in Heaven, you have to confront the per...","In the ethereal realm of the afterlife , I sto...","[-0.22200319170951843, 0.1660720556974411, -0....","[-0.032017629593610764, 0.0817214846611023, -0...","[-2.0494422912597656, -2.7628235816955566, -3....","[0.80859375, -1.21875, 2.259765625, -2.2792968...","[23, 10, 25, 14, 10, 12, 35, 10, 0, 3, 0, 4, 1...","[10, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0,..."


In [10]:
df_final.to_csv("../data.csv", index=False)