In [1]:
import pandas as pd

# Forum messages table (ForumMessages.csv from the meta-kaggle input).
forum_messages_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/meta-kaggle/ForumMessages.csv',
)
# NOTE(review): this preview is not rendered -- a cell only shows its last expression.
forum_messages_df.head()

# De-duplicated movie plot table (wikipedia-movie-plots input).
wikipedia_movie_plots_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv',
)
wikipedia_movie_plots_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [2]:
# First 1000 plot summaries (positional slice). Not rendered: only the last
# expression of a cell is displayed.
wikipedia_movie_plots_df['Plot'].iloc[:1000]
# Take the first two messages by position, then pick the one labelled 0.
forum_messages_df['Message'].iloc[:2][0]

'<p>This like betting your life savings on a game of black jack, win once, and then walk. Brilliant!</p>'

In [3]:
# Stack the first ten forum messages on top of the first ten movie plots and
# renumber the combined Series from 0.
original_texts = pd.concat(
    [
        forum_messages_df['Message'].head(10),
        wikipedia_movie_plots_df['Plot'].head(10),
    ],
    ignore_index=True,
)
original_texts

0     <p>This like betting your life savings on a ga...
1     <p>Hi everyone... I'm looking for a team. I'm ...
2                            <p>Thanks for sharing!</p>
3     <p>This competition's final submission deadlin...
4     <p>funny, it didn't work for me - got 13.837.\...
5     <p>10x+ scores, so unreliable on the &lt;1% te...
6                                    <p>Great Work!</p>
7                                         <p>Thx ;)</p>
8     <p>I hope this is occasional  and will be fixe...
9     <p>Great stuff Tom! I really like the walkthro...
10    A bartender is working at a saloon, serving dr...
11    The moon, painted with a smiling face hangs ov...
12    The film, just over a minute long, is composed...
13    Lasting just 61 seconds and consisting of two ...
14    The earliest known adaptation of the classic f...
15    Alice follows a large white rabbit down a "Rab...
16    The film opens with two bandits breaking into ...
17    The film is about a family who move to the

In [4]:
# Pool of rewrite instructions; one of these is drawn at random for each text.
rewrite_prompts = [
    "Explain this to me like I'm five.",
    "Convert this into a sea shanty.",
    "Make this rhyme.",
    "Make this shorter.",
    "Make this longer.",
    "Make this more detailed.",
    # Same instruction repeated for several authors (note: no trailing period).
    *(
        f"Rewrite this essay but do it using the writing style of {author}"
        for author in ("Dr. Seuss", "William Shakespeare", "Tupac Shakur")
    ),
    "Make this a haiku.",
    "Make this into a poem.",
    "Turn this into a sonnet.",
    "Summarize this.",
    "Give me the highlights.",
]

In [5]:
!pip install -q -U immutabledict sentencepiece
!git clone https://github.com/google/gemma_pytorch.git
!mkdir /kaggle/working/gemma/
!mv /kaggle/working/gemma_pytorch/gemma/* /kaggle/working/gemma/

Cloning into 'gemma_pytorch'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 91 (delta 20), reused 15 (delta 11), pack-reused 55[K
Unpacking objects: 100% (91/91), 2.14 MiB | 3.91 MiB/s, done.


In [6]:
import sys
sys.path.append("/kaggle/working/gemma_pytorch/")
from gemma.config import GemmaConfig, get_config_for_7b, get_config_for_2b
from gemma.model import GemmaForCausalLM
from gemma.tokenizer import Tokenizer
import contextlib
import os
import torch

# Model variant to load. The config choice, the quantization flag and the
# checkpoint file name below are all derived from this string.
VARIANT = "7b-it-quant"
# Device type passed to torch.device.
MACHINE_TYPE = "cuda"
# Built from VARIANT so the weights directory cannot drift from the
# checkpoint file name, which is also built from VARIANT.
weights_dir = f'/kaggle/input/gemma/pytorch/{VARIANT}/2'

@contextlib.contextmanager
def _set_default_tensor_type(dtype: torch.dtype):
  """Sets the default torch dtype to the given dtype."""
  torch.set_default_dtype(dtype)
  yield
  torch.set_default_dtype(torch.float)
    
# Select the config factory that matches the variant name.
if "2b" in VARIANT:
  model_config = get_config_for_2b()
else:
  model_config = get_config_for_7b()
# The tokenizer model file sits next to the weights.
model_config.tokenizer = os.path.join(weights_dir, "tokenizer.model")
# Quantization is switched on purely by the variant name containing "quant".
model_config.quant = "quant" in VARIANT

device = torch.device(MACHINE_TYPE)
ckpt_path = os.path.join(weights_dir, f'gemma-{VARIANT}.ckpt')

# Build and load the model while the config's dtype is the torch default,
# then move it to the target device and switch to eval mode.
with _set_default_tensor_type(model_config.get_dtype()):
  model = GemmaForCausalLM(model_config)
  model.load_weights(ckpt_path)
  model = model.to(device).eval()

  return self.fget.__get__(instance, owner)()


In [7]:
import random
# Seed the RNG so the prompt drawn for each text is the same on every run.
random.seed(0)
# Wraps a prompt in a single user turn and opens the model's turn.
USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

# One record per source text: the text, the instruction drawn for it, and the
# model's output.
rewrite_data = []

for original_text in original_texts:
    # Draw one instruction per text and put it on the line above the text.
    rewrite_prompt = random.choice(rewrite_prompts)
    prompt = f'{rewrite_prompt}\n{original_text}'
    chat_input = USER_CHAT_TEMPLATE.format(prompt=prompt)
    rewritten_text = model.generate(chat_input, device=device, output_len=100)
    rewrite_data.append(
        dict(
            original_text=original_text,
            rewrite_prompt=rewrite_prompt,
            rewritten_text=rewritten_text,
        )
    )

In [8]:
# Collect the per-text records into a table (one row per rewrite).
rewrite_data_df = pd.DataFrame(rewrite_data)
# Show up to the first 20 rows as a raw array so the text cells are not
# shortened by the DataFrame display.
rewrite_data_df.iloc[:20].to_numpy()

array([['<p>This like betting your life savings on a game of black jack, win once, and then walk. Brilliant!</p>',
        'Give me the highlights.',
        "Sure, here's the highlights:\n\n* **High-stakes blackjack:** The text describes playing blackjack with life savings, implying a significant financial commitment.\n* **Potentially lucrative winnings:** The text suggests that a win in this game could result in a significant windfall, comparable to walking away from the game with substantial winnings."],
       ["<p>Hi everyone... I'm looking for a team. I'm Data Scientist and I'd like learning more about Kaggle competitions, strategies and I have time to do this! I use Python and R currently. ;)</p>",
        'Rewrite this essay but do it using the writing style of Dr. Seuss',
        'Sure, here\'s the rewritten essay using the writing style of Dr. Seuss:\n\n"Hi, my dear friend, come join the fun,\nI\'m Data Scientist, ready to run.\nI\'m searching for a team of bright,\nAnd eager

In [9]:
# Write the table to the current working directory, without the index column.
rewrite_data_df.to_csv(
    path_or_buf='prompts_and_rewrites.csv',
    index=False,
)