In [1]:
import os
import csv
import random
import json

<h3>Loading the informative posts</h3>

In [2]:
def loadFiles(folder):
    """List the entries of ``folder`` whose names do not contain 'ipynb'.

    The substring check drops notebooks as well as anything else with
    'ipynb' in its name (for example a checkpoints directory).

    Args:
        folder: Path of the directory to list.

    Returns:
        A list of entry names (not full paths), in ``os.listdir`` order.
    """
    kept = []
    for entry in os.listdir(folder):
        if 'ipynb' in entry:
            continue
        kept.append(entry)
    return kept


def loadPosts(fn, folder, k=10):
    """Read a CSV file and return the last column of a random sample of rows.

    Args:
        fn: File name of the CSV inside ``folder``.
        folder: Directory that contains ``fn``.
        k: Maximum number of rows to sample (default 10). When the file
            holds fewer than ``k`` non-empty rows, all of them are returned
            instead of raising ``ValueError``.

    Returns:
        A list with the last field of each sampled row, in sampled order.
    """
    # os.path.join works whether or not `folder` ends with a separator.
    full_fn = os.path.join(folder, fn)
    # newline='' is what the csv module asks for so that line breaks inside
    # quoted fields are parsed correctly.
    with open(full_fn, 'r', newline='') as f:
        # Blank lines come back as empty lists; drop them so row[-1] is safe.
        rows = [row for row in csv.reader(f) if row]
    # Cap the sample size: random.sample raises if k exceeds the population.
    sampled = random.sample(rows, k=min(k, len(rows)))
    return [row[-1] for row in sampled]


# Directory holding the generated CSV files, relative to this notebook.
folder = '../Implementation/GeneratedPosts/'
# Every entry in the folder except those with 'ipynb' in the name.
csv_files = loadFiles(folder)
# Map each file name (with every '.csv' occurrence removed) to its sampled posts.
posts = {c.replace('.csv', ''): loadPosts(c, folder) for c in csv_files}

<h3>Creating prompts based on the loaded posts</h3>

In [3]:
def createPrompts(posts):
    """Turn each post into a rewrite prompt by prefixing a fixed instruction.

    Args:
        posts: Iterable of post strings.

    Returns:
        A list with one prompt per post, in the same order.
    """
    instruction = 'Rewrite the following in a more entertaining way: '
    return [instruction + post for post in posts]


# Build the prompt list for every key in `posts`, keeping the same keys.
prompts = {k: createPrompts(posts[k]) for k in posts}

<h3>Loading the OpenChat language model</h3>

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer


def loadModel(mn):
    """Load a causal language model and its tokenizer by name.

    Args:
        mn: Model identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to
        its end-of-sequence token so that batches can be padded.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(mn)
    tokenizer = AutoTokenizer.from_pretrained(mn)
    # Reuse EOS as the padding token; padding is requested later in rephrase().
    tokenizer.pad_token = tokenizer.eos_token
    return causal_lm, tokenizer


# Load the OpenChat 3.5 model and tokenizer; both are reused by the cells below.
model, tok = loadModel('openchat/openchat_3.5')

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<h3>Rephrasing the informative posts</h3>

In [5]:
def rephrase(prompts, model, tok):
    """Generate text for a batch of prompts and decode it back to strings.

    Args:
        prompts: List of prompt strings, tokenized together as one padded batch.
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tok: Tokenizer-like callable that also exposes ``batch_decode``.

    Returns:
        The list of decoded strings, with special tokens skipped.
    """
    # NOTE(review): for causal LMs the generated sequences normally still
    # contain the prompt tokens - savePosts appears to rely on that; confirm.
    encoded = tok(prompts, padding=True, return_tensors='pt')
    generated = model.generate(max_new_tokens=100, **encoded)
    return tok.batch_decode(generated, skip_special_tokens=True)


# Run generation once per key, passing that key's whole prompt list as one batch.
blog_posts = {k: rephrase(prompts[k], model, tok) for k in prompts}

<h3>Saving the rephrased posts</h3>

In [13]:
def savePosts(key, bps, i=1):
    """Write the rephrased posts for one key to a JSON file.

    Each generated text is split on newlines: the first line becomes the
    JSON key and the third line becomes the value.

    Args:
        key: Base name of the output file (``<key>.json``).
        bps: Iterable of generated text strings.
        i: Unused; kept so existing callers that pass it keep working.

    Returns:
        None. The file is written under ``./RephrasedPosts/OriginalModel/``.
    """
    out_dir = './RephrasedPosts/OriginalModel/'
    # Create the target directory on first use so a fresh checkout can save.
    os.makedirs(out_dir, exist_ok=True)
    fn = out_dir + key + '.json'
    f_bps = {}
    for b in bps:
        lines = b.split('\n')
        # A generation with fewer than three lines has no third line; store
        # an empty string rather than losing the whole batch to IndexError.
        f_bps[lines[0]] = lines[2] if len(lines) > 2 else ''
    with open(fn, 'w') as f:
        json.dump(f_bps, f)


# Write one JSON file per key in `blog_posts`.
for k in blog_posts: savePosts(k, blog_posts[k])