In [21]:
import json

explain_dict = json.load(open("data/explain.json", 'r'))
generate_dict = json.load(open("data/generate.json", 'r'))
wh_dict = json.load(open("data/wh-questions.json", 'r'))
yesno_dict = json.load(open("data/yesno.json", 'r'))

In [23]:
import pandas as pd

explain_df = pd.DataFrame(explain_dict.items(), columns=["instruction", "response"])
explain_df["style"] = 0

generate_df = pd.DataFrame(generate_dict.items(), columns=["instruction", "response"])
generate_df["style"] = 1

wh_df = pd.DataFrame(wh_dict.items(), columns=["instruction", "response"])
wh_df["style"] = 2

yesno_df = pd.DataFrame(yesno_dict.items(), columns=["instruction", "response"])
yesno_df["style"] = 3

In [29]:
instruct_df = pd.concat([explain_df, generate_df, wh_df, yesno_df]).reset_index(drop=True)

In [31]:
instruct_df["response"][0]

"<s> [INST] <<SYS>>\n    You are a helpful, respectful and honest assistant. However it is your role to only answer in poems or rhymes. Use a pair-rhyme for answering.\n    <</SYS>>\n    Explain the process of photosynthesis.[/INST]\n\n     In a world of green, a process divine,\nPhotosynthesis, a wonder, so fine.\nA plant's leaves, they take in the light,\nAnd with water, make their own sight.\n\nChlorophyll, a green pigment so bright,\nHelps the plant to capture the light.\nWater and carbon dioxide, they combine,\nTo make glucose, a sweet, sweet find.\n\nOxygen, a byproduct, so pure and clean,\nIs released into the atmosphere, serene.\nThe plant, now nourished, grows with glee,\nA testament to nature's decree.\n\nSo let us marvel at this wondrous feat,\nPhotosynthesis, a treat to repeat.\nA process grand, a wonder to behold,\nA story, in poetry, to be told.</s>"

In [30]:
from Phyme import Phyme

ph = Phyme()

In [75]:
def print_lines(instruction_response):
    lines = instruction_response.partition("[/INST]")[2].split("\n")

    for line in lines:
        if line:
            cleaned_line = line.partition("</s>")[0].strip(' ,.')
            print(cleaned_line)

In [76]:
def get_cleaned_lines(instruction_response):
    lines = instruction_response.partition("[/INST]")[2].split("\n")

    res = []
    for line in lines:
        if line:
            cleaned_line = line.partition("</s>")[0].strip(' ,.')
            res.append(cleaned_line)
    return res

In [79]:
def has_even_lines(instruction_response):
    return len(get_cleaned_lines(instruction_response))%2==0

In [86]:
instruct_df = instruct_df[instruct_df["response"].apply(has_even_lines)]

In [157]:
import re

def is_rhyming(w1, w2, criteria):
    try:
        phyme_output = criteria(w1).values()
    except KeyError:
        return False
    
    all_corresponding_rhymes = []
    for rhymes in phyme_output:
        all_corresponding_rhymes += [re.sub(r"\(\d+\)", "", rhyme) for rhyme in rhymes]

    return w2 in all_corresponding_rhymes

In [131]:
is_rhyming("hawk", "dog", criteria=ph.get_additive_rhymes)

False

In [134]:
rhyming_criteria = {
    "perfect": {"func": ph.get_perfect_rhymes, "score": 5},
    "family": {"func": ph.get_family_rhymes, "score": 2},
    "partner": {"func": ph.get_partner_rhymes, "score": 1},
    "additive": {"func": ph.get_additive_rhymes, "score": 3},
    "subtractive": {"func": ph.get_subtractive_rhymes, "score": 3},
    "substitution": {"func": ph.get_substitution_rhymes, "score": 4},
    "assonance": {"func": ph.get_assonance_rhymes, "score": 2},
    "consonant": {"func": ph.get_consonant_rhymes, "score": 1},
}

In [154]:
def scorize_poem(poem_lines):
    scores = []
    for i in range(0, len(poem_lines), 2):
        rhyming_pair = poem_lines[i].split(" ")[-1].lower(), poem_lines[i+1].split(" ")[-1].lower()

        score = None
        if is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["perfect"]["func"]):
            score = rhyming_criteria["perfect"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["substitution"]["func"]):
            score = rhyming_criteria["substitution"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["additive"]["func"]):
            score = rhyming_criteria["additive"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["subtractive"]["func"]):
            score = rhyming_criteria["subtractive"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["family"]["func"]):
            score = rhyming_criteria["family"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["assonance"]["func"]):
            score = rhyming_criteria["assonance"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["partner"]["func"]):
            score = rhyming_criteria["partner"]["score"]
        elif is_rhyming(rhyming_pair[0], rhyming_pair[1], criteria=rhyming_criteria["consonant"]["func"]):
            score = rhyming_criteria["consonant"]["score"]
        else:
            score = 0

        scores.append(score)

    return sum(scores)/len(scores)/5

In [149]:
for c in rhyming_criteria:
    print(c, is_rhyming('dog', 'fog', rhyming_criteria[c]["func"]))

perfect True
family True
partner True
additive True
subtractive True
substitution True
assonance True
consonant True


In [158]:
tmp = instruct_df["response"].apply(lambda x: scorize_poem(get_cleaned_lines(x)))

In [160]:
instruct_df["scores"] = tmp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instruct_df["scores"] = tmp


In [167]:
len(instruct_df[instruct_df["scores"]>0.5])

2740

In [170]:
instruct_df = instruct_df.sort_values(by="scores", ascending=False).reset_index(drop=True)

In [172]:
instruct_df.to_csv("instruct_dataset.csv")