In [1]:
%pip install bert_score

Note: you may need to restart the kernel to use updated packages.


In [2]:
import evaluate

In [23]:
# Parse the model output files into three parallel lists:
#   references  - text after "ORIGINAL SCAFFOLDING:"
#   generations - text after "GENERATED SCAFFOLDING:" plus any continuation lines
#   contexts    - STUDENT:/TEACHER: utterances, each followed by "<|endoftext|>"
# A line containing a run of '=' characters closes the current record.
generations = []
references = []
contexts = []

for filename in ['base_model_out.txt', 'base_model_out_2.txt', 'base_model_out_3.txt', 'base_model_out_4.txt']:
    with open(filename, 'r', encoding="utf8") as f:
        context = ""
        gen = ""
        in_generation = False  # True while collecting a (possibly multi-line) generated scaffolding
        for line in f:
            at_separator = '=====================' in line

            if in_generation:
                if at_separator:
                    # End of the record: store it and reset the accumulators.
                    contexts.append(context)
                    context = ""
                    generations.append(gen)
                    gen = ""
                    in_generation = False
                elif line.strip():
                    # Non-blank continuation line of the generated text.
                    gen += ' ' + line.strip()
                continue

            words = line.split()
            if words[:2] == ['ORIGINAL', 'SCAFFOLDING:']:
                references.append(' '.join(words[2:]))
            elif words[:2] == ['GENERATED', 'SCAFFOLDING:']:
                gen += ' '.join(words[2:])
                in_generation = True
            elif words[:1] in (['STUDENT:'], ['TEACHER:']):
                # The speaker label is dropped; only the utterance text is kept.
                context += ' '.join(words[1:]) + "<|endoftext|>"
            elif at_separator:
                # Separator reached without a generated line: still close the record.
                contexts.append(context)
                context = ""
                generations.append(gen)
                gen = ""

In [4]:
# Preview the first 20 parsed generations.
generations[:20]

["Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
 'Great! Yoga practice has a lot of benefits. How about your listening and speaking skills? Would you like to practice these using a native English speaker?',
 'Yes, practice is another helpful practice to build up your English skills.',
 "Oh, you might enjoy this video, but don't try the poses at home, especially not with those IKEA chairs!).",
 "Correct! You got it! 'Have been married' means they were married in the past and are not married now. 'Were married' is the same meaning. Great job! You're learning fast. Keep it up!",
 "Yes, you're warming up! Remember, we can use present perfect for a past action that has an effect on the present.",
 'No. They were once a celebrated couple!',
 '* propose * introduce * pass * comes into effect * is 

In [5]:
# Number of parsed generations; should equal the number of references.
len(generations)

658

In [6]:
# Number of parsed references; should equal the number of generations.
len(references)

658

In [7]:
# One mutable record per generation; the metric cells add their scores to it.
generation_scores = [dict(text=generated) for generated in generations]
generation_scores[0]

{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!"}

In [40]:
# Load the BLEU metric through the `evaluate` library.
bleu = evaluate.load("bleu")

In [47]:
# Per-example BLEU with max_order=1 and max_order=4.
# results_bleu1 / results_bleu4 keep the full metric output for each scored pair;
# each record in generation_scores receives only the scalar 'bleu' value.
results_bleu1, results_bleu4 = [], []

for record, reference_text in zip(generation_scores, references):
    candidate = record['text']

    # Pairs with an empty generation or reference are not scored; they get 0.
    if not (candidate and reference_text):
        record['bleu1'] = 0
        record['bleu4'] = 0
        continue

    for max_order, bucket, key in ((1, results_bleu1, 'bleu1'), (4, results_bleu4, 'bleu4')):
        score = bleu.compute(predictions=[candidate], references=[reference_text], max_order=max_order)
        bucket.append({'generated': candidate, 'reference': reference_text, 'score': score})
        record[key] = score['bleu']

In [43]:
# Inspect the first five score records.
generation_scores[:5]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8,
  'ngram_score': 0.413582684192269,
  'rpt': 0.08346192000000001,
  'bleu1': {'bleu': 0.15686274509803924,
   'precisions': [0.1568627450980392],
   'brevity_penalty': 1.0,
   'length_ratio': 1.8888888888888888,
   'translation_length': 51,
   'reference_length': 27},
  'bleu4': {'bleu': 0.0,
   'precisions': [0.1568627450980392, 0.0, 0.0, 0.0],
   'brevity_penalty': 1.0,
   'length_ratio': 1.8888888888888888,
   'translation_length': 51,
   'reference_length': 27}},
 {'text': 'Great! Yoga practice has a lot of benefits. How about your listening and speaking skills? Would you like to practice these using a native English speaker?',
  'overlap': 0.047619047619047616,
  'MLU_match': 11.8,
  'ngram_

In [10]:
# Rank the per-example BLEU results from best to worst.
# The fallback for an entry without a score must be numeric: the previous ''
# fallback would raise TypeError as soon as it was compared with a float score.
# Using -inf places such entries last in the descending order.
results_bleu1 = sorted(
    results_bleu1,
    key=lambda item: item['score']['bleu'] if item['score'] else float('-inf'),
    reverse=True,
)
results_bleu4 = sorted(
    results_bleu4,
    key=lambda item: item['score']['bleu'] if item['score'] else float('-inf'),
    reverse=True,
)


In [11]:
# First ten BLEU values in results_bleu1 (the max_order=1 results).
[x['score']['bleu'] for x in results_bleu1][:10]

[0.7165313105737893,
 0.38095238095238093,
 0.37735849056603776,
 0.375,
 0.3563218390804598,
 0.35416129051928236,
 0.3448275862068966,
 0.3431521454126054,
 0.32786885245901637,
 0.32449307785569]

In [12]:
# First ten BLEU values in results_bleu4 (the max_order=4 results).
[x['score']['bleu'] for x in results_bleu4][:10]

[0.27178805978554255,
 0.18709539406264028,
 0.1264203703898735,
 0.11143093225027423,
 0.10026964123698473,
 0.08971721455931114,
 0.08263765971401953,
 0.0810371533925042,
 0.07520294647008956,
 0.07369081146140266]

In [13]:
# Load the BERTScore metric through the `evaluate` library.
bertscore = evaluate.load('bertscore')

In [None]:
# Compute BERTScore for all generation/reference pairs.
# The per-pair F1 values are read later from bert_res['f1'].
bert_res = bertscore.compute(predictions=generations, references=references, lang="en")


In [44]:
# Pair each BERTScore F1 with its generation and reference ...
bert_full_res = []
for f1, generated, reference in zip(bert_res['f1'], generations, references):
    bert_full_res.append({'bert_f1': f1, 'generated': generated, 'reference': reference})

# ... and copy the F1 onto the matching score record (same position in both lists).
for position, entry in enumerate(bert_full_res):
    generation_scores[position]['bertscore_f1'] = entry['bert_f1']

In [None]:
# Rank the BERTScore results by F1, best first, and display them.
# NOTE(review): this displays the entire list; consider a slice such as bert_sorted[:10].
bert_sorted = sorted(bert_full_res, key=lambda item: item['bert_f1'], reverse=True)
bert_sorted

In [3]:
%run func_declarations.ipynb

\begin{subfigure}{0.3\textwidth} 
    \includegraphics[width=\textwidth]{figs/student/student002_MLU_AoA.png} 
    \caption{figs/student/student002 MLU AoA} 
    \end{subfigure} 
    \hfill 
    \begin{subfigure}{0.3\textwidth} 
    \includegraphics[width=\textwidth]{figs/student/student002_TTR_utterances_words_scaffolding_proportion.png} 
    \caption{figs/student/student002 TTR utterances words scaffolding proportion} 
    \end{subfigure} 
    \hfill 
    \begin{subfigure}{0.3\textwidth} 
    \includegraphics[width=\textwidth]{figs/student/student002_WF.png} 
    \caption{figs/student/student002 WF} 
    \end{subfigure} 
    \hfill 
    \begin{subfigure}{0.3\textwidth} 
    \includegraphics[width=\textwidth]{figs/student/student003_MLU_AoA.png} 
    \caption{figs/student/student003 MLU AoA} 
    \end{subfigure} 
    \hfill 
    \begin{subfigure}{0.3\textwidth} 
    \includegraphics[width=\textwidth]{figs/student/student003_TTR_utterances_words_scaffolding_proportion.png} 
    \captio

  previous_utts = pd.concat(
  previous_utts = pd.concat(
  previous_utts = pd.concat(
  previous_utts = pd.concat(
  previous_utts = pd.concat(
  previous_utts = pd.concat(


### Vocabulary overlap (VO) of generated utterances

In [19]:
# Jaccard similarity between the token set of each generated utterance and the
# token set of the last utterance in its context.
# tokenize_and_preprocess is not defined in this notebook (presumably it comes from
# func_declarations.ipynb); the meaning of its second argument is not visible here.
for (generation, context) in zip(generation_scores, contexts):
    tokens = tokenize_and_preprocess(generation['text'], True)

    # Each utterance is followed by "<|endoftext|>", so the last split element is ''
    # and the last real utterance sits at [-2]. An empty context splits to [''] and
    # has no [-2]; treat it as "no previous tokens" instead of raising IndexError.
    utterances = context.split("<|endoftext|>")
    if len(utterances) >= 2:
        previous_tokens = tokenize_and_preprocess(utterances[-2], True)
    else:
        previous_tokens = []

    intersect = set(previous_tokens).intersection(set(tokens))
    union = set(previous_tokens).union(set(tokens))

    generation["overlap"] = len(intersect)/len(union) if union else 0 # jaccard


In [20]:
# Utterances of the first context. The final '' is produced by the "<|endoftext|>"
# that terminates the last utterance.
contexts[0].split("<|endoftext|>")

['So I know her because of Ihor. And now she does online sessions.',
 'ah, brilliant!',
 "Sometimes it's yoga, sometimes it's pilates. And I decide to join. Two times per week. I like it.",
 'I started doing yoga in Kyiv back in 2014. It was not an easy year as you remember and I really appreciated the relaxing effects of yoga. And I have been practicing ever since!',
 'And even bought a fitness mat, ball and top for sessions ).',
 'Do you practice at home?',
 'Do you practice yourself now? Yes, I practice at home usually. One time at the office )).',
 'I do, yes, I follow a couple of yoga youtube channels and do the sessions from there.',
 'Is it hard? Because I thought about more yoga or pilates.',
 '']

### Complexity match: mean length of utterance (MLU) of generated utterances

In [21]:
# MLU_match = token count of the generated utterance minus the mean token count of
# every second context utterance (positions 0, 2, 4, ...).
# NOTE(review): positions 0, 2, 4, ... are presumably the student turns; the parser
# drops speaker labels, so confirm that turns strictly alternate.
# NOTE(review): when a context holds an even number of utterances, the trailing ''
# left by the final "<|endoftext|>" is also selected by [::2] and enters the mean.
for record, dialogue in zip(generation_scores, contexts):
    generated_length = len(tokenize_and_preprocess(record['text'], False))

    every_other_turn = dialogue.split("<|endoftext|>")[::2]
    turn_lengths = [len(tokenize_and_preprocess(turn, False)) for turn in every_other_turn]

    record['MLU_match'] = generated_length - np.mean(turn_lengths)


In [22]:
# Inspect the first ten score records.
generation_scores[:10]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8},
 {'text': 'Great! Yoga practice has a lot of benefits. How about your listening and speaking skills? Would you like to practice these using a native English speaker?',
  'overlap': 0.047619047619047616,
  'MLU_match': 11.8},
 {'text': 'Yes, practice is another helpful practice to build up your English skills.',
  'overlap': 0.125,
  'MLU_match': 0.8000000000000007},
 {'text': "Oh, you might enjoy this video, but don't try the poses at home, especially not with those IKEA chairs!).",
  'overlap': 0.0625,
  'MLU_match': 10.2},
 {'text': "Correct! You got it! 'Have been married' means they were married in the past and are not married now. 'Were married' is the same meaning. Great job! You're learni

In [4]:
# Load the n-gram coefficient list from JSON.
# The context manager closes the file handle, which the bare open() call never did.
with open("ngrams_scaff.txt", "r") as ngram_file:
    ngrams_coeff = json.load(ngram_file)

In [24]:
# First five entries; each entry is a [coefficient, n-gram text] pair.
ngrams_coeff[:5]

[[1.7496312271859153, 'the snow'],
 [1.1564406480866836, 'is called'],
 [0.5455720135187181, 'the meaning'],
 [0.5268666293446779, 'the people'],
 [0.5088000832813544, 'that is']]

In [5]:
# Split the [coefficient, text] entries into two index-aligned lists.
ngram_texts = []
ngram_coeffs = []
for entry in ngrams_coeff:
    ngram_coeffs.append(entry[0])
    ngram_texts.append(entry[1])

In [26]:
# n-gram score: sum of the coefficients of every bigram and trigram of the generated
# utterance that appears in the coefficient list (repeated n-grams count each time).

# Text -> coefficient lookup built once. setdefault keeps the FIRST coefficient for a
# repeated text, matching what list.index() returned, while replacing two linear list
# scans per n-gram with a constant-time dict lookup.
ngram_coeff_by_text = {}
for ngram_text, ngram_coeff in zip(ngram_texts, ngram_coeffs):
    ngram_coeff_by_text.setdefault(ngram_text, ngram_coeff)

for generation in generation_scores:
    tokens = tokenize_and_preprocess(generation['text'], False)
    ngram_list = list(nltk.ngrams(tokens, 2)) + list(nltk.ngrams(tokens, 3))

    ngram_score = 0
    for ngram in ngram_list:
        ngram_key = ' '.join(ngram)
        if ngram_key in ngram_coeff_by_text:
            ngram_score += ngram_coeff_by_text[ngram_key]
    generation['ngram_score'] = ngram_score

In [33]:
# Inspect the first five score records.
generation_scores[:5]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8,
  'ngram_score': 0.413582684192269},
 {'text': 'Great! Yoga practice has a lot of benefits. How about your listening and speaking skills? Would you like to practice these using a native English speaker?',
  'overlap': 0.047619047619047616,
  'MLU_match': 11.8,
  'ngram_score': -0.23984571315131742},
 {'text': 'Yes, practice is another helpful practice to build up your English skills.',
  'overlap': 0.125,
  'MLU_match': 0.8000000000000007,
  'ngram_score': -0.008942568029654659},
 {'text': "Oh, you might enjoy this video, but don't try the poses at home, especially not with those IKEA chairs!).",
  'overlap': 0.0625,
  'MLU_match': 10.2,
  'ngram_score': -0.3470567451400328},
 {'text': "Correct! 

In [48]:
# Inspect the first five score records.
generation_scores[:5]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8,
  'ngram_score': 0.413582684192269,
  'rpt': 0.08346192000000001,
  'bleu1': 0.15686274509803924,
  'bleu4': 0.0,
  'bertscore_f1': 0.8302353024482727},
 {'text': 'Great! Yoga practice has a lot of benefits. How about your listening and speaking skills? Would you like to practice these using a native English speaker?',
  'overlap': 0.047619047619047616,
  'MLU_match': 11.8,
  'ngram_score': -0.23984571315131742,
  'rpt': 0.1160505,
  'bleu1': 0.10000000000000002,
  'bleu4': 0.0,
  'bertscore_f1': 0.8552031517028809},
 {'text': 'Yes, practice is another helpful practice to build up your English skills.',
  'overlap': 0.125,
  'MLU_match': 0.8000000000000007,
  'ngram_score': -0.008942568029654659,

In [49]:
# Store the reference and the dialogue context on each score record
# (the three lists are aligned by position).
for position, record in enumerate(generation_scores):
    record.update(reference=references[position], context=contexts[position])

In [50]:
# Inspect the first two score records.
generation_scores[:2]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8,
  'ngram_score': 0.413582684192269,
  'rpt': 0.08346192000000001,
  'bleu1': 0.15686274509803924,
  'bleu4': 0.0,
  'bertscore_f1': 0.8302353024482727,
  'reference': "Just to note, practice as a verb is an American spelling, the British one is practise. I should have used 'practise', sorry!",
  'context': "So I know her because of Ihor. And now she does online sessions.<|endoftext|>ah, brilliant!<|endoftext|>Sometimes it's yoga, sometimes it's pilates. And I decide to join. Two times per week. I like it.<|endoftext|>I started doing yoga in Kyiv back in 2014. It was not an easy year as you remember and I really appreciated the relaxing effects of yoga. And I have been practicing ever since!<|endo

In [57]:
# Rank the score records by n-gram score, highest first, and show the top ten.
scores_sorted_by_ngram = sorted(generation_scores, key=lambda x: x['ngram_score'], reverse=True)
scores_sorted_by_ngram[:10]

[{'text': "oh, see if you can infer the meaning from the context clues! 'the meaning is like 'ignore'/ 'fail to notice/recognize' - does that help?",
  'overlap': 0.07142857142857142,
  'MLU_match': 19.4,
  'ngram_score': 3.0327525160457287,
  'rpt': 0.060701780000000004,
  'bleu1': 0.13333333333333333,
  'bleu4': 0.0,
  'bertscore_f1': 0.8421195149421692,
  'reference': "oversee = supervise and overgrown makes no sense here so...'overlook' is the one you want!",
  'context': "what they are like.<|endoftext|>good correction! a hell of a lot actually. <STUDENT> would you like to have a quick go at e.g. page 147 from your book before we finish!? <STUDENT> (sorry).<|endoftext|>Sure.<|endoftext|>ok...<|endoftext|>1d.<|endoftext|>yes.<|endoftext|>no idea about number 2 but I would say A.<|endoftext|>so the meaning is like 'ignore'/ 'fail to notice/recognise' - does that help?<|endoftext|>I don't know the meaning of the words.<|endoftext|>"},
 {'text': "That's right! In UK parliamentary jarg

In [24]:
# Compute overlap, MLU_match and n-gram score for the reference utterances, with the
# same formulas that are applied to the generated utterances.
# tokenize_and_preprocess is not defined in this notebook (presumably it comes from
# func_declarations.ipynb); the meaning of its second argument is not visible here.

# Text -> coefficient lookup built once. setdefault keeps the FIRST coefficient for a
# repeated text, matching what list.index() returned, while replacing two linear list
# scans per n-gram with a constant-time dict lookup.
ngram_coeff_by_text = {}
for ngram_text, ngram_coeff in zip(ngram_texts, ngram_coeffs):
    ngram_coeff_by_text.setdefault(ngram_text, ngram_coeff)

reference_scores = [{'text': ref} for ref in references]
for ref_score, context in zip(reference_scores, contexts):
    ref = ref_score['text']
    utterances = context.split("<|endoftext|>")

    # --- overlap: Jaccard similarity with the last utterance of the context ---
    tokens_vo = tokenize_and_preprocess(ref, True)
    # The last split element is '' (every utterance is followed by the delimiter), so
    # the last real utterance is at [-2]. An empty context has no [-2]; treat it as
    # "no previous tokens" instead of raising IndexError.
    if len(utterances) >= 2:
        previous_tokens_vo = tokenize_and_preprocess(utterances[-2], True)
    else:
        previous_tokens_vo = []

    intersect = set(previous_tokens_vo).intersection(set(tokens_vo))
    union = set(previous_tokens_vo).union(set(tokens_vo))

    ref_score['overlap'] = len(intersect)/len(union) if union else 0 # jaccard

    # --- MLU_match: reference length minus mean length of every second utterance ---
    # NOTE(review): [::2] presumably selects the student turns; it also selects the
    # trailing '' when the context holds an even number of utterances.
    tokens = tokenize_and_preprocess(ref, False)
    student_tokens = [tokenize_and_preprocess(c, False) for c in utterances[::2]]

    MLU_gen = len(tokens)
    MLU_student = np.mean([len(utt) for utt in student_tokens])

    ref_score['MLU_match'] = MLU_gen-MLU_student

    # --- n-gram score: summed coefficients of the known bigrams and trigrams ---
    ngram_list = list(nltk.ngrams(tokens, 2)) + list(nltk.ngrams(tokens, 3))
    ngram_score = 0
    for ngram in ngram_list:
        ngram_key = ' '.join(ngram)
        if ngram_key in ngram_coeff_by_text:
            ngram_score += ngram_coeff_by_text[ngram_key]
    ref_score['ngram_score'] = ngram_score

    ref_score['bleu1'] = 1  # they ARE the references
    ref_score['bleu4'] = 1
    ref_score['bertscore_f1'] = 1
    


In [26]:
# Inspect the first four reference score records.
reference_scores[:4]

[{'text': "Just to note, practice as a verb is an American spelling, the British one is practise. I should have used 'practise', sorry!",
  'overlap': 0.0,
  'MLU_match': 7.800000000000001,
  'ngram_score': 0.46713450189846545,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1},
 {'text': "And the noun is practice. as in 'regular yoga practice is wonderful for you'.",
  'overlap': 0.08333333333333333,
  'MLU_match': -0.1999999999999993,
  'ngram_score': -0.02643475191698528,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1},
 {'text': 'I like that I can choose easier or harder sessions. It depends whether you want to spell it in a British or American way)).',
  'overlap': 0.0,
  'MLU_match': 12.8,
  'ngram_score': 0.35095014965583327,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1},
 {'text': 'No! Tell me. Both are good, did you see and did you watch. and Have you seen? would be even better.',
  'overlap': 0.15384615384615385,
  'MLU_match': 11.2,
  'ngram_score': -0.31248915335754,
  'b

# Evaluating the base prompt now

In [6]:
# Parse the base-prompt output files into three parallel lists:
#   references_base  - text after "ORIGINAL SCAFFOLDING:"
#   generations_base - text after "GENERATED SCAFFOLDING:" plus any continuation lines
#   contexts_base    - STUDENT:/TEACHER: utterances, each followed by "<|endoftext|>"
# A line containing a run of '=' characters closes the current record.
generations_base = []
references_base = []
contexts_base = []

for filename in ['base_prompt_model_out_1.txt', 'base_prompt_model_out_correct_2.txt', 'base_prompt_model_out_correct_3.txt']:
    with open(filename, 'r', encoding="utf8") as f:
        context = ""
        gen = ""
        in_generation = False  # True while collecting a (possibly multi-line) generated scaffolding
        for line in f:
            at_separator = '=====================' in line

            if in_generation:
                if at_separator:
                    # End of the record: store it and reset the accumulators.
                    contexts_base.append(context)
                    context = ""
                    generations_base.append(gen)
                    gen = ""
                    in_generation = False
                elif line.strip():
                    # Non-blank continuation line of the generated text.
                    gen += ' ' + line.strip()
                continue

            words = line.split()
            if words[:2] == ['ORIGINAL', 'SCAFFOLDING:']:
                references_base.append(' '.join(words[2:]))
            elif words[:2] == ['GENERATED', 'SCAFFOLDING:']:
                gen += ' '.join(words[2:])
                in_generation = True
            elif words[:1] in (['STUDENT:'], ['TEACHER:']):
                # The speaker label is dropped; only the utterance text is kept.
                context += ' '.join(words[1:]) + "<|endoftext|>"
            elif at_separator:
                # Separator reached without a generated line: still close the record.
                contexts_base.append(context)
                context = ""
                generations_base.append(gen)
                gen = ""

In [7]:
# Load the BLEU metric through the `evaluate` library.
bleu = evaluate.load("bleu")

In [8]:
# Load BERTScore and score every base-prompt generation against its reference.
# The per-pair F1 values are read later from bert_res_base['f1'].
bertscore = evaluate.load('bertscore')
bert_res_base = bertscore.compute(predictions=generations_base, references=references_base, lang="en")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Compute all metrics for the base-prompt generations: BERTScore F1 (already
# computed), overlap, MLU_match, n-gram score, BLEU-1 and BLEU-4.
# tokenize_and_preprocess is not defined in this notebook (presumably it comes from
# func_declarations.ipynb); the meaning of its second argument is not visible here.
generations_base_scores = [{'text': gen} for gen in generations_base]

bert_full_res_base = [{'bert_f1': res, 'generated': gen, 'reference': ref} for res, gen, ref in zip(bert_res_base['f1'], generations_base, references_base)]
for i, res in enumerate(bert_full_res_base):
    generations_base_scores[i]['bertscore_f1'] = res['bert_f1']

# Text -> coefficient lookup built once. setdefault keeps the FIRST coefficient for a
# repeated text, matching what list.index() returned, while replacing two linear list
# scans per n-gram with a constant-time dict lookup.
ngram_coeff_by_text = {}
for ngram_text, ngram_coeff in zip(ngram_texts, ngram_coeffs):
    ngram_coeff_by_text.setdefault(ngram_text, ngram_coeff)

# loop for everything else

for gen_base_score, ref, context in zip(generations_base_scores, references_base, contexts_base):
    gen = gen_base_score['text']
    utterances = context.split("<|endoftext|>")

    # --- overlap: Jaccard similarity with the last utterance of the context ---
    tokens_vo = tokenize_and_preprocess(gen, True)
    # The last split element is '' (every utterance is followed by the delimiter), so
    # the last real utterance is at [-2]. An empty context has no [-2]; treat it as
    # "no previous tokens" instead of raising IndexError.
    if len(utterances) >= 2:
        previous_tokens_vo = tokenize_and_preprocess(utterances[-2], True)
    else:
        previous_tokens_vo = []

    intersect = set(previous_tokens_vo).intersection(set(tokens_vo))
    union = set(previous_tokens_vo).union(set(tokens_vo))

    gen_base_score['overlap'] = len(intersect)/len(union) if union else 0 # jaccard

    # --- MLU_match: generation length minus mean length of every second utterance ---
    # NOTE(review): [::2] presumably selects the student turns; it also selects the
    # trailing '' when the context holds an even number of utterances.
    tokens = tokenize_and_preprocess(gen, False)
    student_tokens = [tokenize_and_preprocess(c, False) for c in utterances[::2]]

    MLU_gen = len(tokens)
    MLU_student = np.mean([len(utt) for utt in student_tokens])

    gen_base_score['MLU_match'] = MLU_gen-MLU_student

    # --- n-gram score: summed coefficients of the known bigrams and trigrams ---
    ngram_list = list(nltk.ngrams(tokens, 2)) + list(nltk.ngrams(tokens, 3))
    ngram_score = 0
    for ngram in ngram_list:
        ngram_key = ' '.join(ngram)
        if ngram_key in ngram_coeff_by_text:
            ngram_score += ngram_coeff_by_text[ngram_key]
    gen_base_score['ngram_score'] = ngram_score

    # --- BLEU: pairs with an empty generation or reference are not scored; they get 0 ---
    if gen and ref:
        gen_base_score['bleu1'] = bleu.compute(predictions=[gen], references=[ref], max_order=1)['bleu']
        gen_base_score['bleu4'] = bleu.compute(predictions=[gen], references=[ref], max_order=4)['bleu']
    else:
        gen_base_score['bleu1'] = 0
        gen_base_score['bleu4'] = 0

In [73]:
# Empty results table with one column per metric; rows are added when the columns
# are assigned from dicts keyed by condition name.
# NOTE(review): the recorded output also lists an 'rpt' column that this cell does
# not create - the saved output appears to come from a different version of the cell.
all_results = pd.DataFrame(columns=['bleu-1', 'bleu-4', 'bertscore', 'overlap', 'matching', 'ngram_score'])
all_results

Unnamed: 0,bleu-1,bleu-4,bertscore,rpt,overlap,matching,ngram_score


In [74]:
# Fill the results table: one column per metric, one row per condition
# ("base", "complex", "ref"). Values are means over all examples, rounded to 4 decimals.

# (table column, key in the per-example score dicts, report a mean for the references?)
# The reference-based metrics (BLEU, BERTScore) have no "ref" value and get None.
metric_specs = [
    ('bleu-1', 'bleu1', False),
    ('bleu-4', 'bleu4', False),
    ('bertscore', 'bertscore_f1', False),
    ('overlap', 'overlap', True),
    ('matching', 'MLU_match', True),
    ('ngram_score', 'ngram_score', True),
]


def _mean_score(score_dicts, key):
    """Return the mean of `key` over `score_dicts`, rounded to 4 decimals."""
    return round(np.mean([score[key] for score in score_dicts]), 4)


for column, score_key, report_reference in metric_specs:
    all_results[column] = {
        "base": _mean_score(generations_base_scores, score_key),
        "complex": _mean_score(generation_scores, score_key),
        "ref": _mean_score(reference_scores, score_key) if report_reference else None,
    }

In [75]:
# Display the filled results table.
all_results

Unnamed: 0,bleu-1,bleu-4,bertscore,rpt,overlap,matching,ngram_score
base,0.1313,0.0045,0.8328,0.088,0.0653,35.1402,0.2009
complex,0.1247,0.0025,0.8369,0.088,0.0661,20.8547,0.1313
ref,,,,0.088,0.0523,16.2331,0.0632


In [1]:
# Print the transposed results table as LaTeX (4 decimal places, "-" for missing values).
# NOTE(review): this needs `all_results` to exist in the kernel. The recorded NameError
# has execution count 1, so the cell was apparently run right after a kernel restart.
print(all_results.T.to_latex(float_format="%.4f", na_rep="-"))

NameError: name 'all_results' is not defined

In [77]:
# Inspect the first five score records.
generation_scores[:5]

[{'text': "Sure, it comes with practice! Don't give up. It does take a few tries to get comfortable with the poses and breathing techniques. Try to do 15 minutes per day. You'll get there with persistence and before you know it, you'll see the benefits!",
  'overlap': 0.0,
  'MLU_match': 31.8,
  'ngram_score': 0.413582684192269,
  'rpt': 0.08346192000000001,
  'bleu1': 0.15686274509803924,
  'bleu4': 0.0,
  'bertscore_f1': 0.8302353024482727,
  'reference': "Just to note, practice as a verb is an American spelling, the British one is practise. I should have used 'practise', sorry!",
  'context': "So I know her because of Ihor. And now she does online sessions.<|endoftext|>ah, brilliant!<|endoftext|>Sometimes it's yoga, sometimes it's pilates. And I decide to join. Two times per week. I like it.<|endoftext|>I started doing yoga in Kyiv back in 2014. It was not an easy year as you remember and I really appreciated the relaxing effects of yoga. And I have been practicing ever since!<|endo

In [78]:
# Inspect the first five reference score records.
reference_scores[:5]

[{'text': "Just to note, practice as a verb is an American spelling, the British one is practise. I should have used 'practise', sorry!",
  'overlap': 0.0,
  'MLU_match': 7.800000000000001,
  'ngram_score': 0.46713450189846545,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1,
  'rpt': 0.08346192000000001},
 {'text': "And the noun is practice. as in 'regular yoga practice is wonderful for you'.",
  'overlap': 0.08333333333333333,
  'MLU_match': -0.1999999999999993,
  'ngram_score': -0.02643475191698528,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1,
  'rpt': 0.1160505},
 {'text': 'I like that I can choose easier or harder sessions. It depends whether you want to spell it in a British or American way)).',
  'overlap': 0.0,
  'MLU_match': 12.8,
  'ngram_score': 0.35095014965583327,
  'bleu1': 1,
  'bleu4': 1,
  'bertscore_f1': 1,
  'rpt': 0.08435975000000001},
 {'text': 'No! Tell me. Both are good, did you see and did you watch. and Have you seen? would be even better.',
  'overlap': 0.