In [None]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
pip install pandas



In [None]:
from rouge import Rouge 
from rouge import FilesRouge
from matplotlib.pyplot import figure

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import textwrap

import plotly.graph_objs as go
from ipywidgets import Output, VBox


In [None]:
def compute_rouge(generated, comparison):
  """Score `generated` against `comparison` using rouge's FilesRouge.

  Both arguments are forwarded unchanged, in this order, to
  ``FilesRouge.get_scores``. Presumably they are paths to line-aligned
  hypothesis and reference text files -- TODO confirm against the rouge docs.

  Returns whatever ``FilesRouge.get_scores`` returns.
  """
  return FilesRouge().get_scores(generated, comparison)

def get_rouge(scores, ngram, metric):
  """Pull one metric out of every entry of a list of ROUGE score dicts.

  Args:
    scores: iterable of nested mappings indexed as ``entry[ngram][metric]``.
    ngram: outer key, e.g. 'rouge-1'.
    metric: inner key, e.g. 'r'.

  Returns:
    A list with ``entry[ngram][metric]`` for each entry, in input order.
  """
  return [entry[ngram][metric] for entry in scores]

def write_scores_to_file(filename, scores, model):
  """Write `model` as a header line, then one score per line, to `filename`.

  The file is opened in 'w' mode, so any existing content is overwritten.
  Each item of `scores` is written using its default string formatting.
  """
  with open(filename, 'w') as out_file:
    out_file.write(f"{model}\n")
    out_file.writelines(f"{score}\n" for score in scores)

# Print the original prompt-completion pair stored at a given row label of a data frame.
def get_prompt_completion(df, index):
  """Print the 'prompt' and 'completion' values of the row labelled `index`.

  Args:
    df: DataFrame with 'prompt' and 'completion' columns.
    index: row label (looked up with ``.loc``, i.e. by label, not position).

  Returns:
    None; the pair is only printed.
  """
  prompt = df.loc[index, 'prompt']
  completion = df.loc[index, 'completion']
  # f-strings (rather than "..." + value) so a missing or non-string cell
  # is printed instead of raising TypeError.
  print(f"Prompt: {prompt}")
  print(f"Completion: {completion}")

# Test Set

## Original Prompt-Completion

### ROUGE Calculations

In [None]:
# Baseline for the test set: ROUGE-1 recall between each original prompt
# (interviewee statement) and its original completion (interviewer question).
# These scores are the reference point when evaluating model-generated questions.
rouge = Rouge()
df = pd.read_csv('prompt-completion-testset.csv')

# One score dict per prompt/completion pair; keep only the 1-gram recall.
original_scores = rouge.get_scores(df['prompt'], df['completion'])
original_scores_recall = get_rouge(original_scores, 'rouge-1', 'r')

# Attach the recall scores plus whitespace-token word counts to the test-set frame.
df = df.assign(**{
    'rouge1-recall': original_scores_recall,
    'prompt_word_count': df['prompt'].str.strip().str.split().str.len(),
    'completion_word_count': df['completion'].str.strip().str.split().str.len(),
})
df.head()

Unnamed: 0,prompt,completion,rouge1-recall,prompt_word_count,completion_word_count
0,"Yes. Yeah, actually, whenever I get together w...",Do you believe--and you talked about this brie...,0.219512,114,55
1,"To tell you the truth, I started to cry. I was...",You said there were financial repercussions. W...,0.25,93,9
2,"Right. Which, of course, supposes that these a...",What is the criticism? What is the fear coming...,0.454545,78,15
3,"To some extent, China is more able than the Un...",You've been working on this issue in China for...,0.333333,78,12
4,There are so many incredible writers out there...,How in the world did you decide which tweets t...,0.230769,39,14


In [None]:
#### PLOTLY EXPRESS ####

# Plotly hover labels are rendered as HTML, so long transcript text is folded
# into '<br>'-separated lines (textwrap's default width) to keep hovers readable.
# Note: this overwrites the 'prompt' and 'completion' columns of df in place.
for text_col in ('prompt', 'completion'):
  df[text_col] = df[text_col].map(lambda text: '<br>'.join(textwrap.wrap(text)))

# ROUGE-1 recall (original prompt vs original interviewer completion) against prompt length.
fig = px.scatter(
    df,
    x='prompt_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_name='completion',
    hover_data=['prompt_word_count', 'rouge1-recall', df.index],
    title="[Test Set] Overlap between original prompt & interviewer completion vs Length of Prompt",
)
fig.show()

  import pandas.util.testing as tm


In [None]:
#### PLOTLY EXPRESS ####

# 'completion' may already carry '<br>' breaks from an earlier hover-wrapping pass.
# Wrapping such text a second time breaks lines at both the old and the new
# positions, so only wrap entries that have not been wrapped yet. This also
# makes the cell safe to re-run.
df['completion'] = df['completion'].apply(
    lambda t: t if '<br>' in t else '<br>'.join(textwrap.wrap(t))
)

# ROUGE-1 recall (original prompt vs original interviewer completion) against completion length.
fig = px.scatter(
    df,
    x='completion_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_name='completion',
    hover_data=['prompt_word_count', 'completion_word_count', 'rouge1-recall', df.index],
    title="[Test Set] Overlap between original prompt & interviewer completion vs Length of Completion",
)
fig.show()

In [None]:
# Row 80 -- the original note for this cell reads "word count 407".
# The score is printed explicitly: a bare expression is only displayed when it
# is the last statement of a cell, so it was silently discarded before.
print(df.loc[80, 'rouge1-recall'])

get_prompt_completion(df, 80)

Prompt: Twenty years ago, when I was a young professor about to teach a course<br>on African-American fiction, I set about to find a forgotten or<br>undiscovered classic by a woman writer. I wanted a book that would<br>hold its own against such urban classics as "Invisible Man" or "Native<br>Son," an older book that would complement the newer works by Toni<br>Morrison and Alice Walker or the recently republished novels of Zora<br>Neale Hurston. What I discovered was Ann Petry's magnificent 1946<br>novel "The Street". Described by some as an urban "To Kill a<br>Mockingbird," minus any redemption and hope, "The Street" tells the<br>story of Lutie Johnson and her 8-year-old son during the last years of<br>the Second World War. Lutie is a young, hard-working single mother in<br>urban America trying to get ahead in a world that ignores and exploits<br>her. I saw her struggles and determination as both inspiring and<br>doomed. In striving to provide for her son's future, she often ignored<br

### Manual Scoring Investigation

In [None]:
# Hand-assigned quality scores for the test-set pairs, aligned to df by row index
# (left join: every row of df is kept).
df_manual_scoring = pd.read_csv("manual_scoring.csv")
df_manuals = df.join(df_manual_scoring, how='left')
df_manuals.head()

Unnamed: 0,prompt,completion,rouge1-recall,prompt_word_count,completion_word_count,Appropriateness,Specificity,Repeat,New Concepts,Relevance
0,"Yes. Yeah, actually, whenever I get together w...",Do you believe--and you talked about this brie...,0.219512,114,55,7,10,2,4,8
1,"To tell you the truth, I started to cry. I was...",You said there were financial repercussions. W...,0.25,93,9,10,10,0,0,10
2,"Right. Which, of course, supposes that these a...",What is the criticism? What is the fear coming...,0.454545,78,15,10,10,2,5,10
3,"To some extent, China is more able than the Un...",You've been working on this issue in China for...,0.333333,78,12,8,10,2,8,7
4,There are so many incredible writers out there...,How in the world did you decide which tweets t...,0.230769,39,14,8,10,0,10,8


In [None]:
# Call head() -- the bare attribute only displays the bound-method repr (which dumps the whole frame).
df.head()

<bound method NDFrame.head of                                                prompt  \
0   Yes. Yeah, actually, whenever I get together w...   
1   To tell you the truth, I started to cry. I was...   
2   Right. Which, of course, supposes that these a...   
3   To some extent, China is more able than the Un...   
4   There are so many incredible writers out there...   
..                                                ...   
95  That's exactly right. There's deep fear about ...   
96  Basically, she prayed for the university. She ...   
97                                         Thank you.   
98                                      Good, thanks.   
99  Well, I'm fine, of course, it's an awkward pos...   

                                           completion  rouge1-recall  \
0   Do you believe--and you talked about this brie...       0.219512   
1   You said there were financial repercussions. W...       0.250000   
2   What is the criticism? What is the fear coming...       0.454545 

In [None]:
# Manual "Repeat" score against prompt length for the hand-scored test-set pairs.
fig = px.scatter(
    df_manuals,
    x='prompt_word_count',
    y='Repeat',
    trendline="ols",
    hover_name='completion',
    # Use the index of the frame actually being plotted.
    hover_data=['prompt_word_count', 'rouge1-recall', df_manuals.index],
    title="[Test Set] Manual Repeat Scores by Prompt Length",
)
fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Manual-scoring metrics in the order they fill the 3x2 grid (row-major);
# the sixth cell of the grid stays empty.
manual_metrics = ("Specificity", "New Concepts", "Relevance", "Appropriateness", "Repeat")
n_cols = 2

# Strip stray whitespace from the column headers so the lookup works whether the
# CSV header is "New Concepts" or "New Concepts " (trailing space).
manual_scores = df_manuals.rename(
    columns=lambda name: name.strip() if isinstance(name, str) else name
)

fig = make_subplots(rows=3, cols=n_cols, start_cell="top-left",
                    subplot_titles=tuple(f"{metric} by Prompt Length" for metric in manual_metrics))

for position, metric in enumerate(manual_metrics):
  # Convert the running position into 1-based (row, col) grid coordinates.
  row, col = divmod(position, n_cols)
  row, col = row + 1, col + 1

  fig.add_trace(
      go.Scatter(x=manual_scores["prompt_word_count"], y=manual_scores[metric],
                 mode='markers', name=metric),
      row=row, col=col)
  fig.update_xaxes(title_text="Prompt Length", row=row, col=col)
  fig.update_yaxes(title_text=metric, row=row, col=col)

fig.update_layout(width=1500, height=900)

fig.show()

## OpenAI Prompt-Completion

## BART Prompt-Completion

# Validation Set

In [None]:
# Read the validation set from valid_question.json (JSON Lines: one record per line).
df_validation = pd.read_json('valid_question.json', lines=True)
# Call head() -- the bare attribute only displays the bound-method repr.
df_validation.head()

<bound method NDFrame.head of                                                   prompt  \
0       Not in the next couple of days. The latest fo...   
1       Yes, it is. After the January attacks, there ...   
2       Very chaos. Everybody's moving, and they all ...   
3       Well, thank you very much, Melissa, for invit...   
4       Right. And that's actually one of the reasons...   
...                                                  ...   
14995   Yeah. And the investigation that was complete...   
14996   Well, Algeria, partly because of the civil wa...   
14997   You're welcome, Rachel. So this is in Stockho...   
14998   I think U.S.-Russia relations continue to be ...   
14999   Right. So we didn't formally endorse. What we...   

                                              completion  
0       So I understand, Joe, that very shortly you a...  
1       So at this point, are French security officia...  
2                Are people stopping in to buy supplies?  
3       Dr. M

In [None]:
# ROUGE scores for every original prompt/completion pair of the validation set,
# reduced to the 1-gram recall of each pair.
rouge = Rouge()
validation_scores = rouge.get_scores(
    df_validation["prompt"],
    df_validation["completion"],
)
validation_scores_recall = [pair_scores['rouge-1']['r'] for pair_scores in validation_scores]


In [None]:
# Add ROUGE-1 recall scores and whitespace-token word counts to the validation set dataframe.
df_validation['rouge1-recall'] = validation_scores_recall
df_validation['prompt_word_count'] = df_validation['prompt'].str.strip().str.split().str.len()
df_validation['completion_word_count'] = df_validation['completion'].str.strip().str.split().str.len()
# Call head() -- the bare attribute only displays the bound-method repr.
df_validation.head()

<bound method NDFrame.head of                                                   prompt  \
0       Not in the next couple of days. The latest fo...   
1       Yes, it is. After the January attacks, there ...   
2       Very chaos. Everybody's moving, and they all ...   
3       Well, thank you very much, Melissa, for invit...   
4       Right. And that's actually one of the reasons...   
...                                                  ...   
14995   Yeah. And the investigation that was complete...   
14996   Well, Algeria, partly because of the civil wa...   
14997   You're welcome, Rachel. So this is in Stockho...   
14998   I think U.S.-Russia relations continue to be ...   
14999   Right. So we didn't formally endorse. What we...   

                                              completion  rouge1-recall  \
0       So I understand, Joe, that very shortly you a...       0.250000   
1       So at this point, are French security officia...       0.277778   
2                Are peo

In [None]:

#### PLOTLY EXPRESS ####

# ROUGE-1 recall (original prompt vs original interviewer completion) against prompt
# length, for the whole validation set. No hover_name/text wrapping is applied here.
fig = px.scatter(
    df_validation,
    x='prompt_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_data=['prompt_word_count', 'completion_word_count', 'rouge1-recall', df_validation.index],
    title="[Validation Set] Recall (Overlap) between original prompt & interviewer completion vs Length of Prompt",
)
fig.show()


In [None]:
# ROUGE-1 recall (original prompt vs original interviewer completion) against
# completion length, for the whole validation set.
fig = px.scatter(
    df_validation,
    x='completion_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_data=['prompt_word_count', 'completion_word_count', 'rouge1-recall', df_validation.index],
    title="[Validation Set] Overlap between original prompt & interviewer completion vs Length of Completion",
)
fig.show()

In [None]:
# Print the prompt/completion pair stored at validation row 3920.
get_prompt_completion(df=df_validation, index=3920)

Prompt:  Well, as I said, David, we're hearing a lot of testimony from women saying, here is something that happened to me. And they're leaving it at that. And there have been interesting stories from women saying, I'm not going to name this person. This happened a long time ago. I've thought about it. It's not going to do any good to disrupt his life or mine. OK, so that's one set of issues. And that is fairly self-contained. But as we saw with the Kavanaugh hearings, when someone is accused, we need some way to weigh the two sides because sometimes we have men - we've seen in the #MeToo movement - saying, as with Louis C.K., yep. What these women said is right. I did it. Oftentimes we don't. And I think we can draw some concerning lessons from campus, where over the past six years or so there's been kind of a revolution in Title IX. This is the federal law that governs - that means no sex discrimination on campus. And under this has been an effort to - in an effort to eliminate sexua

In [None]:
# Highest ROUGE-1 recall in the validation set. Stored under its own name so the
# builtin max() is not shadowed for the rest of the kernel session.
max_recall = df_validation['rouge1-recall'].max()
print(max_recall)

1.0


In [None]:
# High-overlap subset of the validation set: recall strictly above 0.5,
# prompts under 100 words and completions under 40 words. Saved to CSV.
df_highROUGE = df_validation.loc[
    df_validation['rouge1-recall'].gt(0.5)
    & df_validation['prompt_word_count'].lt(100)
    & df_validation['completion_word_count'].lt(40)
]
df_highROUGE.to_csv("validation_highROUGE.csv")
df_highROUGE.head()

Unnamed: 0,prompt,completion,rouge1-recall,prompt_word_count,completion_word_count
7,"Oh no, no, not at all. This is - oh, it's cra...",You don't see yourself as a classical musician?,0.625,64,8
100,IndyCar had already looked ahead to safety im...,So if the crash on Sunday had nothing to do w...,0.517241,68,33
215,They have tried to shoot me like 60 bullets a...,You were sure that that was ISIS in that case?,0.625,40,10
249,"Well, it's hard not to think about them. We h...",What are some of those little things that the...,0.538462,53,13
394,"I don't think anything really new for us, alt...",What were they saying about her appearance?,0.571429,87,7


In [None]:
# Same recall-vs-prompt-length scatter, restricted to the high-overlap subset (df_highROUGE).
fig = px.scatter(
    df_highROUGE,
    x='prompt_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_data=['prompt_word_count', 'completion_word_count', 'rouge1-recall', df_highROUGE.index],
    title="[Validation Set] Recall (Overlap) between original prompt & interviewer completion vs Length of Prompt",
)
fig.show()


In [None]:
# Same recall-vs-completion-length scatter, restricted to the high-overlap subset (df_highROUGE).
fig = px.scatter(
    df_highROUGE,
    x='completion_word_count',
    y='rouge1-recall',
    trendline="ols",
    hover_data=['prompt_word_count', 'completion_word_count', 'rouge1-recall', df_highROUGE.index],
    title="[Validation Set] Overlap between original prompt & interviewer completion vs Length of Completion",
)
fig.show()

In [None]:
# Low-overlap subset of the validation set: recall strictly below 0.5,
# prompts under 100 words and completions under 40 words. Saved to CSV.
# NOTE(review): rows with recall exactly 0.5 fall in neither this subset nor the
# "> 0.5" one -- confirm that is intended.
df_lowROUGE = df_validation.loc[
    df_validation['rouge1-recall'].lt(0.5)
    & df_validation['prompt_word_count'].lt(100)
    & df_validation['completion_word_count'].lt(40)
]
df_lowROUGE.to_csv("validation_lowROUGE.csv")
df_lowROUGE.head()

Unnamed: 0,prompt,completion,rouge1-recall,prompt_word_count,completion_word_count
0,Not in the next couple of days. The latest fo...,"So I understand, Joe, that very shortly you a...",0.25,46,21
2,"Very chaos. Everybody's moving, and they all ...",Are people stopping in to buy supplies?,0.142857,17,7
4,Right. And that's actually one of the reasons...,"And if you go to Yahoo! 's home site, you'll ...",0.285714,92,22
5,Yeah.,"I mean, does that show that the leaders of th...",0.0,1,17
6,The legislation authorizes the president to u...,You have tried to get a bill like this passed...,0.225806,73,35
