# Detecting Repetitive Speech

**TODO**:
- Generator + LLM critic experiment

In [0]:
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport detectors.repetitive_speech.unigram_analysis

In [0]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import re
from pprint import pprint
# Evaluation
from utils import evaluate
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy.stats import ttest_ind, mannwhitneyu
# Generative AI
from openai import OpenAI
from utils import llm_call
import mlflow
from mlflow.genai.scorers import Safety, scorer
from mlflow.entities import Feedback

## Load the Data

In [0]:
from data.adress import load_transcripts

In [0]:
adress_trans = load_transcripts()
adress_trans = adress_trans[["Speaker", "Transcript", "Transcript_clean", "Repetition", "Revision"]]

# Extract annotation based count of repetitions and revisions
adress_trans["num_repetitions"] = adress_trans["Transcript"].apply(lambda x: len(re.findall(r"\[/\]", x)))
adress_trans["num_revisions"] = adress_trans["Transcript"].apply(lambda x: len(re.findall(r"\[//\]", x)))

adress_trans.head()

In [0]:
trn_pt_utt_idx = (adress_trans.index.get_level_values("split") == "train") & (adress_trans["Speaker"] == "Patient")
dev_pt_utt_idx = (adress_trans.index.get_level_values("split") == "dev")   & (adress_trans["Speaker"] == "Patient")
tst_pt_utt_idx = (adress_trans.index.get_level_values("split") == "test")  & (adress_trans["Speaker"] == "Patient")

## Baseline: Unigram Analysis

In [0]:
from detectors.repetitive_speech.unigram_analysis import UnigramAnalysisDetector
from utils import create_custom_nlp

#### Explore detector configurations.

In [0]:
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm

In [0]:
window_sizes = [1, 2, 3, 5, 10]
comparators = ["exact", "lemma_exact"]

configs = {f"max_w_{cfg[0]}_comp_{cfg[1]}": cfg for cfg in product(window_sizes, comparators)}

In [0]:
def run(cfg_name, cfg, dataset):
    # Create spaCy vocab
    nlp = create_custom_nlp()
    # Initialize n-gram analysis detector with config
    d = UnigramAnalysisDetector(nlp, *cfg)
    # Run detector on dataset
    outputs = dataset["Transcript_clean"].apply(d.detect)
    # Evaluate performance
    true = dataset["Repetition"]
    pred = outputs.apply(lambda x: len(x["detections"]) > 0).astype(int)
    scores = evaluate(true, pred)
    return cfg_name, scores

In [0]:
results = Parallel(n_jobs=10, return_as="generator_unordered")(delayed(run)(cfg_name, configs[cfg_name], adress_trans.loc[trn_pt_utt_idx]) for cfg_name in configs)
results = [res for res in tqdm(results, total=len(configs))]

configs, cfg_scores = zip(*results)
table = pd.DataFrame(cfg_scores, index=configs)
# table.to_csv("train_results/repetition_trn_results.csv")

In [0]:
table.sort_values("f1", ascending=False).round(3)
# print(table.round(3).to_latex())

#### Evaluate the best configuration.

In [0]:
# create custom spaCy
nlp = create_custom_nlp()
# initialize detector
repetition_detector_final = UnigramAnalysisDetector(nlp, 2, "exact")
# run on all data
adress_trans["unia_dets"] = adress_trans.apply(lambda x: repetition_detector_final.detect(x["Transcript_clean"]) if x["Speaker"] == "Patient" else pd.NA, axis=1)

###### Performance on the test data.

In [0]:
true = adress_trans.loc[tst_pt_utt_idx, "Repetition"]
pred = adress_trans.loc[tst_pt_utt_idx, "unia_dets"].apply(lambda x: len(x["detections"]) > 0).astype(int)
print(evaluate(true, pred, return_latex=True))

## LLM-Based Detector

In [0]:
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()

client = OpenAI(
    api_key=mlflow_creds.token,
    base_url=f"{mlflow_creds.host}/serving-endpoints"
)

In [0]:
datasets = {
    "train": [],
    "dev": [],
    "test": []
}

for (split, pt_id, utt_num), row in adress_trans.loc[adress_trans["Speaker"] == "Patient"].iterrows():
    datasets[split].append({
        "split": split,
        "ID": pt_id,
        "utt_num": utt_num,
        "inputs": {"text": row["Transcript_clean"]},
        "expectations": {"has_repetition": bool(row["Repetition"])}
    })

# pprint(datasets)

In [0]:
@scorer
def correct(expectations, outputs):
    return Feedback(value=(expectations["has_repetition"] == (len(outputs["detections"]) > 0)))

In [0]:
def process_mlflow_outputs(result):
    outputs = result.tables["eval_results"][["response", "assessments"]]
    outputs["pred"] = outputs["response"].apply(lambda x: (len(x["detections"]) > 0) if type(x) == dict else False).astype(int)
    outputs["label"] = outputs["assessments"].apply(lambda x: [a["value"] for a in x if a["name"] == "has_repetition"][0]).astype(int)
    return outputs

#### Explore different prompts

In [0]:
"""Your task is to analyze an utterance and identify all instances of exact repetition and significant semantic revision.

**INSTRUCTIONS**
1. Identify all repetitions and revisions. A "repetition" is a retracing without correction where the speaker repeats their words without changes. A "revision" is a retracing with correction and occurs when the speaker changes something (usually the syntax) of an utterance but maintains the same idea.
2. You must output a list of pairs. For each repetition or revision, the pair contains (1) tuple of the start and end character index of the first part of the repeated or revised text and (2) tuple of the start and end character index of the second part of the repeated or revised text. If no repetitions or revisions are found, the list must be empty.

**Input:**
{}
"""

"""Your task is to analyze the following utterances and identify instances of repetition.

**INSTRUCTIONS:**
1. **Input Format**: The input will be a JSON object with a single key "utterances". The value will be a list of objects, where each object has the "text" to be analyzed.
2. **Task**: For each utterance object in the input, identify all instances of repetition its text. A "repetition" is when a sound, word or phrase is repeated exactly. Some repetition text between repeated content is allowed.
3. **Output Format**: Your output must be a JSON object with a single key "utterances". The value will be a list of objects, one for each utterance from the input. Each object in the "utterances" list must contain:
- The original "text".
- A "repetitions" list. This list should contain one sub-list for each detected repetition. Each sub-list must contain a pair of spans: one for the original instance and one for the repeated instance. Each span should be a list of two integers: the start and end character index. 

**INPUT JSON:**
{}
"""
"Identify all instances where the patient repeats the same sounds, words, or phrases--either consecutively or non-consecutively--in a way that indicates cognitive impairment in the following utterance:\n\n{}\n\nReturn a bullet point list, where each bullet contains a complete quote of the full phrase in which the repetition occurs, exactly as spoken by the patient. Start the quote at the first repeated sound, word, or phrase and end the quote after the last repeated sound, word, or phrase. Do not include any explanations. Do not include repetition prompted by the provider. If no repetitions are found, return \"None\"."

# v0
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all instances where the patient repeats the same sounds, words, or phrases--either consecutively or non-consecutively--in the provided utterance below.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v1
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of repetition in the provided utterance below. Include repetition due to hesitation, stuttering, word finding difficulty, and speech planning.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v2
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of repetition in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v3
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of repetition in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech. Do not flag words that repeat for valid grammatical reasons

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v4
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of immediate, verbatim repetition in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech. Do not flag words that repeat for valid grammatical reasons.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v5
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of immediate, verbatim repetition of whole words or phrases in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech. Do not flag words that repeat for valid grammatical reasons.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "span1": The character span for the first occurrence of the repeated text.
- "span2": The character span for the second occurrence of the repeated text.

# UTTERANCE
{}
"""

# v6
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of immediate, verbatim repetition of whole words or phrases in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech. Do not flag words that repeat for valid grammatical reasons. Every detected repetition MUST correspond to an exact substring found within the provided utterance.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following three keys-value pairs:
- "type": "repetition".
- "text": The verbatim text from the first occurrence of the repeated word or phrase.
- "s1": The starting character index for the first occurrence of "text".
- "s2": The starting character index for the second occurrence of "text".

# UTTERANCE
{}
"""

# v7
"""# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. Your task is to identify all clinically significant instances of immediate, verbatim repetition in the provided utterance below. Focus only on repetitions that appear involuntary and disrupt the natural flow of speech. Do not flag words that repeat for valid grammatical reasons.

Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected repetition and must have the following four keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the repeated word or phrase.
- "start1": The character index where the FIRST occurrence of the repeated text begins.
- "start2": The character index where the SECOND occurrence of the repeated text begins.

# UTTERANCE
{}
"""

version = "4_3"
prompt = """# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. 

Your task is to identify all clinically significant instances of word repetition in the provided input utterance below. 

### Definition
Word repetition is an involuntary, immediate verbatim repeat of a whole word, which disrupts the flow of speech and signals a potential struggle with speech production. Do not flag words that repeat for valid grammatical reasons (e.g., "I knew that that was the problem.") or for emphasis (e.g., "very very"). Do not flag word repetitions that are part of a self-correction or phrasal restart (e.g., in "He went to the store [silence] to the bank", the repetition of "to" and "the" should be ignored). Do not flag filler sounds as repetitions. Do not flag partial-word stutters (e.g., "s sound") or different forms of the same word (e.g., "get getting").

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected word repetition and must have the following keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the detected repeat word.
- "span": The character span of the first occurence of "text" in the provided input.
- "span2": The character span of the second occurrence of "text" in the provided input.

# INPUT
{input_text}
"""

In [0]:
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(input_text=text), {"type": "json_object"})

with mlflow.start_run(run_name=f"gpt_trn_pV{version}"): 
    gpt_dets_trn = mlflow.genai.evaluate(
        data=datasets["train"],
        predict_fn=fn,
        scorers=[correct]
    )

In [0]:
outputs = process_mlflow_outputs(gpt_dets_trn)
outputs.to_pickle(f"llm_outputs/repetition_trn_gpt_pV{version}.pkl")

print(evaluate(outputs["label"], outputs["pred"], return_latex=True))

0.513 & 0.953 & 0.667 & 0.918 & 0.934 \\

0.472 & 0.976 & 0.636 & 0.904 & 0.937 \\

##### Try few-shot prompting

First look at errors on the dev dataset.

In [0]:
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(input_text=text), {"type": "json_object"})

with mlflow.start_run(run_name=f"gpt_dev_pV{version}"): 
    gpt_dets_trn = mlflow.genai.evaluate(
        data=datasets["dev"],
        predict_fn=fn,
        scorers=[correct]
    )

Then evaluate again on train.

In [0]:
version = "4_3_fewshot"
fs_prompt = """# INSTRUCTIONS
You are a neurologist analyzing a patient"s speech sample for signs of cognitive impairment. 

Your task is to identify all clinically significant instances of word repetition in the provided input utterance below. 

### Definition
Word repetition is an involuntary, immediate verbatim repeat of a whole word, which disrupts the flow of speech and signals a potential struggle with speech production. Do not flag words that repeat for valid grammatical reasons (e.g., "I knew that that was the problem.") or for emphasis (e.g., "very very"). Do not flag word repetitions that are part of a self-correction or phrasal restart (e.g., in "He went to the store [silence] to the bank", the repetition of "to" and "the" should be ignored). Do not flag filler sounds as repetitions. Do not flag partial-word stutters (e.g., "s sound") or different forms of the same word (e.g., "get getting").

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected word repetition and must have the following keys-value pairs:
- "type": "repetition".
- "text": The verbatim text of the detected repeat word.
- "span": The character span of the first occurence of "text" in the provided input.
- "span2": The character span of the second occurrence of "text" in the provided input.

# EXAMPLES
**Input**: two s uh two cups and a plate are on the um counter there .
**Correct Output**: 
{{
    "detections": [
        {{"type": "repetition", "text": "two", "span": [0, 3], "span2": [9, 12]}},
    ]
}}

**Input**: and he has a cookie in each hand, handing about to hand one cookie to the little girl who is standing there with her hand reached up for the cookie .
**Incorrect Output**: 
{{
    "detections": [
        {{"type": "repetition", "text": "hand", "span": [28, 32], "span2": [52, 56]}},
        {{"type": "repetition", "text": "cookie", "span": [14, 20], "span2": [61, 67]}}
    ]
}}
**Error**: These are not immediate repetitions but rather normal word reuse in different contexts.

**Input**: and her brother's taking cookies out of the cookie jar .
**Incorrect Output**: 
{{
    "detections": [
        {{"type": "repetition", "text": "cookie", "span": [37, 43], "span2": [51, 57]}},
    ]
}}
**Error**: "cookies" and "cookie" are different word forms and should not be flagged as repetitions.

# INPUT
{input_text}
"""

In [0]:
fn = lambda text: llm_call(client, "openai_gpt_4o", None, fs_prompt.format(input_text=text), {"type": "json_object"})

with mlflow.start_run(run_name=f"gpt_trn_pV{version}"): 
    gpt_fs_dets_trn = mlflow.genai.evaluate(
        data=datasets["train"],
        predict_fn=fn,
        scorers=[correct]
    )

In [0]:
outputs = process_mlflow_outputs(gpt_dets_trn)
outputs.to_pickle(f"llm_outputs/repetition_trn_gpt_pV{version}.pkl")

print(evaluate(outputs["label"], outputs["pred"], return_latex=True))

0.534 & 0.929 & 0.678 & 0.924 & 0.927 \\

##### Evaluate the best configuration

In [0]:
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(input_text=text), {"type": "json_object"})

with mlflow.start_run(run_name=f"gpt_tst_pV{version}") as run:
    gpt_dets_tst = mlflow.genai.evaluate(
        predict_fn=fn,
        data=datasets["test"],
        scorers=[correct]
    )

In [0]:
outputs = process_mlflow_outputs(gpt_dets_tst)
outputs.to_pickle(f"llm_outputs/repetition_tst_gpt_pV{version}.pkl")

print(evaluate(outputs["label"], outputs["pred"], return_latex=True))

## Generator + LLM Critic

###### Investigate the failure cases.

In [0]:
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Repetition"] == 0) & (adress_trans["na_dets"].apply(lambda x: len(x["detections"]) > 0 if type(x) == dict else False)), ["Transcript", "na_dets"]].values)

In [0]:
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Repetition"] == 1) & (adress_trans["na_dets"].apply(lambda x: len(x["detections"]) == 0 if type(x) == dict else False)), ["Transcript"]].values)

## Summary metrics for repetition detections

In [0]:
from data.adress import load_outcomes

In [0]:
outcomes = load_outcomes()
outcomes.head()

In [0]:
trn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train")].index.values
trn_ad_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train") & (outcomes["AD_dx"] == 1)].index.values
trn_cn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train") & (outcomes["AD_dx"] == 0)].index.values

In [0]:
tst_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test")].index.values
tst_ad_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test") & (outcomes["AD_dx"] == 1)].index.values
tst_cn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test") & (outcomes["AD_dx"] == 0)].index.values

**Repetition Rate**: number of repetitions / total number of spoken words

In [0]:
def compute_repetition_rate(outputs):
    num = outputs.apply(lambda x: len(x["detections"]) if not pd.isna(x) else 0).groupby(level=("split", "ID")).sum()
    den = adress_trans.apply(lambda x: sum([1 for token in nlp(x["Transcript_clean"]) if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)]) if x["Speaker"] == "Patient" else 0, axis=1).groupby(level=("split", "ID")).sum()
    return 100 * num / den

In [0]:
# outcomes["gt_repetition_rate"] = compute_repetition_rate(adress_trans["num_repetitions"].apply(lambda x: {"detections": [0] * x}))
outcomes["gt_revision_rate"] = compute_repetition_rate(adress_trans["num_revisions"].apply(lambda x: {"detections": [0] * x}))
outcomes["unia_repetition_rate"] = compute_repetition_rate(adress_trans["unia_dets"])

**Part-of-Speech Repetition Counts**

In [0]:
POS = [
    "ADJ",      # adjective
    "ADP",      # adposition
    "ADV",      # adverb
    "AUX",      # auxiliary
    "CCONJ",    # coordinating conjunction
    "DET",      # determiner
    "INTJ",     # interjection
    "NOUN",     # noun
    "NUM",      # numeral
    "PART",     # particle
    "PRON",     # pronoun
    "PROPN",    # proper noun
    # "PUNCT",    # punctuation
    "SCONJ",    # subordinating conjunction
    # "SYM",      # symbol
    "VERB",     # verb
    "X",        # other
    # "SPACE",    # space
]

def pos_repetition_counts(output_name):
    pos_metrics = pd.DataFrame(index=outcomes.index, columns=POS, dtype=float)

    for (split, pt_id) in pos_metrics.index:
        counts = dict.fromkeys(POS, 0)
        num_words = 0
        for utt_num, row in adress_trans.loc[(split, pt_id)].iterrows():
            if row["Speaker"] == "Patient":
                num_words += sum([1 for token in nlp(row["Transcript_clean"]) if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)])

                if row[output_name]["detections"]:
                    for det in row[output_name]["detections"]:
                        doc = nlp(det["text1"])
                        try:
                            counts[doc[0].pos_] += 1                              
                        except KeyError as e:
                            print(e)
                            print(row["Transcript_clean"])
                            print(det)

        for pos in POS:
            pos_metrics.loc[(split, pt_id), pos] = 100 * counts[pos] / num_words

    return pos_metrics

In [0]:
outcomes[["unia_" + pos + "_rep_rate" for pos in POS]] = pos_repetition_counts("unia_dets")

Analyze correlation between our metrics and the outcome variable

In [0]:
# Generates the rows of Table 10 and 11
def analysis(split_idx, split_ad, split_cn, metrics):
    for m in metrics:
        ## score averages
        mean_ad = outcomes.loc[split_ad, m].mean()
        std_ad = outcomes.loc[split_ad, m].std()
        mean_cn = outcomes.loc[split_cn, m].mean()
        std_cn = outcomes.loc[split_cn, m].std()

        ## correlation metrics
        res_ttest = ttest_ind(outcomes.loc[split_idx, "AD_dx"], outcomes.loc[split_idx, m])
        res_mannw = mannwhitneyu(outcomes.loc[split_idx, "AD_dx"], outcomes.loc[split_idx, m])
        res_auc   = roc_auc_score(outcomes.loc[split_idx, "AD_dx"], outcomes.loc[split_idx, m])

        print("%s & %.2f (%.2f) & %.2f (%.2f) & %.2f (%s) & %.2f (%s) & %.2f \\\\" % 
                (m,
                mean_ad,
                std_ad,
                mean_cn,
                std_cn,
                res_ttest.statistic,
                str(round(res_ttest.pvalue, 3)) if res_ttest.pvalue >= 0.001 else "$<$0.001", 
                res_mannw.statistic, 
                str(round(res_ttest.pvalue, 3)) if res_ttest.pvalue >= 0.001 else "$<$0.001",
                res_auc)
        )

In [0]:
mets = [
    # "gt_repetition_rate", 
    # "gt_revision_rate", 
    "unia_repetition_rate"
] + ["unia_" + pos + "_rep_rate" for pos in POS]

In [0]:
analysis(trn_pts, trn_ad_pts, trn_cn_pts, mets)

In [0]:
outcomes[["unia_repetition_rate"] + ["unia_" + pos + "_rep_rate" for pos in POS]].to_csv("repetition_feats.csv")