### Import data csv and normalize human evaluation scores 

In [None]:
import pandas as pd
from scipy import stats

# Load the human-annotated conversation sample.
df = pd.read_csv("focal_ela_science.csv", encoding="UTF-8")

# D1 is the human evaluation of uptake on a 1-4 scale. Two rescaled copies
# are added next to it: `normed_D1` maps 1-4 onto 0-1 and `z_D1` is the
# z-score (missing ratings are skipped when computing mean and spread).
# `uuid` is whatever follows the first 39 characters of the URL column.
human_scores = df["D1"]
df = df.assign(
    normed_D1=(human_scores - 1) / 3,
    z_D1=stats.zscore(human_scores, nan_policy='omit'),
    uuid=df.URL.apply(lambda url: url[39:]),
)

df

In [None]:
# getting conversations from the XML files

# for c in conversations:
#     uuid = c["URL"][39:]
#     if c["D1"] == "NA":
#         c["normed_D1"] = None
#     else:
#         c["normed_D1"] = (float(c["D1"]) - 1) / 3

#     turn_getter = Conversation(uuid)
    
#     for this_turn, next_turn in turn_getter.get_pairwise_turns():

#         # calculate_score(this_turn, next_turn, stem=True)
#         pass

        

### Creating a histogram showing values at a glance

In [None]:
import matplotlib.pyplot as plt

# if using a Jupyter notebook, include:
%matplotlib inline

# Density histogram (10 bins) of the z-scored human ratings.
plt.hist(
    df["z_D1"],
    bins=10,
    density=True,
    histtype='bar',
    facecolor='g',
    alpha=0.5,
)

plt.show()

In [None]:
# Density histogram (10 bins) of the 0-1 rescaled human ratings.
plt.hist(
    df["normed_D1"],
    bins=10,
    density=True,
    histtype='bar',
    facecolor='g',
    alpha=0.5,
)

plt.show()

In [None]:
# Mean, then median, of the z-scored human ratings.
z_scores = df["z_D1"]
print(z_scores.mean())
print(z_scores.median())

### Preprocessing function to be shared by models

In [None]:
import nltk
# nltk.download()

import string 
# Set of single punctuation characters; preprocess_text drops tokens found here.
s =  set(string.punctuation)

from nltk.corpus import stopwords
# NLTK's English stop-word list, held as a set for fast membership tests.
# Presumably requires the NLTK stopwords corpus to be installed locally
# (see the commented-out nltk.download() call above) -- confirm.
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by preprocess_text when lemmatize is set.
lemmatizer = WordNetLemmatizer()

# Sample input for trying out preprocess_text by hand.
example = """This is a sample sentence,
                  showing off the stop words filtration."""

def preprocess_text(sentence, remove_stopwords=False, lemmatize=False):
    """Tokenize ``sentence`` into lowercase word tokens without punctuation.

    Relies on the module-level ``word_tokenize``, ``s`` (punctuation
    characters), ``stop_words`` and ``lemmatizer``.

    Parameters
    ----------
    sentence : str
        Text to tokenize.
    remove_stopwords : bool
        When truthy, tokens found in ``stop_words`` are dropped.
    lemmatize : bool
        When truthy, every remaining token is passed through
        ``lemmatizer.lemmatize``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    tokens = [tok.lower() for tok in word_tokenize(sentence)]

    # Drop tokens that consist solely of punctuation characters. Comparing
    # whole tokens against single characters would let multi-character
    # punctuation tokens such as "..." or "--" through.
    tokens = [tok for tok in tokens if not all(ch in s for ch in tok)]

    # Truthiness (rather than ``is True``) so that 1 or numpy booleans
    # also switch the option on.
    if remove_stopwords:
        tokens = [tok for tok in tokens if tok not in stop_words]

    if lemmatize:
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    return tokens


# preprocess_text(example)


### Baseline Word-Overlapping Model

#### function calculating word overlap between pairs

In [None]:
# function that calculate word overlapping

def calculate_wordoverlap_score(
    this_turn, next_turn, remove_stopwords=False, lemmatize=False, min_toks=5):
    """Fraction of the unique tokens of ``this_turn`` that recur in ``next_turn``.

    Both turns are run through ``preprocess_text`` with the given options
    and reduced to sets of unique tokens.

    Parameters
    ----------
    this_turn, next_turn : str
        Two consecutive turns of a conversation.
    remove_stopwords, lemmatize : bool
        Forwarded to ``preprocess_text``.
    min_toks : int
        Minimum number of unique tokens ``this_turn`` must have to be scored.

    Returns
    -------
    float or None
        Overlap ratio in [0, 1], or ``None`` when ``this_turn`` has fewer
        than ``min_toks`` unique tokens (or none at all).
    """
    this_turn_toks = set(preprocess_text(this_turn, remove_stopwords, lemmatize))
    next_turn_toks = set(preprocess_text(next_turn, remove_stopwords, lemmatize))

    # The emptiness test keeps the division safe even when the caller
    # passes min_toks <= 0.
    if not this_turn_toks or len(this_turn_toks) < min_toks:
        return None

    shared = this_turn_toks & next_turn_toks
    return len(shared) / len(this_turn_toks)

#### function getting the baseline prediction (with preprocessing switch and final calculating switch)

In [None]:

from statistics import mean
from conversations import Conversation

def get_baseline_prediction(uuid, remove_stopwords=False, lemmatize=False, min_toks=5, aggregate=mean):
    """Conversation-level word-overlap score for the conversation ``uuid``.

    Every pair produced by ``Conversation(uuid).get_pairwise_turns()`` is
    scored with ``calculate_wordoverlap_score``; pairs scored ``None`` are
    discarded and the rest are combined with ``aggregate``.

    Returns ``None`` when no pair yields a score.
    """
    conversation = Conversation(uuid)

    valid_scores = []
    for this_turn, next_turn in conversation.get_pairwise_turns():
        pair_score = calculate_wordoverlap_score(
            this_turn, next_turn, remove_stopwords, lemmatize, min_toks)
        if pair_score is not None:
            valid_scores.append(pair_score)

    if len(valid_scores) == 0:
        return None

    return aggregate(valid_scores)
     

#### raw text without punctuation

In [None]:
# Word-overlap baseline on raw (punctuation-stripped) text, aggregated by
# mean over turn pairs, stored together with its z-scored version.
raw_predictions = df.uuid.apply(get_baseline_prediction)
df = df.assign(baseline_raw_prediction=raw_predictions)
df = df.assign(
    z_baseline_raw=stats.zscore(df["baseline_raw_prediction"], nan_policy='omit'),
)


   

In [None]:
# Density histogram (10 bins) of the raw word-overlap predictions.
plt.hist(
    df["baseline_raw_prediction"],
    bins=10,
    density=True,
    histtype='bar',
    facecolor='r',
    alpha=0.5,
)

plt.show()

In [None]:
# Mean, then median, of the raw word-overlap predictions.
raw_scores = df["baseline_raw_prediction"]
print(raw_scores.mean())
print(raw_scores.median())


#### replace mean value with max value

In [None]:
# Same raw word-overlap baseline, but keeping the best-scoring turn pair
# of each conversation instead of the mean.
raw_max_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, aggregate=max))
df = df.assign(baseline_raw_max_prediction=raw_max_scores)


In [None]:
# plt.hist(df["baseline_raw_max_prediction"], 10,
#          density=True,
#          histtype='bar',
#          facecolor='r',
#          alpha=0.5)

# plt.show()

#### replacing mean with mean softmax

In [None]:
from scipy.special import softmax

def mean_softmax(vals):
    """Mean of the softmax of ``vals``.

    For a flat sequence the softmax outputs sum to 1, so the result is
    1 / len(vals) up to rounding.
    """
    probabilities = softmax(vals)
    return mean(probabilities)

# Raw word-overlap baseline aggregated with the mean of the softmaxed
# turn-pair scores.
softmax_mean_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, aggregate=mean_softmax))
df = df.assign(baseline_raw_softmax_mean_prediction=softmax_mean_scores)

In [None]:
#### replacing mean with max softmax

In [None]:
def max_softmax(vals):
    """Largest value of the softmax of ``vals``."""
    probabilities = softmax(vals)
    return max(probabilities)

# Raw word-overlap baseline aggregated with the maximum of the softmaxed
# turn-pair scores.
softmax_max_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, aggregate=max_softmax))
df = df.assign(baseline_raw_softmax_max_prediction=softmax_max_scores)
df

#### stop words removed

In [None]:
# Word-overlap baseline with stop words removed, aggregated by mean and
# by max over turn pairs.
# there are issues with empty turns after removing stop words!
stprm_mean_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, remove_stopwords=True))
df = df.assign(baseline_stprm_prediction=stprm_mean_scores)

stprm_max_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, remove_stopwords=True, aggregate=max))
df = df.assign(baseline_stprm_max_prediction=stprm_max_scores)

In [None]:
# plt.hist(df["baseline_stprm_prediction"], 10,
#          density=True,
#          histtype='bar',
#          facecolor='y',
#          alpha=0.5)

# plt.show()

#### lemmatization

In [None]:
# Word-overlap baseline on lemmatized tokens, aggregated by mean and by
# max over turn pairs.
lemma_mean_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, lemmatize=True))
df = df.assign(baseline_lemmatized_prediction=lemma_mean_scores)

lemma_max_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, lemmatize=True, aggregate=max))
df = df.assign(baseline_lemmatized_max_prediction=lemma_max_scores)

In [None]:
# Density histogram (10 bins) of the lemmatized word-overlap predictions.
plt.hist(
    df["baseline_lemmatized_prediction"],
    bins=10,
    density=True,
    histtype='bar',
    facecolor='c',
    alpha=0.5,
)

plt.show()

In [None]:
# Word-overlap baseline with both stop-word removal and lemmatization,
# aggregated by mean and by max over turn pairs.
all_mean_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(conv_id, lemmatize=True, remove_stopwords=True))
df = df.assign(baseline_all_prediction=all_mean_scores)

all_max_scores = df.uuid.apply(
    lambda conv_id: get_baseline_prediction(
        conv_id, lemmatize=True, remove_stopwords=True, aggregate=max))
df = df.assign(baseline_all_max_prediction=all_max_scores)


### cosine similarity between sentences

In [None]:
# Program to measure the similarity between 
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def calculate_sentence_similarity(
    this_turn, next_turn, remove_stopwords=False, lemmatize=False, min_toks=5):
    """Cosine similarity between the binary bag-of-words vectors of two turns.

    Both turns are run through ``preprocess_text`` and reduced to sets of
    unique tokens. With 0/1 indicator vectors over the joint vocabulary the
    dot product is the number of shared tokens and each squared norm is the
    size of the corresponding set, so the cosine is
    ``|A & B| / sqrt(|A| * |B|)``.

    Parameters
    ----------
    this_turn, next_turn : str
        Two consecutive turns of a conversation.
    remove_stopwords, lemmatize : bool
        Forwarded to ``preprocess_text``.
    min_toks : int
        Minimum number of unique tokens each turn must have to be scored.

    Returns
    -------
    float or None
        Cosine similarity in [0, 1], or ``None`` when either turn has fewer
        than ``min_toks`` unique tokens (or none at all).
    """
    this_turn_toks = set(preprocess_text(this_turn, remove_stopwords, lemmatize))
    next_turn_toks = set(preprocess_text(next_turn, remove_stopwords, lemmatize))

    if len(this_turn_toks) < min_toks or len(next_turn_toks) < min_toks:
        return None

    # Guard the division below for callers that pass min_toks <= 0.
    if not this_turn_toks or not next_turn_toks:
        return None

    shared = len(this_turn_toks & next_turn_toks)
    return shared / float((len(this_turn_toks) * len(next_turn_toks)) ** 0.5)

In [None]:
def get_sentence_similarity_prediction(uuid, remove_stopwords=False, lemmatize=False, min_toks=5, aggregate=mean):
    """Conversation-level sentence-similarity score for the conversation ``uuid``.

    Every pair produced by ``Conversation(uuid).get_pairwise_turns()`` is
    scored with ``calculate_sentence_similarity``; pairs scored ``None`` are
    discarded and the rest are combined with ``aggregate``.

    Returns ``None`` when no pair yields a score.
    """
    conversation = Conversation(uuid)

    valid_scores = []
    for this_turn, next_turn in conversation.get_pairwise_turns():
        pair_score = calculate_sentence_similarity(
            this_turn, next_turn, remove_stopwords, lemmatize, min_toks)
        if pair_score is not None:
            valid_scores.append(pair_score)

    if len(valid_scores) == 0:
        return None

    return aggregate(valid_scores)
     

In [None]:
# One column per preprocessing / aggregation variant of the
# sentence-similarity model: (column name, keyword options).
similarity_variants = [
    ("sentence_similarity_prediction", {}),
    ("sentence_similarity_max_prediction", {"aggregate": max}),
    ("sentence_similarity_stprm_prediction", {"remove_stopwords": True}),
    ("sentence_similarity_stprm_max_prediction", {"remove_stopwords": True, "aggregate": max}),
    ("sentence_similarity_lemma_prediction", {"lemmatize": True}),
    ("sentence_similarity_lemma_max_prediction", {"lemmatize": True, "aggregate": max}),
    ("sentence_similarity_all_prediction", {"remove_stopwords": True, "lemmatize": True}),
    ("sentence_similarity_all_max_prediction",
     {"remove_stopwords": True, "lemmatize": True, "aggregate": max}),
]

for column, options in similarity_variants:
    # `options=options` binds the current variant to the lambda.
    variant_scores = df.uuid.apply(
        lambda uuid, options=options: get_sentence_similarity_prediction(uuid, **options))
    df = df.assign(**{column: variant_scores})

### checking correlations between human labels and baseline models

In [None]:
# print(df["z_D1"].corr(df["z_baseline_raw"]))

# Correlation between the rescaled human rating and every model column.
# Each entry is (printed label, prediction column).
correlation_columns = [
    ("baseline-raw: ", "baseline_raw_prediction"),
    ("baseline-raw-max: ", "baseline_raw_max_prediction"),
    ("baseline-stprm: ", "baseline_stprm_prediction"),
    ("baseline-stprm_max: ", "baseline_stprm_max_prediction"),
    ("baseline-lemma: ", "baseline_lemmatized_prediction"),
    ("baseline-lemma_max: ", "baseline_lemmatized_max_prediction"),
    ("baseline-all: ", "baseline_all_prediction"),
    # This row was previously printed as "baseline-max: ", which hid that
    # it is the stop-words-removed + lemmatized variant.
    ("baseline-all-max: ", "baseline_all_max_prediction"),
    ("ss: ", "sentence_similarity_prediction"),
    ("ss-max: ", "sentence_similarity_max_prediction"),
    ("ss-stprm: ", "sentence_similarity_stprm_prediction"),
    ("ss-stprm-max: ", "sentence_similarity_stprm_max_prediction"),
    ("ss-lemma: ", "sentence_similarity_lemma_prediction"),
    ("ss-lemma-max: ", "sentence_similarity_lemma_max_prediction"),
    ("ss-all: ", "sentence_similarity_all_prediction"),
    ("ss-all-max: ", "sentence_similarity_all_max_prediction"),
]

for label, column in correlation_columns:
    print(label, df["normed_D1"].corr(df[column]))

# print(df["normed_D1"].corr(df["baseline_raw_softmax_mean_prediction"]))

# print(df["normed_D1"].corr(df["baseline_raw_softmax_max_prediction"]))



### creating confusion matrix and evaluation (F-score, macro F-score)

In [None]:
# %matplotlib inline
from nltk.metrics.distance import edit_distance
from nltk.translate import bleu_score
import numpy as np
# import pandas as pd
import scipy.stats
from sklearn import metrics
# import utils

In [None]:
# assign label

def assign_label(value):
    """Map a score to "High" when it is strictly above 0.5, otherwise "Low".

    Any value for which ``value > 0.5`` is false ends up as "Low"; that
    includes NaN, so missing scores are labelled "Low" as well.
    """
    return "High" if value > 0.5 else "Low"


In [None]:
### assigning labels

In [None]:
# creating labels: every score column listed here gets a companion
# "<column>_label" column holding "High"/"Low" from assign_label.
columns_to_label = [
    "normed_D1",
    "baseline_raw_prediction",
    "baseline_raw_max_prediction",
    "baseline_lemmatized_prediction",
    "baseline_lemmatized_max_prediction",
    "baseline_stprm_prediction",
    "baseline_stprm_max_prediction",
    "baseline_all_prediction",
    "baseline_all_max_prediction",
    "sentence_similarity_prediction",
    "sentence_similarity_max_prediction",
    "sentence_similarity_stprm_prediction",
    "sentence_similarity_stprm_max_prediction",
    "sentence_similarity_lemma_prediction",
    "sentence_similarity_lemma_max_prediction",
    "sentence_similarity_all_prediction",
    "sentence_similarity_all_max_prediction",
]

for score_column in columns_to_label:
    df = df.assign(**{score_column + "_label": df[score_column].apply(assign_label)})

# df = df.assign(baseline_raw_softmax_mean_prediction_label =
#                df.baseline_raw_softmax_mean_prediction.apply(assign_label))


# df = df.assign(baseline_raw_softmax_max_prediction_label =
#                df.baseline_raw_softmax_max_prediction.apply(assign_label))

df

### generating confusion matrix

In [None]:
# confusion matrix

from sklearn.metrics import confusion_matrix

# Human labels (X) and one label series per model variant (Y_*), used by
# the confusion-matrix and F1 cells.
X = df.normed_D1_label

# word-overlap baseline variants
Y_raw = df.baseline_raw_prediction_label
Y_raw_max = df.baseline_raw_max_prediction_label
Y_lemma = df.baseline_lemmatized_prediction_label
Y_lemma_max = df.baseline_lemmatized_max_prediction_label
Y_stprm = df.baseline_stprm_prediction_label
Y_stprm_max = df.baseline_stprm_max_prediction_label
Y_all = df.baseline_all_prediction_label
Y_all_max = df.baseline_all_max_prediction_label

# sentence-similarity variants
Y_ss = df.sentence_similarity_prediction_label
Y_ss_max = df.sentence_similarity_max_prediction_label
Y_ss_stprm = df.sentence_similarity_stprm_prediction_label
Y_ss_stprm_max = df.sentence_similarity_stprm_max_prediction_label
Y_ss_lemma = df.sentence_similarity_lemma_prediction_label
# Reads the lemma_max column; it previously re-used the stprm_max column,
# so the "ss-lemma-max" F1 silently duplicated "ss-stprm-max".
Y_ss_lemma_max = df.sentence_similarity_lemma_max_prediction_label
Y_ss_all = df.sentence_similarity_all_prediction_label
Y_ss_all_max = df.sentence_similarity_all_max_prediction_label

# Y_raw_softmax_mean = df.baseline_raw_softmax_mean_prediction_label
# Y_raw_softmax_max = df.baseline_raw_softmax_max_prediction_label


#Generate the confusion matrix
# cm_raw = confusion_matrix(X, Y_raw)
# cm_raw_max = confusion_matrix(X, Y_raw_max)
# cm_raw_softmax_mean = confusion_matrix(X, Y_raw_softmax_mean)
# cm_raw_softmax_max = confusion_matrix(X, Y_raw_softmax_max)
# cm_stprm = confusion_matrix(X, Y_stprm)
# cm_lemmatized = confusion_matrix(X, Y_lemma)

# print(cm_raw)
# print(cm_raw_max)
# print(cm_raw_softmax_mean)
# print(cm_raw_softmax_max)
# print(cm_lemmatized)

### calculating F1 score

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# precision = precision_score(Y_raw, X, average='binary', pos_label="High")
# recall = recall_score(Y_raw, X, average='binary', pos_label="High")

def _high_f1(model_labels):
    """Binary F1 for the "High" class between one model's labels and X."""
    # NOTE(review): the model labels are passed in sklearn's y_true slot
    # and the human labels (X) in the y_pred slot. Swapping the two
    # exchanges precision and recall, so F1 itself should be unaffected --
    # confirm before re-enabling the precision/recall lines above.
    return f1_score(model_labels, X, average='binary', pos_label="High")


f1_raw = _high_f1(Y_raw)
f1_raw_max = _high_f1(Y_raw_max)
# f1_raw_softmax_mean = f1_score(Y_raw_softmax_mean, X, average='binary', pos_label="High")
# f1_raw_softmax_max = f1_score(Y_raw_softmax_max, X, average='binary', pos_label="High")
f1_lemma = _high_f1(Y_lemma)
f1_lemma_max = _high_f1(Y_lemma_max)
f1_stprm = _high_f1(Y_stprm)
f1_stprm_max = _high_f1(Y_stprm_max)
f1_all = _high_f1(Y_all)
f1_all_max = _high_f1(Y_all_max)
f1_ss = _high_f1(Y_ss)
f1_ss_max = _high_f1(Y_ss_max)
f1_ss_stprm = _high_f1(Y_ss_stprm)
f1_ss_stprm_max = _high_f1(Y_ss_stprm_max)
f1_ss_lemma = _high_f1(Y_ss_lemma)
f1_ss_lemma_max = _high_f1(Y_ss_lemma_max)
f1_ss_all = _high_f1(Y_ss_all)
f1_ss_all_max = _high_f1(Y_ss_all_max)


# Report every score, word-overlap variants first.
# (f1_raw_softmax_mean / f1_raw_softmax_max are currently disabled.)
f1_report = [
    ("raw: ", f1_raw),
    ("raw-max: ", f1_raw_max),
    ("raw-lemma: ", f1_lemma),
    ("raw-lemma-max: ", f1_lemma_max),
    ("raw-stprm: ", f1_stprm),
    ("raw-stprm-max: ", f1_stprm_max),
    ("raw-all: ", f1_all),
    ("raw-all-max: ", f1_all_max),
    ("ss: ", f1_ss),
    ("ss-max: ", f1_ss_max),
    ("ss-stprm: ", f1_ss_stprm),
    ("ss-stprm-max: ", f1_ss_stprm_max),
    ("ss-lemma: ", f1_ss_lemma),
    ("ss-lemma-max: ", f1_ss_lemma_max),
    ("ss-all: ", f1_ss_all),
    ("ss-all-max: ", f1_ss_all_max),
]
for report_label, report_value in f1_report:
    print(report_label, report_value)

'''interesting insight, removing stopwords does not help with either word-overlap or sentence similarity;
lemmatization does not help with word overlap, but has some benefits in sentence similarity.
Also, raw text word-overlap model still presens'''

### mean static representations from BERT

In [None]:
# Functions from the cs224u VSM module

def hf_encode(text, tokenizer, add_special_tokens=False):
    """
    Convert `text` into a batch of token indices using `tokenizer`.

    When the tokenizer produces no tokens at all for `text`, the index of
    `tokenizer.unk_token` (looked up in `tokenizer.vocab`) is returned as
    the single token instead.

    Parameters
    ----------
    text: str

    tokenizer: Hugging Face tokenizer

    add_special_tokens : bool
        Passed straight through to `tokenizer.encode`.

    Returns
    -------
    torch.Tensor of shape `(1, m)`
        A one-example batch holding the `m` token indices for `text`.

    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=add_special_tokens,
        return_tensors='pt')
    if token_ids.shape[1] != 0:
        return token_ids
    # Nothing was tokenized: fall back to the unknown-token index.
    unk_index = tokenizer.vocab[tokenizer.unk_token]
    return torch.tensor([[unk_index]])


def hf_represent(batch_ids, model, layer=-1):
    """
    Run `model` on `batch_ids` without gradient tracking and return the
    hidden states of one layer.

    The model is called with `output_hidden_states=True` and the result's
    `hidden_states` is indexed with `layer`.

    Parameters
    ----------
    batch_ids : iterable, shape (n_examples, n_tokens)
        Sequences of indices into the model vocabulary.

    model : Hugging Face transformer model

    layer : int
        Index into the returned hidden states. `layer=0` gives the
        embedding, and `layer=-1` gives the final output states.

    Returns
    -------
    Tensor of shape `(n_examples, n_tokens, n_dimensions)`
       where `n_dimensions` is the dimensionality of the
       Transformer model

    """
    with torch.no_grad():
        outputs = model(batch_ids, output_hidden_states=True)
    all_layers = outputs.hidden_states
    return all_layers[layer]
    
    
def mean_pooling(hidden_states):
    """
    Average a 3-dimensional Tensor over its second axis (`axis=1`).

    Parameters
    ----------
    hidden_states : torch.Tensor, shape `(k, m, n)`
        `k` examples, each made of `m` vectors of dimensionality `n`.

    Returns
    -------
    torch.Tensor of dimension `(k, n)`.

    Raises
    ------
    ValueError
        If `hidden_states` does not have exactly 3 dimensions.

    """
    _check_pooling_dimensionality(hidden_states)
    pooled = torch.mean(hidden_states, axis=1)
    return pooled


def _check_pooling_dimensionality(hidden_states):
    """Raise `ValueError` unless `hidden_states` has exactly 3 dimensions."""
    if len(hidden_states.shape) == 3:
        return
    raise ValueError(
        "The input to the pooling function should have 3 dimensions: "
        "it's a batch of k examples, where each example has m vectors, "
        "each of dimensionality n. The function will pool the vectors "
        "for each example, returning a Tensor of shape (k, n).")

In [None]:
from transformers import BertModel, BertTokenizer

# Name of the pretrained weights shared by the tokenizer and the model.
bert_weights_name = 'bert-base-uncased'
# Tokenizer and model are both loaded from that same name; presumably the
# weights are fetched from the Hugging Face hub or a local cache -- confirm
# for the target environment.
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

In [None]:
import torch

def get_static_rep_from_bert(word):
    """Static representation of ``word``: its mean-pooled final-layer BERT states.

    ``word`` is encoded with ``bert_tokenizer`` (no special tokens), run
    through ``bert_model``, and the resulting sub-token vectors are averaged
    by ``mean_pooling``.
    """
    # derived from vsm_04_contextualreps.ipynb
    final_layer_states = hf_represent(
        hf_encode(word, bert_tokenizer), bert_model, layer=-1)
    return mean_pooling(final_layer_states)


def get_score_for_pair_with_similarity_threshold(this_turn, next_turn, threshold=0.1, min_tokens=5):
    """Count "similar" token pairs between two turns, scaled by turn length.

    Each unique preprocessed token of both turns is replaced by its entry
    in the module-level ``reps_lookup``. Every (this-turn token, next-turn
    token) pair whose similarity value exceeds ``threshold`` adds one to a
    counter, and the counter is divided by the number of unique
    whitespace-separated words in ``this_turn``.

    Because pairs are counted, the result is not bounded by 1.

    Parameters
    ----------
    this_turn, next_turn : str
        Two consecutive turns of a conversation.
    threshold : float
        A pair is counted when its similarity value is strictly greater.
    min_tokens : int
        Minimum number of unique preprocessed tokens ``this_turn`` needs.

    Returns
    -------
    float or None
        The scaled count, or ``None`` when ``this_turn`` has fewer than
        ``min_tokens`` unique tokens.

    Raises
    ------
    KeyError
        If a token has no entry in ``reps_lookup`` (assuming it is a plain
        dict). The lookups run before the ``min_tokens`` check.
    """

    overlap = 0
    # NOTE(review): the representations built by get_static_rep_from_bert
    # appear to be (1, n) tensors (output of mean_pooling on one example).
    # If so, dim=0 compares along the size-1 axis and yields a per-dimension
    # +/-1 sign agreement rather than one cosine value -- confirm whether
    # dim=-1 was intended.
    cosine_similarity = torch.nn.CosineSimilarity(dim=0)
    
    # turn each utterance into a list of tensor representations
    this_turn_tokens = [reps_lookup[t] for t in set(preprocess_text(this_turn))]
    next_turn_tokens = [reps_lookup[t] for t in set(preprocess_text(next_turn))]

    # only the first turn is length-checked; next_turn may be arbitrarily short
    if len(this_turn_tokens) < min_tokens:
        return None
    
    # calculate cosine similarity measures for each pair of token representations
    for token1 in this_turn_tokens:
        for token2 in next_turn_tokens:
            # the similarity output is averaged down to a single value
            cos =  torch.mean(cosine_similarity(token1, token2), axis=0)
            # if the cosine similarity exceeds the (arbitrary) threshold, increment the count
            if cos > threshold:
                overlap +=1

    # divide the count by the number of tokens in the first turn, for some degree of scaling
    # NOTE(review): the denominator uses a raw whitespace split, not the
    # preprocessed tokens counted above -- confirm this is intended.
    score = overlap / len(set(this_turn.split()))
        
    return score

In [None]:
# importing source data with samples in ela and science, between 2 students only
    
import csv
# Read every row of the sample file as a dict keyed by the CSV header.
# newline="" is what the csv module asks for when opening files, so that
# line breaks embedded in quoted fields are parsed correctly.
with open("focal_ela_science.csv", "r", encoding="UTF-8", newline="") as d:
    conversations = list(csv.DictReader(d))

# conversations

In [None]:
import itertools
def get_all_tokens():
    """Collect the set of preprocessed tokens across all loaded conversations.

    For every row in the module-level ``conversations`` list, the
    conversation id (the URL from character 39 onwards) is opened with
    ``Conversation`` and each non-``None`` turn from ``get_turns()`` is run
    through ``preprocess_text`` with default options.
    """
    vocabulary = set()
    for record in conversations:
        conversation = Conversation(record["URL"][39:])
        turn_tokens = [
            preprocess_text(turn)
            for _speaker, turn in conversation.get_turns()
            if turn is not None
        ]
        for tokens in turn_tokens:
            vocabulary.update(tokens)
    return vocabulary

all_toks = get_all_tokens()

In [None]:
%time reps_lookup = {tok: get_static_rep_from_bert(tok) for tok in all_toks}

In [None]:
# import logging
# from importlib import reload
# from statistics import mean

# reload(logging)
# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s: %(message)s",
#     datefmt="%Y-%m-%d %H:%M:%S",
# )

# for i, c in enumerate(conversations):
#     if c["D1"] == "NA":
#         c["normed_D1"] = None
#     else:
#         c["normed_D1"] = (float(c["D1"]) - 1) / 3    
       
#     uuid = c["URL"][39:]
#     conv = Conversation(uuid)

#     scores = []
#     for this_turn, next_turn in conv.get_pairwise_turns():
#         score = get_score_for_pair_with_similarity_threshold(this_turn, next_turn)
#         scores.append(score)

#     score = mean([score for score in scores if score is not None])
    
#     label = ("low", "high")[c["normed_D1"] > 0.5 if c["normed_D1"] is not None else 0]

#     logging.info("%2d: %s %f", i, label, score)

In [None]:
def get_bert_prediction(uuid, min_toks=5, aggregate=mean):
    """Conversation-level BERT-similarity score for the conversation ``uuid``.

    Every pair produced by ``Conversation(uuid).get_pairwise_turns()`` is
    scored with ``get_score_for_pair_with_similarity_threshold``; pairs
    scored ``None`` are discarded and the rest are combined with
    ``aggregate``.

    Parameters
    ----------
    uuid : str
        Identifier handed to ``Conversation``.
    min_toks : int
        Minimum number of unique tokens the first turn of a pair needs;
        forwarded to the pair scorer as ``min_tokens``.
    aggregate : callable
        Reduces the list of pair scores to one value.

    Returns
    -------
    The aggregated score, or ``None`` when no pair yields a score.
    """
    turn_getter = Conversation(uuid) # getting conversations from the XML files using Simon's function

    # min_toks used to be accepted but never forwarded, so callers could
    # not change the minimum turn length.
    scores = [
        get_score_for_pair_with_similarity_threshold(
            this_turn, next_turn, min_tokens=min_toks)
        for this_turn, next_turn in turn_getter.get_pairwise_turns()
    ]

    scores = [score for score in scores if score is not None]

    if not scores:
        return None

    return aggregate(scores)

In [None]:
# BERT-similarity model, aggregated by mean and by max over turn pairs.
bert_mean_scores = df.uuid.apply(lambda conv_id: get_bert_prediction(conv_id))
df = df.assign(bert_prediction=bert_mean_scores)

bert_max_scores = df.uuid.apply(
    lambda conv_id: get_bert_prediction(conv_id, aggregate=max))
df = df.assign(bert_max_prediction=bert_max_scores)


In [None]:
# correlation

# Correlation of the rescaled human rating with the mean-aggregated and
# then the max-aggregated BERT scores.
human_normed = df["normed_D1"]
print(human_normed.corr(df["bert_prediction"]))
print(human_normed.corr(df["bert_max_prediction"]))


In [None]:
# Range and median of the mean-aggregated, then the max-aggregated,
# BERT scores.
bert_mean_column = df["bert_prediction"]
print(bert_mean_column.max())
print(bert_mean_column.min())
print(bert_mean_column.median())

bert_max_column = df["bert_max_prediction"]
print(bert_max_column.max())
print(bert_max_column.min())
print(bert_max_column.median())


In [None]:
# Density histogram (10 bins) of the mean-aggregated BERT scores.
plt.hist(
    df["bert_prediction"],
    bins=10,
    density=True,
    histtype='bar',
    facecolor='g',
    alpha=0.5,
)

plt.show()

In [None]:
# assign label for BERT

def assign_bert_label(value, threshold):
    """Map a score to "High" when it is strictly above ``threshold``, else "Low".

    Any value for which ``value > threshold`` is false (NaN included) is
    labelled "Low".
    """
    # callers pass the dataset median as the threshold
    return "High" if value > threshold else "Low"

In [None]:
# assign labels; each column gets its own hard-coded threshold
df = df.assign(bert_prediction_label =
               df.bert_prediction.apply(lambda x: assign_bert_label(x, threshold=11)))

# The max-aggregated labels are derived from bert_max_prediction; they were
# previously computed from bert_prediction, i.e. the mean scores again.
df = df.assign(bert_max_prediction_label =
               df.bert_max_prediction.apply(lambda x: assign_bert_label(x, threshold=21)))

#Generate the confusion matrix
Y_bert = df.bert_prediction_label
Y_bert_max = df.bert_max_prediction_label
# cm_bert = confusion_matrix(X, Y_bert)

# calculate f1 score for the "High" class against the human labels in X
f1_bert = f1_score(Y_bert, X, average='binary', pos_label="High")
f1_bert_max = f1_score(Y_bert_max, X, average='binary', pos_label="High")

print("bert: ", f1_bert)
print("bert-max: ", f1_bert_max)

In [None]:
df

In [None]:
# z-scored copies of selected prediction columns (missing values are
# skipped when computing mean and spread): new column -> source column.
z_score_sources = {
    "z_baseline_raw_max": "baseline_raw_max_prediction",
    "z_baseline_lemma_max": "baseline_lemmatized_max_prediction",
    "z_sentence_similarity_raw_max": "sentence_similarity_max_prediction",
    "z_bert_prediction": "bert_prediction",
}
for z_column, source_column in z_score_sources.items():
    df = df.assign(**{z_column: stats.zscore(df[source_column], nan_policy='omit')})


In [None]:
# Write the full frame (original columns plus everything added above) to a
# new UTF-8 CSV file, without the row index.
df.to_csv("focal_ela_science_NLP.csv", encoding='utf-8', index=False)
