In [3]:
import os
os.chdir(r"C:\Users\Junaid Ur Rehman\Documents\Master of Business Analytics\Semester 5\Applications of Data Science\Final Project")


In [2]:
import numpy as np
from itertools import chain, tee
from nltk.stem import PorterStemmer
import os

# Set current working directory if needed
# os.chdir(r"C:\Users\Junaid Ur Rehman\Documents\Master of Business Analytics\Semester 5\Applications of Data Science\Final Project")

# Build the module-level stopword set used by choi_loader(remove_stop=True).
stopword_set = set()
# Path is relative to the current working directory.
stopword_file_path = "Github Code/data/STOPWORD.list"

# One stopword per line; surrounding whitespace is stripped.
# NOTE: if the file is missing this is skipped silently and the set ends up
# containing only the punctuation tokens added below.
if os.path.exists(stopword_file_path):
    with open(stopword_file_path) as f:
        for line in f:
            stopword_set.add(line.strip())

# Treat punctuation tokens as stopwords as well.
stopword_set.update(["''", ",", ".", "``", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", "/",
                     ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"])

stemmer = PorterStemmer()

CHOI_TEMPLATE = "./Choi Dataset/naacl00Exp/data/{}/{}/{}.ref"

def choi_loader(doc, tp, ref, word_cut=0, remove_stop=False, stem=False):
    """Load one Choi reference document.

    The file path is ``CHOI_TEMPLATE.format(doc, tp, ref)``.  Segments are
    separated by ``==========`` lines and every remaining line is treated as
    one whitespace-tokenised sentence.

    Args:
        doc, tp, ref: The three fields substituted into ``CHOI_TEMPLATE``.
        word_cut: Sentences with ``word_cut`` or fewer remaining words are
            dropped.
        remove_stop: Drop words that are in the module-level ``stopword_set``.
        stem: Apply the stopword test to the Porter stem of each word.  The
            returned words themselves are never stemmed, so this flag has no
            effect unless ``remove_stop`` is also set.

    Returns:
        A list of parts; each part is a list of sentences; each sentence is a
        list of lower-cased words.

    Raises:
        FileNotFoundError: If the reference file does not exist.
    """
    path = CHOI_TEMPLATE.format(doc, tp, ref)
    try:
        with open(path) as f:
            text = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {path} not found. Please check the file paths.") from err

    def is_valid(word):
        # `word` arrives already lower-cased, so capitalised stopwords such as
        # "The" are matched against the stopword list too.
        if stem:
            word = stemmer.stem(word)
        return not (remove_stop and word in stopword_set)

    raw_parts = [chunk.splitlines() for chunk in text.split("==========\n") if chunk]

    tokenised = [
        [[word for word in (tok.lower() for tok in sent.split()) if is_valid(word)]
         for sent in part]
        for part in raw_parts
    ]

    return [[sent for sent in part if len(sent) > word_cut] for part in tokenised]

ARX_TEMPLATE = "./Github Code/data/arxiv/{:03d}.ref"

# def arx_loader(num):
#     """ Load an arxiv document from the dataset,
#         returns a list of parts
#         each part is a list of words """
#     try:
#         with open(ARX_TEMPLATE.format(num)) as f:
#             doc = f.read()
#     except FileNotFoundError:
#         raise FileNotFoundError(f"File {ARX_TEMPLATE.format(num)} not found. Please check the file paths.")

#     return [[[x] for x in x.split()] for x in doc.split("BR")]

def arx_loader(num):
    """Load an arXiv document by number.

    The file path is ``ARX_TEMPLATE.format(num)``.  The text is split into
    parts on the literal marker ``BR``; each part is split on whitespace and
    every word is wrapped in its own one-element list, so the result has the
    same parts/sentences/words nesting as a Choi document.

    Args:
        num: Document number substituted into ``ARX_TEMPLATE``.

    Returns:
        A list of parts; each part is a list of single-word "sentences".

    Raises:
        FileNotFoundError: If the document file does not exist.
    """
    path = ARX_TEMPLATE.format(num)
    try:
        # errors="ignore" silently drops bytes that are not valid UTF-8, so no
        # UnicodeDecodeError can be raised by the read below.
        with open(path, encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {path} not found. Please check the file paths.") from err

    return [[[word] for word in part.split()] for part in text.split("BR")]

def allchoi(set, *args, **kwargs):
    """Yield every document of one Choi subset, loaded with choi_loader.

    ``set`` is one of "3-5", "6-8", "9-11" or "3-11"; any other value yields
    nothing.  Folders 1 and 2 contribute documents 0-49 each, and the "3-11"
    subset additionally contributes documents 0-299 from folder 3.  Extra
    positional and keyword arguments are forwarded to choi_loader.
    """
    if set not in ("3-5", "6-8", "9-11", "3-11"):
        return

    for folder in (1, 2):
        for index in range(50):
            yield choi_loader(folder, set, index, *args, **kwargs)

    if set == "3-11":
        for index in range(300):
            yield choi_loader(3, set, index, *args, **kwargs)

def collapse(doc):
    """Flatten a document into one space-separated string."""
    part_strings = []
    for part in doc:
        sentence_strings = [" ".join(sent) for sent in part]
        part_strings.append(" ".join(sentence_strings))
    return " ".join(part_strings)

def collapse_sents(doc):
    """Return all sentences of the document as one flat list."""
    sentences = []
    for part in doc:
        sentences.extend(part)
    return sentences

def collapse_words(doc):
    """Return all words of the document as one flat list."""
    words = []
    for part in doc:
        for sent in part:
            words.extend(sent)
    return words

def word_iter(doc):
    """Lazily yield every word of the document, in reading order."""
    for part in doc:
        for sent in part:
            yield from sent

def sent_iter(doc):
    """Lazily yield every sentence of the document, in reading order."""
    for part in doc:
        yield from part

def refsplit(doc):
    """Return the reference split points of the document, counted in words.

    Each entry is the cumulative number of words up to and including one
    part, so the last entry is the total word count.
    """
    words_per_part = [sum(1 for _ in chain.from_iterable(part)) for part in doc]
    # The leading 1 (removed again by the "- 1") keeps the cumulative sum
    # non-empty even for a document with no parts.
    ends = np.cumsum([1] + words_per_part) - 1
    return ends[1:-1].tolist() + [ends[-1]]

def refsplit_sent(doc):
    """Return the reference split points of the document, counted in sentences.

    Each entry is the cumulative number of sentences up to and including one
    part, so the last entry is the total sentence count.
    """
    sents_per_part = [sum(1 for _ in part) for part in doc]
    # The leading 1 (removed again by the "- 1") keeps the cumulative sum
    # non-empty even for a document with no parts.
    ends = np.cumsum([1] + sents_per_part) - 1
    return ends[1:-1].tolist() + [ends[-1]]

# A testing document in the same structure as a choi doc:
# ten parts, each made of one tokenised sentence repeated 3-5 times.
# NOTE: `[sentence] * n` repeats a reference to the SAME list object, so
# mutating one sentence of a part in place changes all of its copies.
testdoc = [["this is the first sentence".split()] * 5,
           ["second sentence same as the first".split()] * 3,
           ["the blue fish went to the market".split()] * 4,
           ["once upon a midnight dreary with my pack".split()] * 5,
           ["while i pondered weak and weary".split()] * 3,
           ["one fish two fish three fish blue fish".split()] * 5,
           ["pack it up pack it in let me begin".split()] * 3,
           ["i came to win battle me that is a sin to begin".split()] * 5,
           ["and think about how ravens and writing desks".split()] * 4,
           ["other people are people too not ravens or fish".split()] * 3]

def pairwise(iterable):
    """Return an iterator over consecutive pairs: (s0, s1), (s1, s2), ..."""
    leading, trailing = tee(iterable)
    # Advance the second copy by one element so the two are offset by one.
    next(trailing, None)
    return zip(leading, trailing)

def seg_iter(splits):
    """Return an iterator of (start, end) segment bounds for a list of split points.

    A leading 0 is prepended, so the first segment starts at index 0.
    """
    boundaries = [0] + splits
    return pairwise(boundaries)

def length_iter(splits):
    """Return a generator of segment lengths for a list of split points."""
    return (end - start for start, end in seg_iter(splits))

def xnor(a, b):
    """Logical XNOR by truthiness: truthy when a and b agree.

    When both arguments are truthy the second argument itself is returned;
    in every other case the result is a bool.
    """
    if a:
        return b or False
    return not b

def score(hyp, ref, k=None):
    """Score a hypothesis segmentation against a reference with a sliding probe.

    Args:
        hyp: Hypothesis split points (increasing; bins for np.digitize).
        ref: Reference split points; ``ref[-1]`` is taken as the total length.
        k: Probe width.  When falsy it defaults to half the mean reference
            segment length, rounded, minus one.

    Returns:
        The fraction of probe positions ``i`` for which "i and i + k fall in
        the same segment" is true for exactly one of the two segmentations.
    """
    if not k:
        k = int(round(0.5 * ref[-1] / len(ref))) - 1

    probe_starts = np.arange(ref[-1] - k)
    probe_ends = probe_starts + k

    def same_segment(bounds):
        return np.digitize(probe_starts, bounds) == np.digitize(probe_ends, bounds)

    disagreement = np.logical_xor(same_segment(ref), same_segment(hyp))
    return disagreement.mean()

def score_wd(hyp, ref, k=None):
    """Score a hypothesis segmentation by comparing boundary counts per window.

    Args:
        hyp: Hypothesis split points.
        ref: Reference split points; ``ref[-1]`` is taken as the total length.
        k: Window width.  When falsy it defaults to half the mean reference
            segment length, rounded, minus one.

    Returns:
        The fraction of windows ``(i, i + k]`` in which the number of
        reference boundaries differs from the number of hypothesis boundaries.
    """
    if not k:
        k = int(round(0.5 * ref[-1] / len(ref))) - 1

    n_windows = ref[-1] - k
    hyp_bounds = np.asarray(hyp)
    ref_bounds = np.asarray(ref)

    def boundaries_in(bounds, start):
        return ((bounds > start) & (bounds <= start + k)).sum()

    errors = 0.0
    windows = 0.0
    for start in range(n_windows):
        differs = boundaries_in(ref_bounds, start) != boundaries_in(hyp_bounds, start)
        errors += 1.0 * differs
        windows += 1.0
    return errors / windows


In [3]:
"""
Code for figuring out various vector representations of documents
"""

import numpy as np
from collections import defaultdict

def tf_sents(doc):
    """Build a sentence-level term-frequency matrix for the document.

    Returns an array with one row per sentence and one column per distinct
    word; each entry is the number of times that word occurs in the sentence.
    Columns follow the iteration order of the vocabulary set.
    """
    vocabulary = set(word_iter(doc))
    column_of = {word: col for col, word in enumerate(vocabulary)}
    width = len(vocabulary)

    rows = []
    for part in doc:
        for sent in part:
            row = np.zeros(width)
            for word in sent:
                row[column_of[word]] += 1
            rows.append(row)

    return np.array(rows)

def tf_words(doc):
    """Build a word-level one-hot matrix for the document.

    Returns an array with one row per word occurrence and one column per
    distinct word; each row has a single 1 in the column of its word.
    Columns follow the iteration order of the vocabulary set.
    """
    vocabulary = set(word_iter(doc))
    column_of = {word: col for col, word in enumerate(vocabulary)}
    width = len(vocabulary)

    rows = []
    for part in doc:
        for sent in part:
            for word in sent:
                one_hot = np.zeros(width)
                one_hot[column_of[word]] += 1
                rows.append(one_hot)

    return np.array(rows)

def vec_sents(doc, word_lookup, wordreps):
    """Build one embedding per sentence by averaging its word vectors.

    Args:
        doc: Parts -> sentences -> words.
        word_lookup: Mapping from word to row index in ``wordreps``; words
            that are missing (or map to a negative index) are skipped.
        wordreps: 2-D array of word vectors.

    Returns:
        An array with one row per sentence.  A zero vector is always included
        in the average, so a sentence with no known words gives a zero row
        and the mean is taken over (known words + 1) vectors.
    """
    sentence_vectors = []
    for part in doc:
        for sent in part:
            stack = [np.zeros(wordreps.shape[1])]
            indices = (word_lookup.get(word, -1) for word in sent)
            stack.extend(wordreps[pk] for pk in indices if pk >= 0)
            sentence_vectors.append(np.mean(stack, axis=0))

    return np.array(sentence_vectors)

def vec_words(doc, word_lookup, wordreps):
    """Build one embedding per word occurrence.

    Args:
        doc: Parts -> sentences -> words.
        word_lookup: Mapping from word to row index in ``wordreps``.
        wordreps: 2-D array of word vectors.

    Returns:
        An array with one row per word, in reading order.  Words that are
        missing from ``word_lookup`` (or map to a negative index) get a zero
        row.
    """
    rows = []
    for part in doc:
        for sent in part:
            for word in sent:
                pk = word_lookup.get(word, -1)
                row = wordreps[pk] if pk >= 0 else np.zeros(wordreps.shape[1])
                rows.append(row)

    return np.array(rows)

def vectop_sents(doc, word_lookup, wordreps):
    """Build one count vector per sentence over integer word labels.

    Args:
        doc: Parts -> sentences -> words.
        word_lookup: Mapping from word to index in ``wordreps``; unknown
            words are ignored.
        wordreps: Array giving an integer label for every word index; it is
            used directly as a position in the count vector.

    Returns:
        An array with one row per sentence and ``wordreps.max() + 1``
        columns; entry ``[s, c]`` counts the known words of sentence ``s``
        whose label is ``c``.
    """
    n_labels = wordreps.max() + 1
    rows = []
    for part in doc:
        for sent in part:
            counts = np.zeros(n_labels)
            for word in sent:
                if word_lookup.get(word, -1) >= 0:
                    counts[wordreps[word_lookup[word]]] += 1
            rows.append(counts)

    return np.array(rows)

def vecdf_sents(doc, word_lookup, wordreps, dfcounter, n_docs=500.):
    """Build one idf-weighted embedding per sentence.

    Every known word contributes ``log(n_docs / df) * wordreps[index]`` where
    ``df = dfcounter.get(word, 1.0)``; the sentence vector is the mean of
    these contributions together with one zero vector.

    Args:
        doc: Parts -> sentences -> words.
        word_lookup: Mapping from word to row index in ``wordreps``; words
            that are missing (or map to a negative index) are skipped.
        wordreps: 2-D array of word vectors.
        dfcounter: Mapping from word to document frequency.
        n_docs: Corpus size used in the idf weight.  Defaults to 500, the
            value that used to be hard-coded; pass the number of documents
            ``dfcounter`` was actually built from.

    Returns:
        An array with one row per sentence.
    """
    vecs = []
    for part in doc:
        for sent in part:
            # The leading zero vector keeps the mean defined for sentences
            # without any known word.
            wordvecs = [np.zeros(wordreps.shape[1])]
            for word in sent:
                pk = word_lookup.get(word, -1)
                if pk >= 0:
                    idf = np.log(n_docs / (dfcounter.get(word, 1.0) + 0.0))
                    wordvecs.append(idf * wordreps[pk])
            vecs.append(np.mean(wordvecs, axis=0))

    return np.array(vecs)

def vecdf_words(doc, word_lookup, wordreps, dfcounter, n_docs=500.):
    """Build one idf-weighted embedding per word occurrence.

    A known word becomes ``log(n_docs / df) * wordreps[index]`` where
    ``df = dfcounter.get(word, 1.0)``; an unknown word becomes a zero row.

    Args:
        doc: Parts -> sentences -> words.
        word_lookup: Mapping from word to row index in ``wordreps``.
        wordreps: 2-D array of word vectors.
        dfcounter: Mapping from word to document frequency.
        n_docs: Corpus size used in the idf weight.  Defaults to 500, the
            value that used to be hard-coded; pass the number of documents
            ``dfcounter`` was actually built from.

    Returns:
        An array with one row per word, in reading order.
    """
    vecs = []
    for part in doc:
        for sent in part:
            for word in sent:
                pk = word_lookup.get(word, -1)
                if pk >= 0:
                    idf = np.log(n_docs / (dfcounter.get(word, 1.0) + 0.0))
                    vecs.append(idf * wordreps[pk])
                else:
                    vecs.append(np.zeros(wordreps.shape[1]))
    return np.array(vecs)


# Sample document (list of parts, each part is a list of sentences, each sentence is a list of words)
test_doc = [[["this", "is", "a", "test"], ["another", "sentence"]], [["more", "words", "here"]]]

# Smoke test of tf_sents: expect 3 rows (one per sentence) and 9 columns (one
# per distinct word). The column order comes from iterating a set of strings,
# so it is not guaranteed to be the same from one interpreter run to the next.
tf_vector = tf_sents(test_doc)
print("TF Representation for Sentences:\n", tf_vector)


TF Representation for Sentences:
 [[1. 0. 0. 1. 1. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 1. 0. 0. 1.]]


In [6]:
""
import numpy as np
from scipy.ndimage import generic_filter
from scipy.spatial.distance import cdist
from numpy.random import rand

####################
# C99
####################
'''
def rankkern(x):
    """The kernel for the rank transformation, measures the fraction of the neighbors that
    take on a value less than the middle value."""
    n = x.size
    mid = n // 2
    better = ((x >= 0) & (x < x[mid])).sum()
    return better / ((x >= 0).sum() - 1.0)'''
    
def rankkern(x):
    """Rank kernel: fraction of valid neighbours smaller than the centre value.

    ``x`` is the flattened filter window.  Negative entries are treated as
    padding and ignored.  The denominator is the number of valid entries
    minus one; when that is zero the result is 0.
    """
    centre = x[x.size // 2]
    valid = x >= 0
    smaller = (valid & (x < centre)).sum()
    others = valid.sum() - 1.0
    if others == 0:
        return 0
    return smaller / others

def rankify(mat, size=11):
    """Apply the rank transformation to ``mat`` with a ``size`` x ``size`` window.

    Positions outside the matrix are filled with -1, which rankkern treats
    as padding.
    """
    window = (size, size)
    return generic_filter(mat, rankkern, size=window, mode='constant', cval=-1)
    
def c99score(distsmat, hyp, minlength=1, maxlength=None):
    """Score a hypothesis splitting C99-style (lower is better).

    Args:
        distsmat: Square matrix indexed by position on both axes.
        hyp: Sorted split points; the last one closes the final segment.
        minlength: Segments shorter than this are penalised (ignored if falsy).
        maxlength: Segments longer than this are penalised (ignored if falsy).

    Returns:
        Minus the summed within-segment blocks of ``distsmat`` divided by the
        summed squared segment lengths.  A segment violating a length limit
        adds ``-inf`` to the numerator, which makes the score ``+inf``.
    """
    within_sum = 0.0
    area = 0.0
    for start, stop in seg_iter(hyp):
        seg_len = stop - start
        within_sum += distsmat[start:stop, start:stop].sum()
        area += seg_len ** 2
        too_short = minlength and seg_len < minlength
        too_long = maxlength and seg_len > maxlength
        if too_short:
            within_sum += -np.inf
        if too_long:
            within_sum += -np.inf
    return -within_sum / (area + 0.0)

def c99split(distsmat, k, rank=0, *args, **kwargs):
    """Greedy Choi-style C99 splitting.

    Args:
        distsmat: Square matrix indexed by position on both axes.
        k: Number of split points to add.
        rank: If positive, ``distsmat`` is first passed through rankify with
            this filter size.
        *args, **kwargs: Forwarded to c99score (e.g. minlength, maxlength).

    Returns:
        ``(splits, score)``: the sorted split points (always ending with N,
        the matrix size) and the c99score of that splitting.  With ``k == 0``
        this is the single-segment splitting ``[N]`` and its score.

    Raises:
        ValueError: If a step has no unused candidate position left.
    """
    if rank:
        distsmat = rankify(distsmat, rank)

    N = distsmat.shape[0]
    splits = [N]
    # Score of the unsplit document, so that k == 0 has a defined result.
    best_score = c99score(distsmat, splits, *args, **kwargs)
    for _ in range(k):
        used = set(splits)  # built once per step, not once per candidate
        # NOTE(review): candidates stop at N - 2, whereas greedysplit also
        # tries N - 1; kept as-is, confirm whether that is intended.
        best_score, splits = min(
            (c99score(distsmat, sorted(splits + [i]), *args, **kwargs), splits + [i])
            for i in range(1, N - 1) if i not in used
        )
    return sorted(splits), best_score

####################
# DP
####################

def gensig_euclidean(X, minlength=1, maxlength=None):
    """Build the segment cost "sum of squared distances to the segment mean".

    Args:
        X: 2-D array, one row per position.
        minlength: Segments shorter than this cost ``inf`` (ignored if falsy).
        maxlength: Segments longer than this cost ``inf`` (ignored if falsy).

    Returns:
        A function ``sigma(i, j)`` giving the cost of the segment of rows
        ``i`` (inclusive) to ``j`` (exclusive), computed from prefix sums.
    """
    prefix_sum = X.cumsum(0)
    prefix_sqnorm = (X ** 2).sum(1).cumsum(0)

    def sigma(i, j):
        length = j - i
        if minlength and length < minlength:
            return np.inf
        if maxlength and length > maxlength:
            return np.inf

        last = j - 1
        if i == 0:
            seg_sqnorm = prefix_sqnorm[last]
            seg_sum = prefix_sum[last]
        else:
            seg_sqnorm = prefix_sqnorm[last] - prefix_sqnorm[i - 1]
            seg_sum = prefix_sum[last] - prefix_sum[i - 1]
        # sum ||x||^2 - ||sum x||^2 / n
        return seg_sqnorm - 1. / length * (seg_sum ** 2).sum()

    return sigma

def gensig_cosine(X, minlength=1, maxlength=None):
    """Build the segment cost "sum of squared cosine distances to the segment mean".

    Args:
        X: 2-D array, one row per position.
        minlength: Segments shorter than this cost ``inf`` (ignored if falsy).
        maxlength: Segments longer than this cost ``inf`` (ignored if falsy).

    Returns:
        A function ``sigma(a, b)`` giving the cost of the segment of rows
        ``a`` (inclusive) to ``b`` (exclusive).  Segments with fewer than two
        rows always cost ``inf``.
    """
    def sigma(a, b):
        length = b - a
        if minlength and length < minlength:
            return np.inf
        if maxlength and length > maxlength:
            return np.inf
        # Reject degenerate segments before touching the data, so an empty
        # slice is never averaged.
        if length < 2:
            return np.inf
        segment = X[a:b]
        centroid = segment.mean(0)
        return (cdist(segment, [centroid], 'cosine') ** 2).sum()

    return sigma

def gensig_model(X, minlength=1, maxlength=None, lam=0.0):
    """Build the segment cost used by the greedy/refine evaluation scripts.

    Args:
        X: 2-D array of shape (N, D), one row per position.
        minlength: Segments shorter than this cost ``inf`` (ignored if falsy).
        maxlength: Segments longer than this cost ``inf`` (ignored if falsy).
        lam: Accepted but not used by this implementation.

    Returns:
        A function ``sigma(a, b)`` giving, for rows ``a`` (inclusive) to ``b``
        (exclusive), minus the sum of absolute values of the segment's summed
        vector, divided by ``sqrt(D)``.
    """
    _, D = X.shape
    scale = 1. / np.sqrt(D)
    prefix = np.cumsum(X, 0)

    def sigma(a, b):
        length = b - a
        if minlength and length < minlength:
            return np.inf
        if maxlength and length > maxlength:
            return np.inf

        if a > 0:
            seg_sum = prefix[b - 1] - prefix[a - 1]
        else:
            seg_sum = prefix[b - 1]
        return -scale * np.abs(seg_sum).sum()

    return sigma

def dpsplit(n, k, sig):
    """Find the cost-optimal segmentation with dynamic programming.

    Args:
        n: Number of positions.
        k: Number of interior split points (the result has ``k + 1`` segments).
        sig: Cost function ``sig(i, j)`` for the segment ``[i, j)``.

    Returns:
        ``(splits, cost)``: the sorted split points, always ending with ``n``,
        and the total cost of that segmentation.  With ``k == 0`` this is
        ``([n], sig(0, n))``.
    """
    K = k + 1
    N = n
    # segtable[j, s]: best cost of covering positions 0..j with s splits.
    segtable = np.full((N, K), np.nan)
    segtable[:, 0] = [sig(0, j + 1) for j in range(N)]
    # segindtable[j, s]: start of the last segment in that best solution
    # (-1 where no solution was computed).
    segindtable = np.full((N, K), -1, dtype='int')

    for nsplits in range(1, K):
        for j in range(nsplits, N):
            cost, start = min(
                ((segtable[l, nsplits - 1] + sig(l + 1, j + 1), l + 1)
                 for l in range(nsplits - 1, j)),
                key=lambda x: x[0],
            )
            segtable[j, nsplits] = cost
            segindtable[j, nsplits] = start

    # Walk back from the end, one split per column.  Column 0 holds no split,
    # so with k == 0 the path stays empty instead of picking up the -1 filler.
    path = []
    current_pointer = N
    for nsplits in range(K - 1, 0, -1):
        current_pointer = segindtable[current_pointer - 1, nsplits]
        path.append(current_pointer)

    return sorted(path + [N]), segtable[-1, K - 1]

####################
# Greedy
####################

def greedysplit(n, k, sigma):
    """Add ``k`` split points greedily, one at a time.

    At every step each unused position in ``1 .. n-1`` is tried and the one
    giving the lowest total segment cost is kept.

    Args:
        n: Number of positions.
        k: Number of split points to add.
        sigma: Cost function ``sigma(a, b)`` for the segment ``[a, b)``.

    Returns:
        ``(splits, cost)``: the sorted split points, always ending with ``n``,
        and the total cost of that segmentation.
    """
    splits = [n]
    s = sigma(0, n)

    def total_cost(bounds):
        return sum(sigma(a, b) for (a, b) in seg_iter(sorted(bounds)))

    remaining = k
    while remaining > 0:
        taken = set(splits)
        s, splits = min(
            (total_cost(splits + [i]), splits + [i])
            for i in range(1, n) if i not in taken
        )
        remaining -= 1
    return sorted(splits), s

def refine(splits, sigma, n=1):
    """Refine split points for at most ``n`` passes.

    In each pass every interior split point is replaced by the best split
    (per bestsplit) of the region between its two neighbouring boundaries as
    they were at the start of the pass.  The final boundary is kept.
    Refinement stops early once a pass changes nothing.

    Args:
        splits: Sorted split points; the last one is the end of the data.
        sigma: Cost function ``sigma(a, b)`` for the segment ``[a, b)``.
        n: Maximum number of passes.

    Returns:
        The refined list of split points (same length as ``splits``).
    """
    oldsplits = splits[:]
    counter = 0

    while counter < n:
        bounds = [0] + splits
        # Kept in its own variable: reusing `n` here would overwrite the pass
        # budget that the loop condition tests.
        interior = len(bounds) - 2
        new = [bestsplit(bounds[i], bounds[i + 2], sigma)[2] for i in range(interior)]
        new.append(bounds[-1])
        splits = new

        if splits == oldsplits:
            break
        oldsplits = splits[:]
        counter += 1

    return splits

def bestsplit(low, high, sigma, minlength=1, maxlength=None):
    """Find the best single split of the region ``[low, high)``.

    Args:
        low, high: Bounds of the region.
        sigma: Cost function ``sigma(a, b)`` for the segment ``[a, b)``.
        minlength: Only used to reject regions shorter than ``2 * minlength``;
            the candidate positions themselves are not restricted by it.
        maxlength: Accepted but not used.

    Returns:
        ``(left_cost, right_cost, j)`` for the position ``j`` in
        ``low+1 .. high-1`` with the lowest summed cost (first one on ties),
        or ``(inf, inf, low)`` when the region is too short.
    """
    if high - low < 2 * minlength:
        return (np.inf, np.inf, low)

    def candidates():
        for j in range(low + 1, high):
            yield sigma(low, j), sigma(j, high), j

    return min(candidates(), key=lambda cand: cand[0] + cand[1])

In [7]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

import re
import sys

# Make sure you provide arguments or set them directly
# K = int(sys.argv[1]) if len(sys.argv) > 1 else 5  # Defaulting to 5 splits if no argument is provided
# infile = sys.argv[2] if len(sys.argv) > 2 else "sample.txt"  # Default input file if not provided
K = 5  # Default number of splits
infile = "sample.txt"  # Default input file name
with open(infile, "r") as f:
    txt = f.read()

# Regular expressions to clean the text
punctuation_pat = re.compile(r"""([!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~])""")
hyphenline_pat = re.compile(r"-\s*\n\s*")
multiwhite_pat = re.compile(r"\s+")
cid_pat = re.compile(r"\(cid:\d+\)")
nonlet = re.compile(r"([^A-Za-z0-9 ])")

def clean_text(txt):
    """Normalise raw text into a single space-separated token string.

    Steps, in order: lower-case; replace ``(cid:N)`` markers with ``UNK``;
    join words hyphenated across a line break; pad punctuation with spaces;
    turn newlines into ``NL`` tokens; pad every remaining non-alphanumeric
    character with spaces; collapse whitespace runs; wrap the result in
    ``START`` / ``END`` tokens.
    """
    lowered = txt.lower()
    without_cids = cid_pat.sub(" UNK ", lowered)
    dehyphenated = hyphenline_pat.sub("", without_cids)
    spaced_punct = punctuation_pat.sub(r" \1 ", dehyphenated)
    newline_tokens = spaced_punct.replace("\n", " NL ")
    spaced_symbols = nonlet.sub(r" \1 ", newline_tokens)
    collapsed = multiwhite_pat.sub(" ", spaced_symbols)
    return f"START {collapsed.strip()} END"

txt = clean_text(txt).split()

# Load vectors and vocabulary
vecs = np.load("data/vecs.npy")
words = np.load("data/vocab.npy", allow_pickle=True)
word_lookup = {w: c for c, w in enumerate(words)}

print("Article length:", len(txt))

# Embed the article: keep only tokens that have a vector, and remember where
# each kept token came from so split points can be mapped back to the text.
X = []

# mapper: index of a token in `txt` -> row index of its vector in X
mapper = {}
count = 0
for i, word in enumerate(txt):
    if word in word_lookup:
        mapper[i] = count
        count += 1
        X.append(vecs[word_lookup[word]])

# Reverse mapper: row index in X -> index of the token in `txt`
mapperr = {v: k for k, v in mapper.items()}

X = np.array(X)
print("X length:", X.shape[0])

# Generate segmentation using greedy split
sig = gensig_model(X)
print("Splitting...")
splits, e = greedysplit(X.shape[0], K, sig)
print("Initial Splits:", splits)

# Refine the splits
print("Refining...")
splitsr = refine(splits, sig, 20)
print("Refined Splits:", splitsr)

# Print refined splits with surrounding text for context
print("Printing refined splits... ")

for i, s in enumerate(splitsr[:-1]):
    k = mapperr.get(s, len(txt))
    print("\nSegment:", i, "Split Point:", s)
    print("Context Before:\n", " ".join(txt[max(0, k - 100):k]))
    print("\nContext After:\n", " ".join(txt[k:k + 100]))

# Save results to a file
with open(f"result_{K}.txt", "w") as f:
    prev = 0
    for s in splitsr:
        k = mapperr.get(s, len(txt))
        f.write(" ".join(txt[prev:k]).replace("NL", "\n"))
        f.write("\nBREAK\n")
        prev = k

print("Done")

Article length: 239
X length: 220
Splitting...
Initial Splits: [21, 38, 133, 175, 190, 220]
Refining...
Refined Splits: [24, 38, 133, 181, 190, 220]
Printing refined splits... 

Segment: 0 Split Point: 24
Context Before:
 START the quick brown fox jumps over the lazy dog . the dog , being lazy , just watched the fox jump . NL in a

Context After:
 distant forest , animals of all kinds gathered under the tall , ancient trees . they were NL discussing the arrival of the new season . it was the time when the leaves turned golden , and NL the air became crisp . NL NL birds sang their songs , announcing that it was a season of change . the wise owl , sitting on NL a high branch , listened carefully to every voice . the deer were gathered near the stream , NL drinking the cool water . rabbits hopped around , playing in the fallen leaves ,

Segment: 1 Split Point: 38
Context Before:
 START the quick brown fox jumps over the lazy dog . the dog , being lazy , just watched the fox jump . NL in a

In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans

# Load GloVe Embeddings
print("Loading GloVe embeddings...")
words = np.load("data/vocab.npy", allow_pickle=True)
vecs = np.load("data/vecs.npy")
print("GloVe embeddings loaded successfully!")

# Create word lookup dictionary
word_lookup = {w: i for i, w in enumerate(words)}

# Function to transform sentences using tf-idf weighting
def tfidf_transform(vectors):
    """Fit a fresh TfidfTransformer on ``vectors`` and return the result as a dense array."""
    weighted = TfidfTransformer().fit_transform(vectors)
    return weighted.toarray()

# Function to cluster GloVe embeddings using K-means for C99k50, C99k200
def get_clustered_vectors(word_vectors, n_clusters):
    """Cluster ``word_vectors`` with KMeans (random_state=42).

    Returns ``(cluster_centers, labels)``: the fitted cluster centres and the
    predicted cluster index of every input row.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(word_vectors)
    return model.cluster_centers_, model.predict(word_vectors)

Loading GloVe embeddings...
GloVe embeddings loaded successfully!


In [96]:
# Load Choi dataset (3-5 set only)
choi_documents = list(allchoi("3-5"))

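### Configuration 1: oC99 (cosine similarity without tf-idf or clustering)
# NOTE(review): despite the label, the loop below averages word embeddings with
# vec_sents and segments with greedysplit over gensig_model; neither a cosine
# distance nor c99split is used.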
print("oC99 (Choi 3-5)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vec_sents(doc, word_lookup, vecs)  # Direct cosine-based vectors
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 8, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

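### Configuration 2: oC99tf (tf-based without idf weighting)
# NOTE(review): with dfcounter={} every word gets the same weight log(500), so
# these vectors are the Configuration 1 vectors multiplied by a constant.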
print("oC99tf (Choi 3-5)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

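### Configuration 3: oC99tfidf (tf-idf with idf weighting)
# NOTE(review): TfidfTransformer is applied to dense sentence embeddings here;
# scikit-learn documents it as a transform for term-count matrices -- confirm
# this is the intended weighting.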
print("oC99tfidf (Choi 3-5)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    vectors = tfidf_transform(vectors)  # Apply tf-idf weighting
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

oC99 (Choi 3-5)
P_k: 14.10, WD: 14.10
oC99tf (Choi 3-5)
P_k: 13.63, WD: 13.63
oC99tfidf (Choi 3-5)
P_k: 13.23, WD: 13.23


In [None]:
### Configurations 4 and 5: oC99k50 / oC99k200
### (KMeans clustering of the word embeddings with 50 and 200 clusters)
# Each word embedding is replaced by the centre of the KMeans cluster it was
# assigned to, and sentence vectors are then built exactly as in the other
# configurations (vec_sents).  This keeps one row per sentence, which is what
# the sentence-level reference from refsplit_sent requires.
for n_clusters in (50, 200):
    print(f"oC99k{n_clusters} (Choi 3-5)")
    pk_scores, wd_scores = [], []
    clustered_centers, word_labels = get_clustered_vectors(vecs, n_clusters=n_clusters)
    # Row i is the cluster centre of word i, so word_lookup indices still apply.
    clustered_wordreps = clustered_centers[word_labels]
    for doc in choi_documents:
        clustered_vectors = vec_sents(doc, word_lookup, clustered_wordreps)
        sig = gensig_model(clustered_vectors)
        splits, _ = greedysplit(clustered_vectors.shape[0], 9, sig)
        ref_splits = refsplit_sent(doc)
        pk_scores.append(score(splits, ref_splits))
        wd_scores.append(score_wd(splits, ref_splits))
    P_k = np.mean(pk_scores) * 100
    WD = np.mean(wd_scores) * 100
    print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

oC99k50 (Choi 3-5)


In [108]:
# Load Choi dataset (6-8 set only)
choi_documents = list(allchoi("6-8"))

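### Configuration 1: oC99 (cosine similarity without tf-idf or clustering)
# NOTE(review): despite the label, the loop below averages word embeddings with
# vec_sents and segments with greedysplit over gensig_model; neither a cosine
# distance nor c99split is used.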
print("oC99 (Choi 6-8)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vec_sents(doc, word_lookup, vecs)  # Direct cosine-based vectors
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 10, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

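### Configuration 2: oC99tf (tf-based without idf weighting)
# NOTE(review): with dfcounter={} every word gets the same weight log(500), so
# these vectors are the Configuration 1 vectors multiplied by a constant.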
print("oC99tf (Choi 6-8)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 10, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

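### Configuration 3: oC99tfidf (tf-idf with idf weighting)
# NOTE(review): TfidfTransformer is applied to dense sentence embeddings here;
# scikit-learn documents it as a transform for term-count matrices -- confirm
# this is the intended weighting.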
print("oC99tfidf (Choi 6-8)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    vectors = tfidf_transform(vectors)  # Apply tf-idf weighting
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 10, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

oC99 (Choi 6-8)
P_k: 14.40, WD: 16.24
oC99tf (Choi 6-8)
P_k: 14.40, WD: 16.24
oC99tfidf (Choi 6-8)
P_k: 14.42, WD: 16.42


In [10]:
# Load Choi dataset (9-11 set only)
choi_documents = list(allchoi("9-11"))

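### Configuration 1: oC99 (cosine similarity without tf-idf or clustering)
# NOTE(review): despite the label, the loop below averages word embeddings with
# vec_sents and segments with greedysplit over gensig_model; neither a cosine
# distance nor c99split is used.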
print("oC99 (Choi 9-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vec_sents(doc, word_lookup, vecs)  # Direct cosine-based vectors
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 10, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

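### Configuration 2: oC99tf (tf-based without idf weighting)
# NOTE(review): with dfcounter={} every word gets the same weight log(500), so
# these vectors are the Configuration 1 vectors multiplied by a constant.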
print("oC99tf (Choi 9-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

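### Configuration 3: oC99tfidf (tf-idf with idf weighting)
# NOTE(review): TfidfTransformer is applied to dense sentence embeddings here;
# scikit-learn documents it as a transform for term-count matrices -- confirm
# this is the intended weighting.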
print("oC99tfidf (Choi 9-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    vectors = tfidf_transform(vectors)  # Apply tf-idf weighting
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

oC99 (Choi 9-11)
P_k: 13.81, WD: 16.12
oC99tf (Choi 9-11)
P_k: 13.50, WD: 15.12
oC99tfidf (Choi 9-11)
P_k: 14.54, WD: 16.35


In [130]:
# Load Choi dataset (3-11)
choi_documents = list(allchoi("3-11"))

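### Configuration 1: oC99 (cosine similarity without tf-idf or clustering)
# NOTE(review): despite the label, the loop below averages word embeddings with
# vec_sents and segments with greedysplit over gensig_model; neither a cosine
# distance nor c99split is used.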
print("oC99 (Choi 3-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vec_sents(doc, word_lookup, vecs)  # Direct cosine-based vectors
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

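### Configuration 2: oC99tf (tf-based without idf weighting)
# NOTE(review): with dfcounter={} every word gets the same weight log(500), so
# these vectors are the Configuration 1 vectors multiplied by a constant.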
print("oC99tf (Choi 3-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

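### Configuration 3: oC99tfidf (tf-idf with idf weighting)
# NOTE(review): TfidfTransformer is applied to dense sentence embeddings here;
# scikit-learn documents it as a transform for term-count matrices -- confirm
# this is the intended weighting.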
print("oC99tfidf (Choi 3-11)")
pk_scores, wd_scores = [], []
for doc in choi_documents:
    vectors = vecdf_sents(doc, word_lookup, vecs, dfcounter={})  # tf-based representation
    vectors = tfidf_transform(vectors)  # Apply tf-idf weighting
    sig = gensig_model(vectors)
    splits, _ = greedysplit(vectors.shape[0], 9, sig)
    ref_splits = refsplit_sent(doc)
    pk_scores.append(score(splits, ref_splits))
    wd_scores.append(score_wd(splits, ref_splits))
P_k = np.mean(pk_scores) * 100
WD = np.mean(wd_scores) * 100
print(f"P_k: {P_k:.2f}, WD: {WD:.2f}")

oC99 (Choi 3-11)
P_k: 15.00, WD: 16.52
oC99tf (Choi 3-11)
P_k: 15.00, WD: 16.52
oC99tfidf (Choi 3-11)
P_k: 15.84, WD: 17.47


In [11]:
import numpy as np
from sklearn.cluster import KMeans
import os

# Load GloVe embeddings
print("Loading GloVe embeddings...")
words = np.load("vocab1.npy", allow_pickle=True)
vecs = np.load("vecs1.npy")
word_lookup = {word: idx for idx, word in enumerate(words)}
print("GloVe embeddings loaded successfully!")

# Dataset directory
arx_directory = "./Github Code/data/arxiv/"
    
# Helper functions for vector normalization
def normalize_vectors(vectors):
    """Scale every row of ``vectors`` to unit Euclidean length.

    Rows whose norm is zero are returned unchanged (all zeros) instead of
    being divided by zero, which would turn them into NaN.

    Args:
        vectors: 2-D array, one vector per row.

    Returns:
        A new array of the same shape with row-normalised vectors.
    """
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    safe_norms = np.where(norms == 0, 1.0, norms)
    return vectors / safe_norms

# Function to get document vector representation
def get_document_vectors(doc, method="oC99"):
    """Build sentence vectors for ``doc`` with the named method.

    Uses the module-level ``word_lookup``, ``vecs`` and (for the CVS
    variants) ``dfcounter``.

    * "oC99": vec_sents on the raw embeddings.
    * "CVS":  vecdf_sents on the raw embeddings.
    * "CVSn": vecdf_sents on row-normalised embeddings; the whole embedding
      matrix is re-normalised on every call.

    Raises:
        ValueError: If ``method`` is not one of the names above.
    """
    if method not in ("oC99", "CVS", "CVSn"):
        raise ValueError("Unknown method specified.")

    if method == "oC99":
        return vec_sents(doc, word_lookup, vecs)

    wordreps = normalize_vectors(vecs) if method == "CVSn" else vecs
    return vecdf_sents(doc, word_lookup, wordreps, dfcounter=dfcounter)

Loading GloVe embeddings...
GloVe embeddings loaded successfully!


In [12]:
from collections import defaultdict
import os

# Function to build the document frequency counter (dfcounter)
def build_dfcounter(documents):
    """Count, for every word, the number of documents it occurs in.

    Parameters:
    documents: Iterable of documents, each a nested sequence
        document -> part -> sentence -> word.

    Returns:
    defaultdict(int): word -> number of documents containing that word at
    least once (repeats inside one document count a single time).
    """
    doc_frequency = defaultdict(int)
    for document in documents:
        # Collapse the document to its vocabulary so each word counts once.
        vocabulary = set()
        for part in document:
            for sentence in part:
                vocabulary.update(sentence)
        for token in vocabulary:
            doc_frequency[token] += 1
    return doc_frequency

# Load the Arxiv dataset documents
def arx_loader(file_path):
    """Read one arXiv reference file and break it into parts.

    The file is decoded as UTF-8 (undecodable bytes are dropped) and cut at
    every occurrence of the marker "BR". Each piece is split on whitespace and
    every token is wrapped in its own one-element list, so the result is a
    list of parts, each part being a list of single-word lists.

    Raises:
    FileNotFoundError: If *file_path* does not exist; the message names the path.
    """
    try:
        with open(file_path, encoding="utf-8", errors="ignore") as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"File {file_path} not found. Please check the file paths.")

    parts = []
    for chunk in raw_text.split("BR"):
        parts.append([[token] for token in chunk.split()])
    return parts

# Directory and file loading setup
arx_directory = "./Github Code/data/arxiv/"
# os.listdir returns entries in arbitrary order, so sort before slicing:
# otherwise "the first 100 files" (and every score computed from them) can
# differ between runs, machines and file systems.
arx_files = sorted(f for f in os.listdir(arx_directory) if f.endswith('.ref'))

# Process and load each document with arx_loader (first 100 files only)
arx_documents = [arx_loader(os.path.join(arx_directory, file_name)) for file_name in arx_files[:100]]
print(f"Loaded {len(arx_documents)} documents from the Arxiv dataset.")

# Build the document frequency counter (dfcounter)
# get_document_vectors reads this module-level name for the CVS / CVSn methods.
dfcounter = build_dfcounter(arx_documents)
print(f"Document frequency counter built with {len(dfcounter)} unique words.")

Loaded 100 documents from the Arxiv dataset.
Document frequency counter built with 20397 unique words.


In [13]:
def evaluate_segmentation(documents, num_splits, strategy='G', reference_method=None, word_lookup=None, word_reps=None, method='oC99', num_iter=20):
    """
    Evaluate the segmentation of a list of documents using the P_k and WindowDiff metrics.

    Parameters:
    documents (iterable): Documents to segment.
    num_splits (int): Number of splits requested from greedysplit for every document.
    strategy (str): Strategy to use for generating splits. 'G' for Greedy,
        'R' for Greedy followed by refine().
    reference_method (callable): Function mapping a document to its reference splits.
        Required despite the None default.
    word_lookup (dict): Accepted for interface compatibility; currently unused because
        get_document_vectors reads the module-level lookup.
    word_reps (np.ndarray): Accepted for interface compatibility; currently unused for
        the same reason.
    method (str): Method to use for generating document vectors, e.g., 'oC99', 'CVS', 'CVSn'.
    num_iter (int): Number of refinement iterations, if strategy is 'R'.

    Returns:
    tuple: Average P_k and WD scores (multiplied by 100 for percentage);
        (nan, nan) when `documents` is empty.

    Raises:
    ValueError: If `strategy` is neither 'G' nor 'R'.
    TypeError: If `reference_method` is not callable.
    """
    # Fail fast on configuration errors instead of discovering them only after
    # the (expensive) vectors of the first document have been computed.
    if strategy not in ('G', 'R'):
        raise ValueError("Invalid strategy. Use 'G' for Greedy or 'R' for Refined.")
    if not callable(reference_method):
        raise TypeError("reference_method must be a callable that maps a document to its reference splits.")

    p_k_scores = []
    wd_scores = []

    print(f"Evaluating {method} with strategy {strategy}")
    for doc in documents:
        # Generate document vectors based on the specified method
        X = get_document_vectors(doc, method=method)

        # Generate hypothesis splits: both strategies start from the greedy
        # solution, 'R' additionally passes it through refine().
        sig = gensig_model(X)
        hyp_splits, _ = greedysplit(X.shape[0], num_splits, sig)
        if strategy == 'R':
            hyp_splits = refine(hyp_splits, sig, num_iter)

        # Generate reference splits
        ref_splits = reference_method(doc)

        # Calculate P_k and WindowDiff scores and keep them for averaging
        p_k_scores.append(score(hyp_splits, ref_splits))
        wd_scores.append(score_wd(hyp_splits, ref_splits))

    # Calculate average P_k and WD scores; np.mean of an empty list would only
    # emit a RuntimeWarning and return nan, so return nan explicitly.
    if p_k_scores:
        avg_p_k = np.mean(p_k_scores) * 100
        avg_wd = np.mean(wd_scores) * 100
    else:
        avg_p_k = avg_wd = float("nan")

    print(f'{method} - {strategy}: P_k: {avg_p_k:.2f}, WD: {avg_wd:.2f}')
    print()

    return avg_p_k, avg_wd

# Set up parameters
# NOTE(review): evaluate_segmentation accepts word_lookup / word_reps but its body
# never reads them (get_document_vectors is called with only doc and method), so
# these values only feed the keyword arguments of the evaluation calls.
word_lookup = {word: idx for idx, word in enumerate(words)}
word_reps = vecs

In [14]:
print("For X-archive dataset")

# One (num_splits, strategy, method) triple per evaluation, in execution order.
arx_runs = [
    (2, 'G', 'oC99'),
    (2, 'R', 'oC99'),
    (7, 'G', 'CVS'),
    (7, 'R', 'CVS'),
    (8, 'G', 'CVSn'),
    (8, 'R', 'CVSn'),
]

# pk_score / wd_score end up holding the result of the last run, as before.
for run_splits, run_strategy, run_method in arx_runs:
    pk_score, wd_score = evaluate_segmentation(
        arx_documents,
        num_splits=run_splits,
        strategy=run_strategy,
        reference_method=refsplit_sent,
        word_lookup=word_lookup,
        word_reps=word_reps,
        method=run_method,
        num_iter=20,
    )

For X-archive dataset
Evaluating oC99 with strategy G
oC99 - G: P_k: 40.56, WD: 40.57

Evaluating oC99 with strategy R
oC99 - R: P_k: 40.57, WD: 40.60

Evaluating CVS with strategy G
CVS - G: P_k: 30.26, WD: 31.61

Evaluating CVS with strategy R
CVS - R: P_k: 30.15, WD: 31.54

Evaluating CVSn with strategy G
CVSn - G: P_k: 28.21, WD: 29.96

Evaluating CVSn with strategy R
CVSn - R: P_k: 28.48, WD: 30.28



In [None]:
choi_documents = list(allchoi("3-5"))

print("For Choi dataset (3-5)")

# Build the document frequency counter (dfcounter)
# Rebinds the module-level name that get_document_vectors reads for CVS / CVSn.
dfcounter = build_dfcounter(choi_documents)
#print(f"Document frequency counter built with {len(dfcounter)} unique words.")

# One (num_splits, strategy, method) triple per evaluation, in execution order.
# NOTE(review): num_splits differs per method (5 / 7 / 8), so the methods are not
# compared at an equal number of splits -- confirm this is intentional.
choi_runs = [
    (5, 'G', 'oC99'),
    (5, 'R', 'oC99'),
    (7, 'G', 'CVS'),
    (7, 'R', 'CVS'),
    (8, 'G', 'CVSn'),
    (8, 'R', 'CVSn'),
]

# pk_score / wd_score end up holding the result of the last run, as before.
for run_splits, run_strategy, run_method in choi_runs:
    pk_score, wd_score = evaluate_segmentation(
        choi_documents,
        num_splits=run_splits,
        strategy=run_strategy,
        reference_method=refsplit_sent,
        word_lookup=word_lookup,
        word_reps=word_reps,
        method=run_method,
        num_iter=20,
    )

In [241]:
choi_documents = list(allchoi("6-8"))

print("For Choi dataset (6-8)")

# Build the document frequency counter (dfcounter)
# Rebinds the module-level name that get_document_vectors reads for CVS / CVSn.
dfcounter = build_dfcounter(choi_documents)
#print(f"Document frequency counter built with {len(dfcounter)} unique words.")

# One (num_splits, strategy, method) triple per evaluation, in execution order.
# NOTE(review): num_splits differs per method (5 / 7 / 8), so the methods are not
# compared at an equal number of splits -- confirm this is intentional.
choi_runs = [
    (5, 'G', 'oC99'),
    (5, 'R', 'oC99'),
    (7, 'G', 'CVS'),
    (7, 'R', 'CVS'),
    (8, 'G', 'CVSn'),
    (8, 'R', 'CVSn'),
]

# pk_score / wd_score end up holding the result of the last run, as before.
for run_splits, run_strategy, run_method in choi_runs:
    pk_score, wd_score = evaluate_segmentation(
        choi_documents,
        num_splits=run_splits,
        strategy=run_strategy,
        reference_method=refsplit_sent,
        word_lookup=word_lookup,
        word_reps=word_reps,
        method=run_method,
        num_iter=20,
    )

For Choi dataset (6-8)
Evaluating oC99 with strategy G
oC99 - G: P_k: 21.43, WD: 21.75

Evaluating oC99 with strategy R
oC99 - R: P_k: 20.59, WD: 20.90

Evaluating CVS with strategy G
CVS - G: P_k: 13.22, WD: 13.49

Evaluating CVS with strategy R
CVS - R: P_k: 12.54, WD: 12.80

Evaluating CVSn with strategy G
CVSn - G: P_k: 11.25, WD: 11.71

Evaluating CVSn with strategy R
CVSn - R: P_k: 9.91, WD: 10.36



In [None]:
choi_documents = list(allchoi("9-11"))

print("For Choi dataset (9-11)")

# Build the document frequency counter (dfcounter)
# Rebinds the module-level name that get_document_vectors reads for CVS / CVSn.
dfcounter = build_dfcounter(choi_documents)
#print(f"Document frequency counter built with {len(dfcounter)} unique words.")

# One (num_splits, strategy, method) triple per evaluation, in execution order.
# NOTE(review): num_splits differs per method (5 / 7 / 8), so the methods are not
# compared at an equal number of splits -- confirm this is intentional.
choi_runs = [
    (5, 'G', 'oC99'),
    (5, 'R', 'oC99'),
    (7, 'G', 'CVS'),
    (7, 'R', 'CVS'),
    (8, 'G', 'CVSn'),
    (8, 'R', 'CVSn'),
]

# pk_score / wd_score end up holding the result of the last run, as before.
for run_splits, run_strategy, run_method in choi_runs:
    pk_score, wd_score = evaluate_segmentation(
        choi_documents,
        num_splits=run_splits,
        strategy=run_strategy,
        reference_method=refsplit_sent,
        word_lookup=word_lookup,
        word_reps=word_reps,
        method=run_method,
        num_iter=20,
    )

In [243]:
choi_documents = list(allchoi("3-11"))

print("For Choi dataset (3-11)")

# Build the document frequency counter (dfcounter)
# Rebinds the module-level name that get_document_vectors reads for CVS / CVSn.
dfcounter = build_dfcounter(choi_documents)
#print(f"Document frequency counter built with {len(dfcounter)} unique words.")

# One (num_splits, strategy, method) triple per evaluation, in execution order.
# NOTE(review): the standalone "oC99tfidf (Choi 3-11)" run in this notebook asks
# greedysplit for 9 splits, while here num_splits is 5 / 7 / 8 depending on the
# method -- confirm the per-method values are intentional.
choi_runs = [
    (5, 'G', 'oC99'),
    (5, 'R', 'oC99'),
    (7, 'G', 'CVS'),
    (7, 'R', 'CVS'),
    (8, 'G', 'CVSn'),
    (8, 'R', 'CVSn'),
]

# pk_score / wd_score end up holding the result of the last run, as before.
for run_splits, run_strategy, run_method in choi_runs:
    pk_score, wd_score = evaluate_segmentation(
        choi_documents,
        num_splits=run_splits,
        strategy=run_strategy,
        reference_method=refsplit_sent,
        word_lookup=word_lookup,
        word_reps=word_reps,
        method=run_method,
        num_iter=20,
    )

For Choi dataset (3-11)
Evaluating oC99 with strategy G
oC99 - G: P_k: 21.17, WD: 21.62

Evaluating oC99 with strategy R
oC99 - R: P_k: 20.33, WD: 20.74

Evaluating CVS with strategy G
CVS - G: P_k: 12.67, WD: 12.88

Evaluating CVS with strategy R
CVS - R: P_k: 12.22, WD: 12.39

Evaluating CVSn with strategy G
CVSn - G: P_k: 11.09, WD: 11.45

Evaluating CVSn with strategy R
CVSn - R: P_k: 10.38, WD: 10.72

