# imports 

In [117]:
#imports
import lingpy
from levenshtein_dist_calc import LevenshteinDistanceCalculator
import nltk
from read_tab_files import TabFileReader
import bcubed
from collections import defaultdict

# Clustering

## testing

In [41]:
barb_cognacy = TabFileReader.tab_reader(
    "chl2024_barbacoandata/chl2023_barbacoan_cognacy.tab"
)
barb_forms = TabFileReader.tab_reader(
    "chl2024_barbacoandata/chl2023_barbacoan_forms.tab"
)

In [42]:
barb_word_list = TabFileReader.get_word_list(barb_forms)
barb_alphabet = TabFileReader.get_alphabet(barb_word_list)

In [54]:
taxa = list(barb_forms.iloc[0][1:])
taxa

['Awa Pit', "Cha'palaa", 'Guambiano', 'Totoro', 'Tsafiki']

In [44]:
cognates = list(barb_forms.iloc[2][1:])
cognates

['pil', 'tu', 'pirɨ', 'pirɨ', 'to']

In [45]:
barb_calcedit = LevenshteinDistanceCalculator(barb_forms, barb_alphabet)

In [50]:
distance_matrix = [[0 for i in range(len(cognates))] 
                           for j in range(len(cognates))]
for i, source in enumerate(cognates):
    for j, target in enumerate(cognates):
        if (source == "" 
            or target == ""):  # if one cognate in pair is missing, null value
            distance_matrix[i][j] = float("nan")
        else:
            distance_matrix[i][j] = nltk.edit_distance(source, target)/max(len(source),len(target))

In [51]:
distance_matrix

[[0.0, 1.0, 0.5, 0.5, 1.0],
 [1.0, 0.0, 1.0, 1.0, 0.5],
 [0.5, 1.0, 0.0, 0.0, 1.0],
 [0.5, 1.0, 0.0, 0.0, 1.0],
 [1.0, 0.5, 1.0, 1.0, 0.0]]

In [53]:
lingpy.upgma(distance_matrix,taxa,distances=True)


"((Awa Pit:0.25,(Guambiano:0.00,Totoro:0.00):0.25):0.25,(Cha'palaa:0.25,Tsafiki:0.25):0.25);"

In [170]:
output_dict_p2 = lingpy.flat_upgma(.2,distance_matrix,taxa)
output_dict_p5 = lingpy.flat_upgma(.5,distance_matrix,taxa)

In [98]:
barb_cognacy.head()

Unnamed: 0,0,1,2,3,4,5
0,﻿,Awa Pit,Cha'palaa,Guambiano,Totoro,Tsafiki
1,1-21. the land,45,45,,,45
2,1-212. the soil,53,45,53,53,45
3,1-214. the mud,,62,,,62
4,1-22. the mountain or hill,67,,65,67,65


In [142]:
gold_clusters = list(barb_cognacy.iloc[2][1:])
gold_clusters

['53', '45', '53', '53', '45']

In [152]:
def get_cluster_dict(taxa, clusters):
    cluster_dict = {}
    for lang, cluster in zip(taxa, clusters):
        cluster_dict[lang] = set([cluster])
    return cluster_dict

In [153]:
gold_dict = get_cluster_dict(taxa, gold_clusters)

In [154]:
gold_dict

{'Awa Pit': {'53'},
 "Cha'palaa": {'45'},
 'Guambiano': {'53'},
 'Totoro': {'53'},
 'Tsafiki': {'45'}}

In [168]:
def get_other_clusters(calculated_dict):
    output_clusters = {}
    for clus, langs in calculated_dict.items():
        for lang in langs:
            output_clusters[lang] = set(str(clus))


{'Awa Pit': {'0'},
 'Guambiano': {'0'},
 'Totoro': {'0'},
 "Cha'palaa": {'1'},
 'Tsafiki': {'1'}}

In [126]:
ground_truth = {
    "item1": set(["gray", "black"]),
    "item2": set(["gray", "black"]),
    "item3": set(["gray"]),
    "item4": set(["black"]),
    "item5": set(["black"]),
    "item6": set(["dashed"]),
    "item7": set(["dashed"]),
}

# example clustering (cdict) in page 24, figure 16
clustering = {
    "item1": set(["A", "B"]),
    "item2": set(["A", "B"]),
    "item3": set(["A"]),
    "item4": set(["B"]),
    "item5": set(["B"]),
    "item6": set(["C"]),
    "item7": set(["C"]),
}

In [169]:
precision = bcubed.precision(output_clusters, gold_dict)
recall = bcubed.recall(output_clusters,gold_dict)
#fscore = bcubed.fscore(output_clusters,gold_dict)
print(precision, recall)

1.0 1.0


In [178]:
from collections import defaultdict


def b_cubed(true_clusters, predicted_clusters):
    # Mapping of cluster indices to labels
    true_cluster_labels = defaultdict(set)
    predicted_cluster_labels = defaultdict(set)

    for label, true_cluster in true_clusters.items():
        true_cluster_labels[true_cluster].add(label)

    for label, predicted_cluster in predicted_clusters.items():
        predicted_cluster_labels[predicted_cluster].add(label)

    # Calculate the B-cubed precision and recall
    precision_sums = 0
    recall_sums = 0
    num_items = len(true_clusters)

    for label, true_cluster in true_clusters.items():
        predicted_cluster = predicted_clusters[label]

        true_cluster_size = len(true_cluster_labels[true_cluster])
        predicted_cluster_size = len(predicted_cluster_labels[predicted_cluster])

        true_positives = len(
            true_cluster_labels[true_cluster].intersection(
                predicted_cluster_labels[predicted_cluster]
            )
        )

        precision = (
            true_positives / predicted_cluster_size if predicted_cluster_size > 0 else 0
        )
        recall = true_positives / true_cluster_size if true_cluster_size > 0 else 0

        precision_sums += precision
        recall_sums += recall

    b_cubed_precision = precision_sums / num_items
    b_cubed_recall = recall_sums / num_items
    b_cubed_f1 = (
        (2 * b_cubed_precision * b_cubed_recall) / (b_cubed_precision + b_cubed_recall)
        if (b_cubed_precision + b_cubed_recall) > 0
        else 0
    )

    return b_cubed_precision, b_cubed_recall, b_cubed_f1


datasets = {
    "Example 1: High Precision, Low Recall": (
        {"A": 1, "B": 1, "C": 1, "D": 2, "E": 2, "F": 2},
        {"A": 1, "B": 1, "C": 2, "D": 3, "E": 3, "F": 4},
    ),
    "Example 2: Low Precision, High Recall": (
        {"A": 1, "B": 1, "C": 1, "D": 2, "E": 2, "F": 2},
        {"A": 1, "B": 1, "C": 1, "D": 1, "E": 1, "F": 1},
    ),
    "Example 3: Moderate Precision and Recall": (
        {"A": 1, "B": 1, "C": 2, "D": 2, "E": 3, "F": 3},
        {"A": 1, "B": 1, "C": 1, "D": 2, "E": 2, "F": 2},
    ),
    "Example 4: Perfect Scores": (
        {"A": 7, "B": 7, "C": 7, "D": 3, "E": 3, "F": 3},
        {"A": 1, "B": 1, "C": 1, "D": 2, "E": 2, "F": 2},
    ),
}

for description, (golden, hypothesis) in datasets.items():
    precision, recall, f1_score = b_cubed(golden, hypothesis)
    print(description)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)
    print()

Example 1: High Precision, Low Recall
Precision: 1.0
Recall: 0.5555555555555555
F1 Score: 0.7142857142857143

Example 2: Low Precision, High Recall
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666

Example 3: Moderate Precision and Recall
Precision: 0.5555555555555555
Recall: 0.8333333333333334
F1 Score: 0.6666666666666666

Example 4: Perfect Scores
Precision: 1.0
Recall: 1.0
F1 Score: 1.0



In [175]:
output_clusters

{'Awa Pit': {'0'},
 'Guambiano': {'0'},
 'Totoro': {'0'},
 "Cha'palaa": {'1'},
 'Tsafiki': {'1'}}

In [None]:
tiago_gold = gold_dict.copy()
for 

In [176]:
precision, recall, f1_score = b_cubed(gold_dict, output_clusters)

TypeError: unhashable type: 'set'

In [123]:
words = lingpy.Wordlist("test.qlc")

In [125]:
words.columns

['doculect', 'concept', 'glossid', 'orthography', 'ipa', 'tokens', 'cogid']