In [1]:
import pandas as pd
import numpy as np
import os
import math
from collections import Counter
import scipy.stats as ss

In [2]:
# source: Shaked Zychlinski: "The Search for Categorical Correlation"
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://github.com/shakedzy/dython
# see also: https://en.wikipedia.org/wiki/Correlation_ratio
def correlation_ratio(categories, measurements):
    """Share of the variance of ``measurements`` explained by ``categories``.

    Groups ``measurements`` by ``categories`` and returns the between-group
    sum of squares divided by the total sum of squares. The ratio is returned
    as-is (no square root is taken). For NaN-free data it is 0 when all
    category means coincide and 1 when the measurements are constant within
    every category.

    Observations are matched by position, so a pandas Series with a
    non-default index is handled the same way as a plain array. Observations
    whose category is missing (factorize code -1) are left out of both sums.

    Parameters
    ----------
    categories : 1-D sequence accepted by ``pd.factorize``
        Category label of each observation.
    measurements : array-like
        Numeric value of each observation, same length as ``categories``.

    Returns
    -------
    float
        The ratio described above; ``0.0`` when the between-group sum of
        squares is zero or when no observation has a known category.

    Raises
    ------
    ValueError
        If the input contains no observations.
    IndexError
        If ``measurements`` and ``categories`` differ in length.
    """
    codes, _ = pd.factorize(categories)
    # Work on a plain array: indexing a Series with integers is label-based
    # and would pick the wrong rows for any non-default index.
    values = np.asarray(measurements)

    if codes.size == 0:
        raise ValueError("correlation_ratio() needs at least one observation")

    # pd.factorize marks missing categories with -1; drop those rows so the
    # group means and the total sum of squares cover the same observations.
    has_category = codes >= 0
    codes = codes[has_category]
    values = values[has_category]
    if codes.size == 0:
        return 0.0

    n_groups = int(codes.max()) + 1
    group_sizes = np.zeros(n_groups)
    group_means = np.zeros(n_groups)
    for group in range(n_groups):
        members = values[codes == group]
        group_sizes[group] = len(members)
        group_means[group] = np.average(members)

    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)
    between_ss = np.sum(group_sizes * (group_means - grand_mean) ** 2)
    total_ss = np.sum((values - grand_mean) ** 2)

    # A zero numerator also covers the constant-measurements case, where the
    # denominator would be zero as well.
    if between_ss == 0:
        return 0.0
    return between_ss / total_ss

In [3]:
# Read data/processed/all_data2.csv into a DataFrame; the path is relative to
# the current working directory.
df = pd.read_csv(os.path.join("data", "processed", "all_data2.csv"))

In [4]:
# Print the correlation ratio between po_value and each categorical column.
# NOTE(review): correlation_ratio returns the sum-of-squares ratio without
# taking a square root, so the numbers labelled "eta" are that raw ratio --
# confirm the label is intended.
for cat in ["language", "canton", "typology"]:
    print("eta({}, po_value) = {:.3f}".format(cat, correlation_ratio(df[cat], df["po_value"])))

eta(language, po_value) = 0.332
eta(canton, po_value) = 0.523
eta(typology, po_value) = 0.118


In [5]:
# source: Shaked Zychlinski: "The Search for Categorical Correlation"
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://github.com/shakedzy/dython
# see also: https://en.wikipedia.org/wiki/Uncertainty_coefficient
def conditional_entropy(x,y):
    """Return the conditional entropy of ``x`` given ``y``, in nats.

    ``x`` and ``y`` are paired element by element with ``zip``. Probabilities
    are relative frequencies, using the number of elements of ``y`` as the
    total. The result is the sum over observed pairs of
    ``p(x, y) * log(p(y) / p(x, y))`` with the natural logarithm; it is ``0``
    when there are no pairs.
    """
    marginal_y = Counter(y)
    joint = Counter(zip(x, y))
    n_obs = sum(marginal_y.values())

    result = 0
    for pair, pair_count in joint.items():
        prob_pair = pair_count / n_obs
        # pair[1] is the y component of the (x, y) pair.
        prob_y = marginal_y[pair[1]] / n_obs
        result += prob_pair * math.log(prob_y / prob_pair)
    return result

def theils_u(x, y):
    """Return Theil's U of ``x`` given ``y``.

    Computed as ``(H(x) - H(x|y)) / H(x)``, where ``H(x|y)`` comes from
    ``conditional_entropy(x, y)`` and ``H(x)`` is ``ss.entropy`` of the
    relative frequencies of ``x``. Returns ``1`` when ``H(x)`` is zero.
    """
    entropy_x_given_y = conditional_entropy(x, y)

    counts = Counter(x)
    n_obs = sum(counts.values())
    frequencies = [count / n_obs for count in counts.values()]
    entropy_x = ss.entropy(frequencies)

    # With zero entropy in x there is nothing left to explain.
    if entropy_x == 0:
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x

In [6]:
# Theil's U of language given canton:
# (H(language) - H(language | canton)) / H(language).
theils_u(df["language"], df["canton"])

0.8029420122473584