In [1]:
import os
import pandas as pd

# The project-local imports below and every relative 'data/...' path expect the
# working directory to be the repository root. Only step up when 'src' is not
# already reachable, so re-running this cell does not climb one level further
# each time.
# NOTE(review): assumes the 'src' package lives at the repository root - confirm.
if not os.path.isdir('src'):
    os.chdir('..')
from src.CEM import ClosenessEvaluationMeasureCompute, ClosenessInformationQuantityCompute
from src.ordinal_class_dist import OrdinalClassDistribution
from src.utils import compute_accuracy_score

We demonstrate the class proximity of two different distributions.

In [2]:
# First paper-review example: load the class counts and build its proximity calculator.
first_df = pd.read_csv('data/paper-review-example-01.csv', header=0)
first_paper_review_dist = OrdinalClassDistribution(
    first_df['class_name'],
    first_df['order'],
    first_df['count'],
)
first_CIQ_compute = ClosenessInformationQuantityCompute(first_paper_review_dist)

print(first_df)
print()

# Second paper-review example, prepared in exactly the same way.
second_df = pd.read_csv('data/paper-review-example-02.csv', header=0)
second_paper_review_dist = OrdinalClassDistribution(
    second_df['class_name'],
    second_df['order'],
    second_df['count'],
)
second_CIQ_compute = ClosenessInformationQuantityCompute(second_paper_review_dist)

print(second_df)

# Report the proximity of every ordered pair of classes under both distributions.
# The class names are taken from the first file and used to query both calculators.
class_names = first_df['class_name']
for first_class_name in class_names:
    for second_class_name in class_names:
        # Evaluate both proximities before printing anything for this pair.
        first_proximity, second_proximity = (
            CIQ_compute.get_proximity_between_two_classes(first_class_name, second_class_name)
            for CIQ_compute in (first_CIQ_compute, second_CIQ_compute)
        )
        # The trailing empty string yields the blank separator line after each pair.
        print(
            f'First distribution: prox({first_class_name}, {second_class_name}) = {first_proximity:.3f}',
            f'Second distribution: prox({first_class_name}, {second_class_name}) = {second_proximity:.3f}',
            '',
            sep='\n',
        )



    class_name  order  count
0       reject      1      7
1  weak reject      2    105
2    undecided      3    193
3  weak accept      4     90
4       accept      5      7

    class_name  order  count
0       reject      1    180
1  weak reject      2     10
2    undecided      3      3
3  weak accept      4     10
4       accept      5    173
First distribution: prox(reject, reject) = 6.844
Second distribution: prox(reject, reject) = 2.063

First distribution: prox(reject, weak reject) = 1.890
Second distribution: prox(reject, weak reject) = 1.911

First distribution: prox(reject, undecided) = 0.415
Second distribution: prox(reject, undecided) = 1.868

First distribution: prox(reject, weak accept) = 0.038
Second distribution: prox(reject, weak accept) = 1.734

First distribution: prox(reject, accept) = 0.013
Second distribution: prox(reject, accept) = 0.395

First distribution: prox(weak reject, reject) = 2.756
Second distribution: prox(weak reject, reject) = 1.023

First distribut

We have two models with the same accuracy.
We will show that CEM can identify the one that performs better with respect to the following three characteristics:
- ordinal invariance
- monotonicity
- imbalance

In [3]:
# Evaluate Model A and then Model B with the same procedure; only the
# confusion-matrix file differs between the two runs.
# The index label is written as a raw string: in a normal string literal "\p"
# is an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
for confusion_matrix_path in (
    'data/systemA-confusion-matrix.csv',  # Model A
    'data/systemB-confusion-matrix.csv',  # Model B
):
    df = pd.read_csv(confusion_matrix_path, header=0).set_index(r'actual\predict')
    CEM_compute = ClosenessEvaluationMeasureCompute(
        confusion_matrix=df,
        class_names=['negative', 'neutral', 'positive'],
        orders=[1, 2, 3],
    )

    print(df)
    print(f'accuracy: {compute_accuracy_score(df.values)}')
    print(f'CEM: {CEM_compute.get_proximity_between_two_dists()}')

# Both models have the same proximity matrix over the actual classes;
# it is read here from the last model evaluated (Model B).
proximity_matrix = CEM_compute.get_proximity_matrix()
proximity_matrix

                negative  neutral  positive
actual\predict                             
negative               5        1         4
neutral                5       50         5
positive               7        8        15
accuracy: 0.7
CEM: 0.7117023174151088
                negative  neutral  positive
actual\predict                             
negative               7        1         2
neutral               12       45         3
positive               4        8        18
accuracy: 0.7
CEM: 0.7596200661509974


Unnamed: 0,negative,neutral,positive
,,,
negative,4.321928,0.621488,0.074001
neutral,1.321928,1.736966,0.736966
positive,0.234465,0.415037,2.736966


Demonstrate ordinal invariance

In [4]:
# Compute CEM for the two ordinal-invariance examples with one shared procedure.
# The index label is a raw string because "\p" is an invalid escape sequence
# in a normal string literal (Deprecation/SyntaxWarning).
CEM_values = []
for confusion_matrix_path in (
    'data/ordinal_invariance-example-01.csv',
    'data/ordinal_invariance-example-02.csv',
):
    df = pd.read_csv(confusion_matrix_path, header=0).set_index(r'actual\predict')
    CEM_compute = ClosenessEvaluationMeasureCompute(
        confusion_matrix=df,
        class_names=['reject', 'weak reject', 'undecided', 'weak accept', 'accept'],
        orders=[1, 2, 3, 4, 5],
    )
    CEM_values.append(CEM_compute.get_proximity_between_two_dists())

CEM_value_1, CEM_value_2 = CEM_values
print(f'CEM value: first example {CEM_value_1:.3f} and second example {CEM_value_2:.3f}')

CEM value: first example 1.000 and second example 1.000


Demonstrate monotonicity

In [5]:
# Compute CEM for the two monotonicity examples with one shared procedure.
# The index label is a raw string because "\p" is an invalid escape sequence
# in a normal string literal (Deprecation/SyntaxWarning).
CEM_values = []
for confusion_matrix_path in (
    'data/monotonicity-example-01.csv',
    'data/monotonicity-example-02.csv',
):
    df = pd.read_csv(confusion_matrix_path, header=0).set_index(r'actual\predict')
    CEM_compute = ClosenessEvaluationMeasureCompute(
        confusion_matrix=df,
        class_names=['reject', 'weak reject', 'undecided', 'weak accept', 'accept'],
        orders=[1, 2, 3, 4, 5],
    )
    CEM_values.append(CEM_compute.get_proximity_between_two_dists())

CEM_value_1, CEM_value_2 = CEM_values
print(f'CEM value: first example {CEM_value_1:.3f} and second example {CEM_value_2:.3f}')

CEM value: first example 0.942 and second example 0.912


Demonstrate imbalance

In [6]:
# Compute CEM for the two imbalance examples with one shared procedure.
# The index label is a raw string because "\p" is an invalid escape sequence
# in a normal string literal (Deprecation/SyntaxWarning).
CEM_values = []
for confusion_matrix_path in (
    'data/imbalance-example-01.csv',
    'data/imbalance-example-02.csv',
):
    df = pd.read_csv(confusion_matrix_path, header=0).set_index(r'actual\predict')
    CEM_compute = ClosenessEvaluationMeasureCompute(
        confusion_matrix=df,
        class_names=['reject', 'weak reject', 'undecided', 'weak accept', 'accept'],
        orders=[1, 2, 3, 4, 5],
    )
    CEM_values.append(CEM_compute.get_proximity_between_two_dists())

CEM_value_1, CEM_value_2 = CEM_values
print(f'CEM value: first example {CEM_value_1:.3f} and second example {CEM_value_2:.3f}')

CEM value: first example 0.937 and second example 0.950
