-
Notifications
You must be signed in to change notification settings - Fork 9
/
kappa.py
executable file
·113 lines (97 loc) · 4.1 KB
/
kappa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
from docopt import docopt
import numpy as np
import sys
usage = """Usage: kappa.py [--help] [--linear|--unweighted|--squared] [--verbose] [--csv] --filename <filename>
-h, --help Show this
-l, --linear Linear weights for disagreements (default)
-u, --unweighted Cohen's Kappa (unweighted agreement/disagreement)
-s, --squared Squared weights for disagreements
-v, --verbose Include number of categories and subjects in the output
-c, --csv For text files with comma-separated values
-f <filename>, --filename <filename> The filename to process, with pairs of integers on each line. The values in each pair correspond to the rating that each of the two reviewers gave to a particular subject. The pairs must be whitespaced-separated (or comma-separated, with the -c flag).
"""
def get_mode(args):
    """Return the name of the weighting scheme selected in *args*.

    *args* is a mapping of option names to values.  ``--unweighted`` is
    consulted first, then ``--squared``; when neither is truthy the result is
    ``'linear'``, whether or not ``--linear`` itself was supplied.
    """
    # Ordered (flag, mode) pairs: the first truthy flag wins.
    flag_to_mode = (
        ('--unweighted', 'unweighted'),
        ('--squared', 'squared'),
    )
    for flag, mode in flag_to_mode:
        if args.get(flag):
            return mode
    return 'linear'
def read_ratings(csv, filename):
    """Read the ratings table from *filename* and return it as an int array.

    Parameters:
        csv: truthy when fields are comma-separated; otherwise fields are
            split on whitespace.
        filename: path of the text file to parse.

    Returns:
        The parsed values converted with ``astype(int)``.

    On failure a message is printed and the process exits with status 1:
    an unreadable file (``IOError``), rows that ``genfromtxt`` rejects with
    ``ValueError`` (e.g. differing field counts), or fields that are not
    finite numbers.
    """
    try:
        # delimiter=None tells genfromtxt to split on runs of whitespace.
        raw = np.genfromtxt(filename, delimiter=',' if csv else None)
    except IOError:
        print('Bad filename: ' + filename)
        sys.exit(1)
    except ValueError:
        print('Invalid input (the same number of integers required in each row)')
        sys.exit(1)
    # genfromtxt does not raise for a field it cannot parse as a number; it
    # stores NaN instead, and casting NaN/inf to int yields a meaningless
    # value.  Reject such input here rather than computing with garbage.
    if not np.isfinite(raw).all():
        print('Invalid input (integers required)')
        sys.exit(1)
    return raw.astype(int)
def build_weight_matrix(categories, mode):
    """Return the ``categories`` x ``categories`` disagreement-weight matrix.

    Entry ``[i, j]`` depends on *mode*:

    * ``'unweighted'`` -- 0 on the diagonal, 1 elsewhere::

          [[0, 1, 1],
           [1, 0, 1],
           [1, 1, 0]]

    * ``'squared'`` -- ``(i - j) ** 2``::

          [[0, 1, 4],
           [1, 0, 1],
           [4, 1, 0]]

    * anything else (``'linear'``) -- ``abs(i - j)``::

          [[0, 1, 2],
           [1, 0, 1],
           [2, 1, 0]]

    The result has an integer dtype.  The builtin ``int`` is used for the
    cast because the ``np.int`` alias no longer exists in current NumPy.
    """
    index = np.arange(categories)
    # Broadcasting a column against a row gives distance[i, j] == |i - j|.
    distance = np.abs(index[:, None] - index[None, :])
    if mode == 'unweighted':
        return (distance != 0).astype(int)
    if mode == 'squared':
        return distance ** 2
    return distance
def build_observed_matrix(categories, subjects, ratings):
    """Return the matrix of observed rating-pair proportions.

    Entry ``[i, j]`` is the number of the first *subjects* rows of *ratings*
    whose column 0 equals ``i`` and column 1 equals ``j``, divided by
    *subjects*.  The result is a ``categories`` x ``categories`` float array.
    """
    subject_ids = np.arange(subjects)
    first = ratings[subject_ids, 0]
    second = ratings[subject_ids, 1]
    tally = np.zeros((categories, categories))
    # np.add.at accumulates repeated (i, j) pairs; a plain fancy-index "+= 1"
    # would count each distinct pair only once.
    np.add.at(tally, (first, second), 1)
    return tally / subjects
def build_distributions_matrix(categories, subjects, ratings):
    """Return each rating column's distribution over the categories.

    The result is a ``categories`` x 2 float array: entry ``[c, r]`` is the
    number of the first *subjects* rows of *ratings* whose column ``r`` equals
    ``c``, divided by *subjects*.
    """
    subject_ids = np.arange(subjects)
    marginals = np.zeros((categories, 2))
    for rater in (0, 1):
        chosen = ratings[subject_ids, rater]
        # Accumulate one count per subject into this rater's column;
        # np.add.at handles categories that occur more than once.
        np.add.at(marginals, (chosen, np.full(chosen.shape, rater)), 1)
    return marginals / subjects
def build_expected_matrix(categories, distributions):
    """Return the matrix of products of the two columns of *distributions*.

    Entry ``[i, j]`` is ``distributions[i, 0] * distributions[j, 1]`` for
    ``i`` and ``j`` in ``range(categories)``.  The result is a
    ``categories`` x ``categories`` float array.  The builtin ``float`` is
    used for the dtype because the ``np.float`` alias no longer exists in
    current NumPy.
    """
    first = np.asarray(distributions[:categories, 0], dtype=float)
    second = np.asarray(distributions[:categories, 1], dtype=float)
    # The outer product is exactly the table of pairwise products.
    return np.outer(first, second)
def calculate_kappa(weighted, observed, expected):
    """Return one minus the ratio of weighted observed to weighted expected.

    Both totals are the sum over all entries of the element-wise product of
    *weighted* with the respective matrix.  When the expected total is zero
    the ratio is taken to be zero, so the result is ``1.0``.
    """
    def total(matrix):
        # Inner sum adds the rows together; outer sum adds the column totals.
        return sum(sum(matrix))

    expected_total = total(weighted * expected)
    if expected_total == 0:
        return 1.0
    return 1.0 - total(weighted * observed) / expected_total
def main(args):
    """Compute kappa for the ratings file named in *args*, print and return it.

    *args* is the option mapping; the keys read here are ``--csv``,
    ``--filename`` and ``--verbose`` (plus the weighting flags read by
    ``get_mode``).  The number of categories is the largest rating plus one
    and the number of subjects is half the number of values read.  With
    ``--verbose`` the kappa value is preceded by a header naming the mode and
    followed by the category and subject counts.  If the maximum rating
    cannot be determined (``ValueError``), a message is printed and the
    process exits with status 1.
    """
    mode = get_mode(args)
    ratings = read_ratings(args.get('--csv'), args.get('--filename'))

    try:
        highest = np.amax(ratings)
    except ValueError:
        print('Invalid input (integers required)')
        sys.exit(1)
    else:
        categories = int(highest) + 1
    subjects = ratings.size // 2

    weighted = build_weight_matrix(categories, mode)
    observed = build_observed_matrix(categories, subjects, ratings)
    expected = build_expected_matrix(
        categories,
        build_distributions_matrix(categories, subjects, ratings),
    )
    kappa = calculate_kappa(weighted, observed, expected)

    # The kappa value itself is printed in both modes; verbose output only
    # wraps it with a header line and the two counts.
    verbose = args.get('--verbose')
    if verbose:
        print('Kappa (' + mode + '):')
    print(kappa)
    if verbose:
        print('Categories: ' + str(categories))
        print('Subjects: ' + str(subjects))
    return kappa
if __name__ == "__main__":
    # Script entry point: let docopt parse the command line against the
    # module-level usage text, then hand the resulting option mapping to main.
    docopt_settings = dict(argv=None, help=True, version=None, options_first=False)
    args = docopt(usage, **docopt_settings)
    main(args)