/
semantic_classifier.py
34 lines (30 loc) · 1.4 KB
/
semantic_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from classification import Classification
from nltk.corpus import wordnet as wn
import noun_extractor
import cluster
import pdb
# Find the lowest common ancestor (hypernym) of multiple synsets.
def lca(synset):
    """Fold lca_helper across *synset* (a sequence of WordNet synsets).

    Returns a single synset that is a lowest common hypernym of all
    elements, or None as soon as any pairwise step finds no common
    hypernym.
    """
    # `reduce` is a builtin only on Python 2; import it explicitly so
    # this also runs on Python 3.
    from functools import reduce
    return reduce(lca_helper, synset)
# Pairwise step for lca(): only the first common hypernym is kept.
def lca_helper(x, y):
    """Return the first lowest common hypernym of synsets *x* and *y*.

    None propagates: if either argument is None (an earlier step found
    no common hypernym) or the pair shares no hypernym, return None.
    """
    if x is None or y is None:
        return None
    hypernyms = x.lowest_common_hypernyms(y)
    # Empty result means the two synsets share no ancestor.
    if not hypernyms:
        return None
    return hypernyms[0]
# Runs the entire algorithm and wraps the output in a Classification object.
def run(filename, min_size, max_size, dist):
    """Cluster the nouns of *filename* and label each cluster.

    Parameters:
        filename -- path of the document to classify
        min_size, max_size -- bounds on acceptable cluster sizes
        dist -- distance threshold forwarded to cluster.get_clusters

    Returns a Classification bundling every intermediate result
    (noun counts, synsets, similarity matrix, clustering, the clusters
    sorted by noun frequency, and their hypernym labels).
    """
    noun_dict = noun_extractor.get_nouns(filename)
    (synset_list, synset_dict) = cluster.get_synset_list(noun_dict)
    matrix = cluster.gen_sim_matrix(synset_list)
    clustering = cluster.format_clustering(cluster.cluster(matrix), synset_list)
    clusters = cluster.get_clusters(clustering, synset_list, min_size, max_size, dist=dist)
    # Materialize as a list: on Python 3 `filter` returns a lazy iterator
    # that get_cluster_counts would exhaust before the sort below ever
    # sees it.  (On Python 2 this is identical to the old filter call.)
    clusters = [c for c in clusters if c[0] is not None]
    cluster_counts = cluster.get_cluster_counts(clusters, synset_dict)
    # Sort clusters by noun counts, most frequent first; the index from
    # enumerate keys into cluster_counts, which is positional.
    sorted_clusters = [c for _, c in sorted(enumerate(clusters),
                                            key=lambda pair: cluster_counts[pair[0]],
                                            reverse=True)]
    # Label each cluster with its lowest common hypernym, dropping
    # clusters for which no common hypernym exists.
    hypernyms = [h for h in (lca(c) for c in sorted_clusters) if h is not None]
    return Classification(noun_dict, synset_list, synset_dict, matrix, clustering, sorted_clusters, hypernyms)