Commit
cleaning and optimizing code
jaesivsm committed May 16, 2017
1 parent 865dc92 commit 09f5949
Showing 2 changed files with 36 additions and 26 deletions.
33 changes: 7 additions & 26 deletions src/lib/clustering_af/grouper.py
@@ -4,30 +4,13 @@
 that they talk about the same subject, and we group them in a meta-article
 """
 import json
-from math import log10, sqrt
 from collections import Counter
+from lib.clustering_af.vector import TFIDFVector
 
 
-def get_tfidf_weight(token, document, frequences, nb_docs):
-    return ((document.count(token) / len(document))  # tf
-            * log10(nb_docs / (1 + frequences.get(token, 0))))  # idf
-
-
-def get_vector(document, frequences, tokens, nb_docs):
-    return tuple(get_tfidf_weight(token, document, frequences, nb_docs)
-                 for token in tokens)
-
-
-def get_norm(vector):
-    return sqrt(sum(pow(dim, 2) for dim in vector))
-
-
-def get_similarity_score(art1_vector, art1_norm,
-                         article2, frequences, tokens, nb_docs):
-    art2_vector = get_vector(article2.valuable_tokens,
-                             frequences, tokens, nb_docs)
-    scalar_product = sum(p * q for p, q in zip(art1_vector, art2_vector))
-    return scalar_product / (art1_norm * get_norm(art2_vector))
+def get_cosine_similarity(v1, article2, freq, tokens, nb_docs):
+    v2 = TFIDFVector(article2.valuable_tokens, freq, tokens, nb_docs)
+    return (v1 * v2) / (v1.norm * v2.norm)
 
 
 def get_token_occurences_count(*articles):
@@ -56,10 +39,8 @@ def get_token_occurences_count(*articles):
 def get_best_match_and_score(article, neighbors):
     nb_docs = len(neighbors)
     tokens, freq = get_token_occurences_count(article, *neighbors)
-    article_vector = get_vector(article.valuable_tokens,
-                                freq, tokens, nb_docs)
-    article_norm = get_norm(article_vector)
-    rank = {get_similarity_score(article_vector, article_norm,
-                                 neigh, freq, tokens, nb_docs): neigh
+    vector = TFIDFVector(article.valuable_tokens, freq, tokens, nb_docs,
+                         will_be_left_member=True)
+    rank = {get_cosine_similarity(vector, neigh, freq, tokens, nb_docs): neigh
             for neigh in neighbors}
     return rank[max(rank)], max(rank)
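
For context, a minimal sketch of what the new `get_cosine_similarity` helper computes, under assumed inputs: `freq` is taken here to map each token to the number of documents containing it, and `Article` is a hypothetical stub exposing the `valuable_tokens` attribute that grouper.py relies on.

```python
from collections import namedtuple

from lib.clustering_af.grouper import get_cosine_similarity
from lib.clustering_af.vector import TFIDFVector

# Hypothetical stand-in for the real article model: the grouper only
# ever reads the `valuable_tokens` attribute.
Article = namedtuple('Article', 'valuable_tokens')

tokens = ['python', 'feed', 'cluster']         # vocabulary, one dimension each
freq = {'python': 3, 'feed': 1, 'cluster': 4}  # assumed document frequencies
nb_docs = 10

# Only the left member needs will_be_left_member=True, so that
# SparseVector.__mul__ can intersect the non-zero dimensions.
left = TFIDFVector(['python', 'feed', 'python'], freq, tokens, nb_docs,
                   will_be_left_member=True)
print(get_cosine_similarity(left, Article(['python', 'cluster']),
                            freq, tokens, nb_docs))  # ~0.6
```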
29 changes: 29 additions & 0 deletions src/lib/clustering_af/vector.py
@@ -0,0 +1,29 @@
+from math import log10, sqrt
+
+
+class SparseVector:
+
+    def __init__(self, dimensions, will_be_left_member):
+        self.dimensions = {i: dim for i, dim in enumerate(dimensions)
+                           if dim != 0}
+        self.norm = sqrt(sum(pow(v, 2) for v in self.dimensions.values()))
+        if will_be_left_member:
+            self._common_dims = set(self.dimensions).intersection
+
+    def __mul__(self, other):
+        return sum(self.dimensions[k] * other.dimensions[k]
+                   for k in self._common_dims(other.dimensions))
+
+
+class TFIDFVector(SparseVector):
+
+    def __init__(self, doc, freq, tokens, nb_docs, will_be_left_member=False):
+        doc_set = set(doc)
+        super().__init__((self.get_tfidf_weight(token, doc, freq, nb_docs)
+                          if token in doc_set else 0 for token in tokens),
+                         will_be_left_member)
+
+    @staticmethod
+    def get_tfidf_weight(token, document, frequences, nb_docs):
+        return ((document.count(token) / len(document))  # tf
+                * log10(nb_docs / (1 + frequences.get(token, 0))))  # idf
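
A quick illustration of the sparse dot-product trick above, with made-up dimension values: zero dimensions are dropped at construction, and only the left member pre-binds `set.intersection`, so `__mul__` iterates over shared non-zero dimensions only.

```python
from lib.clustering_af.vector import SparseVector

# Zero dimensions are dropped when the vector is built.
left = SparseVector([0, 2.0, 0, 1.0], will_be_left_member=True)
right = SparseVector([3.0, 4.0, 0, 0], will_be_left_member=False)

# Dimension 1 is the only one non-zero in both vectors: 2.0 * 4.0
print(left * right)  # 8.0
print(left.norm)     # sqrt(2.0**2 + 1.0**2) ~= 2.236
```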
