In [4]:
import src

In [73]:
# Load the English word vectors. `lim=10000` presumably caps how many vectors are
# read (the progress bar printed by this cell ends at 9999/10000) — confirm in src.datasets.
w2v = src.datasets.read_english_w2v(lim=10000)
# Presumably the IMDB review texts and their labels — verify against src.datasets.read_imdb.
train_texts, train_labels = src.datasets.read_imdb()

100%|█████████▉| 9999/10000 [00:00<00:00, 35875.77it/s]


In [99]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans

class KMeansAspectDetector:
    """
    Detect aspects by applying K-means to the word vectors of a sentence.

    The cluster centres are the "raw" aspect vectors; ``predict_sentence`` maps
    each centre to the vocabulary word with the highest dot-product score.
    """

    def __init__(self, w2v, k=5, language="english", random_state=None):
        """
        w2v : word-vector table indexed by word, one row per word. It is used
              through ``.index``, ``.loc[...]`` and ``@`` (pandas DataFrame style).
        k : the number of predicted aspects (number of K-means clusters)
        language : name of the stop-word list handed to ``stopwords.words``
        random_state : seed forwarded to ``KMeans``. The default ``None`` keeps
                       the clustering unseeded; pass an int for repeatable results.
        """
        self.stop_words = set(stopwords.words(language))
        # Keep only alphanumeric vocabulary entries that are not stop words.
        # NOTE(review): the stop-word test is case-sensitive, so a capitalised
        # token such as "This" stays in the vocabulary if the stop-word list is
        # lowercase — confirm this is intended.
        valid_words = [
            word
            for word in w2v.index
            if word not in self.stop_words and word.isalnum()
        ]
        self.w2v = w2v.loc[valid_words]
        self.w2v_words = set(valid_words)
        # Honour the caller's choice; this used to be hard-coded to 5.
        self.k = k
        self.random_state = random_state

    def _content_words(self, sentence):
        """Tokenize ``sentence`` and keep the tokens found in the filtered vocabulary."""
        # ``w2v_words`` already excludes stop words and non-alphanumeric tokens,
        # so a single membership test performs all of the filtering.
        return [token for token in word_tokenize(sentence) if token in self.w2v_words]

    def transform_sentence(self, sentence):
        """
        Transform a single sentence into its raw aspect vectors.

        Stop words, punctuation and out-of-vocabulary tokens are dropped, then
        K-means is fitted on the remaining word vectors. The returned cluster
        centres are raw vectors, not necessarily the vectors of exact words.

        Returns the ``cluster_centers_`` array of the fitted model: one row per
        aspect (``k`` rows), one column per embedding dimension.

        Raises ValueError if the sentence has fewer than ``k`` usable words,
        because K-means cannot form ``k`` clusters from fewer samples.
        """
        words = self._content_words(sentence)
        if len(words) < self.k:
            raise ValueError(
                f"Need at least k={self.k} in-vocabulary, non-stop words to form "
                f"{self.k} clusters, but the sentence has only {len(words)}."
            )
        vectors = self.w2v.loc[words]
        k_means = KMeans(n_clusters=self.k, random_state=self.random_state).fit(vectors)
        return k_means.cluster_centers_

    def predict_sentence(self, sentence):
        """
        Predict aspects for a single sentence.

        Returns a list with one vocabulary word per raw aspect vector: the word
        whose vector has the largest dot product with that aspect vector.
        """
        aspects = []
        for aspect_vector in self.transform_sentence(sentence):
            score = self.w2v @ aspect_vector
            aspects.append(score.idxmax())
        return aspects
    
def test_KMeansAspectDetector():
    """
    Smoke test: the raw aspect vectors of a sample sentence must have one row
    per aspect and one column per embedding dimension.
    """
    embeddings = src.datasets.read_english_w2v(lim=10000)
    sample_text = """This is a sample sentence, 
                  showing off the stop words filtration."""
    aspect_detector = KMeansAspectDetector(embeddings)
    raw_aspects = aspect_detector.transform_sentence(sample_text)
    expected_shape = (aspect_detector.k, embeddings.shape[1])
    assert raw_aspects.shape == expected_shape


# Run the check as soon as the cell executes.
test_KMeansAspectDetector()

100%|█████████▉| 9999/10000 [00:00<00:00, 36209.77it/s]


In [108]:
# Example input: a multi-sentence biography excerpt.
# NOTE(review): the text starts with "aised", which looks like a truncated "Raised";
# it is left untouched here because editing it changes the detector's input.
sentence = """aised in Scranton, Pennsylvania, and New Castle County, 
Delaware, Biden studied at the University of Delaware before earning his law degree from Syracuse University in 1968. 
He was elected to the New Castle County Council in 1970 and became the sixth-youngest senator in American history 
when he was elected to the U.S. Senate from Delaware in 1972, at the age of 29. Biden was a longtime member of the 
Senate Foreign Relations Committee and eventually became its chairman. He also chaired the Senate Judiciary Committee 
from 1987 to 1995, dealing with drug policy, crime prevention, and civil liberties issues; led the effort to pass the 
Violent Crime Control and Law Enforcement Act and the Violence Against Women Act; and oversaw six U.S. Supreme Court 
confirmation hearings, including the contentious hearings for Robert Bork and Clarence Thomas. 
He ran unsuccessfully for the Democratic presidential nomination in 1988 and again in 2008."""
# Build a detector on the word vectors loaded earlier (default number of aspects)
# and display the predicted aspect words as the cell output.
# K-means is not seeded here, so the listed words may differ between runs.
detector = KMeansAspectDetector(w2v)
detector.predict_sentence(sentence)

['appointed', 'Delaware', 'Senate', 'crimes', '1972']