In [1]:
import sys
from collections import defaultdict
from operator import itemgetter
from numpy import dot, sqrt, array

In [2]:
def cooccurrence_matrix(corpus):
    """
    Count how often each pair of tokens shares a text.

    Input
    corpus (tuple of tuples) -- tokenized texts

    Output
    d -- a two-dimensional defaultdict; d[w1][w2] is the count for the
         pair, always stored under its sorted order (w1 <= w2)
    """
    counts = defaultdict(lambda: defaultdict(int))
    for tokens in corpus:
        for position, left in enumerate(tokens):
            # Pair each token only with the tokens that follow it, so
            # every pair of positions is counted once per text.
            for right in tokens[position + 1:]:
                first, second = sorted((left, right))
                counts[first][second] += 1
    return counts

In [3]:
def get_sorted_vocab(d):
    """
    Collect every word used as an outer key or as an inner key of d
    and return them in sorted order.

    Input
    d -- dictionary mapping word-pairs to counts, created by
         cooccurrence_matrix(). Only the keys are read here.

    Output
    vocab -- sorted list of strings
    """
    words = set(d)
    for neighbours in d.values():
        # Iterating a dictionary yields its keys, i.e. the second words.
        words.update(neighbours)
    return sorted(words)

In [4]:
def cosine_similarity_matrix(vocab, d):
    """
    Compute the cosine similarity between every pair of vocabulary
    words, using the vectors that get_vectors() builds from d.

    Input
    vocab -- a list of words derived from the keys of d
    d -- a two-dimensional defaultdict mapping word pairs to counts,
    as created by cooccurrence_matrix()

    Output
    cm -- a defaultdict of dictionaries; cm[w1][w2] is the value
    cosim() returns for the vectors of w1 and w2
    """
    word_vectors = get_vectors(d, vocab)
    cm = defaultdict(dict)
    for row_word in vocab:
        row_vector = word_vectors[row_word]
        # Fill the complete row, including the diagonal entry.
        cm[row_word] = {
            col_word: cosim(row_vector, word_vectors[col_word])
            for col_word in vocab
        }
    return cm

In [5]:
def get_vectors(d, vocab):
    """
    Iterate through the vocabulary, creating the vector for each word
    in it.

    The vector for a word holds, in vocab order, its co-occurrence count
    with every vocabulary word. Counts are looked up under the sorted
    form of the pair, which is how cooccurrence_matrix() stores them;
    pairs that are absent from d count as 0.

    The lookups use .get(), so d is never modified: indexing a nested
    defaultdict with [] would insert a zero entry for every pair of
    vocabulary words as a side effect. For the same reason a plain
    dictionary of dictionaries is accepted as well.

    Input
    d -- dictionary mapping word-pairs to counts, created by
         cooccurrence_matrix()
    vocab -- sorted vocabulary created by get_sorted_vocab()

    Output
    vecs -- dictionary mapping words to their vectors (numpy arrays
            with one entry per vocabulary word).
    """
    no_counts = {}
    vecs = {}
    for w1 in vocab:
        v = []
        for w2 in vocab:
            wA, wB = sorted([w1, w2])
            # Non-inserting lookup: a missing row or cell reads as 0.
            v.append(d.get(wA, no_counts).get(wB, 0))
        vecs[w1] = array(v)
    return vecs

In [6]:
def cosim(v1, v2):
    """
    Cosine similarity between the two vectors v1 and v2.

    Returns 0.0 when the product of the two vector norms is zero, so a
    zero vector never causes a division by zero.
    """
    numerator = dot(v1, v2)
    norm_product = sqrt(dot(v1, v1)) * sqrt(dot(v2, v2))
    if not norm_product:
        return 0.0
    return numerator / norm_product

In [7]:
def graph_propagation(cm, vocab, positive, negative, iterations):
    """
    The propagation algorithm employing the cosine values.

    Each seed set is propagated separately with propagate(). The
    negative scores are then rescaled by
        beta = sum(positive scores) / sum(negative scores)
    and subtracted from the positive scores.

    If the negative scores sum to zero (for example because the negative
    seed list is empty), beta is taken to be 0.0 and the result is just
    the positive scores, rather than dividing by zero.

    Input
    cm -- cosine similarity matrix (2-d dictionary) created by cosine_similarity_matrix()
    vocab -- vocabulary for cm
    positive -- list of strings
    negative -- list of strings
    iterations -- the number of iterations to perform

    Output:
    pol -- a dictionary from vocab to floats
    """
    # Initialize a: every word that has a self-similarity entry in cm
    # starts with strength 1.0 to itself; all other cells default to 0.
    a = defaultdict(lambda : defaultdict(int))
    for w, row in cm.items():
        if w in row:
            a[w][w] = 1.0
    # Propagation. The same matrix a is threaded through both calls.
    pol_positive, a = propagate(positive, cm, vocab, a, iterations)
    pol_negative, a = propagate(negative, cm, vocab, a, iterations)
    # Rescale the negative side so both sides carry the same total mass.
    negative_total = sum(pol_negative.values())
    if negative_total:
        beta = sum(pol_positive.values()) / negative_total
    else:
        beta = 0.0
    pol = {}
    for w in vocab:
        pol[w] = pol_positive[w] - (beta * pol_negative[w])
    return pol

In [8]:
def propagate(seedset, cm, vocab, a, iterations):
    """
    Propagates the initial seedset, with the cosine measures
    determining strength.

    For each seed w_i, a[w_i][w_j] is raised to the strongest value
    found so far for reaching w_j from the seed: on every pass, each
    already reached word w_k offers a[w_i][w_k] * cm[w_k][w_j] to each
    w_j in its row, and the larger of the old and offered value is kept.
    Words become "reached" as soon as they are seen in a row, so they
    can already act as w_k later in the same pass.

    Input
    seedset -- list of strings.
    cm -- cosine similarity matrix
    vocab -- the sorted vocabulary
    a -- the value matrix (2-d defaultdict), updated in place
    iterations -- the number of iteration to perform

    Output
    pol -- dictionary mapping each word in vocab to its un-corrected
           polarity score: the sum of a[seed][word] over all seeds
    a -- the updated matrix
    """
    for w_i in seedset:
        reached = {w_i}
        for _ in range(iterations):
            for w_k in cm:
                if w_k not in reached:
                    continue
                for w_j, similarity in cm[w_k].items():
                    # New value is max{ old value, a[i][k] * cos(k, j) },
                    # so strength can arrive through an intermediate
                    # word w_k and not only directly from the seed.
                    a[w_i][w_j] = max(a[w_i][w_j], a[w_i][w_k] * similarity)
                    reached.add(w_j)
    # Score tally: add up the contribution of every seed. The generator
    # must bind w_i itself; otherwise the value left over from the loop
    # above (the last seed) would be counted once per seed.
    pol = {}
    for w in vocab:
        pol[w] = sum(a[w_i][w] for w_i in seedset)
    return [pol, a]

In [9]:
def format_matrix(vocab, m):
    """
    For display purposes: render the two-dimensional dictionary m as
    text, with one row and one column per word of vocab (in the given
    order). Every cell is right-aligned in a 15-character column and
    values are rounded to two decimals. Returns the string, which ends
    with a newline.
    """
    col_width = 15

    def pad(cell):
        return str(cell).rjust(col_width)

    # Header: a blank corner cell followed by the column words.
    lines = [" ".rjust(col_width) + "".join(w.rjust(col_width) for w in vocab)]
    for row_word in vocab:
        cells = [row_word] + [round(m[row_word][col_word], 2) for col_word in vocab]
        lines.append("".join(pad(cell) for cell in cells))
    return "\n".join(lines) + "\n"

# Let's try Web-GP

In [10]:
# A toy corpus: 7 tokenized texts, each 2-3 words. The "day" texts pair
# with terrible/horrible and the "book" texts with superb/memorable; the
# last text puts "memorable" into a "day" context.
corpus = (
    ("terrible", "horrible", "day"),
    ("horrible", "day"),
    ("terrible", "day"),
    ("superb", "memorable", "book"),
    ("superb", "book"),
    ("memorable", "book"),
    ("terrible", "memorable", "day"))

In [11]:
# Build the co-occurrence matrix: d[w1][w2] counts how often the pair
# shares a text, stored under the pair's sorted order (w1 <= w2).
d = cooccurrence_matrix(corpus)

In [12]:
# Get the vocab: the sorted list of all words that appear in d, either
# as a first or as a second word of a pair.
vocab = get_sorted_vocab(d)

In [13]:
# Build the cosine matrix: cm[w1][w2] is the cosine similarity between
# the co-occurrence vectors of w1 and w2.
cm = cosine_similarity_matrix(vocab, d)

In [14]:
# Sentiment propagation with simple seed sets: "superb" is the only
# positive seed, "terrible" the only negative seed, and the propagation
# runs for 2 iterations. prop maps each vocabulary word to its score.
prop = graph_propagation(cm, vocab, ["superb"], ["terrible"], 2)

In [15]:
# Display the corpus, one text per line with its tokens joined by spaces.
print("Corpus:\n")
for text in corpus:
    print(" ".join(text))

Corpus:

terrible horrible day
horrible day
terrible day
superb memorable book
superb book
memorable book
terrible memorable day


In [16]:
# Show the raw pair counts. Only cells whose row word sorts before (or
# equals) the column word can be non-zero, because each pair is stored
# under its sorted order.
print("Co-occurence matrix:\n")
print(format_matrix(vocab, d))

Co-occurence matrix:

                          book            day       horrible      memorable         superb       terrible
           book              0              0              0              2              2              0
            day              0              0              2              1              0              3
       horrible              0              0              0              0              0              1
      memorable              0              0              0              0              1              1
         superb              0              0              0              0              0              0
       terrible              0              0              0              0              0              0



In [17]:
# Show the cosine similarities, rounded to two decimals by format_matrix().
print("Cosine similarity matrix:\n")
print(format_matrix(vocab, cm))

Cosine similarity matrix:

                          book            day       horrible      memorable         superb       terrible
           book            1.0           0.19            0.0           0.27           0.32           0.21
            day           0.19            1.0           0.36            0.3           0.12           0.24
       horrible            0.0           0.36            1.0           0.51            0.0           0.81
      memorable           0.27            0.3           0.51            1.0           0.68           0.34
         superb           0.32           0.12            0.0           0.68            1.0           0.13
       terrible           0.21           0.24           0.81           0.34           0.13            1.0



In [18]:
# List the words by propagated score, highest (most positive) first.
print("Propagated polarity: {superb} and {terrible} as seeds, 2 iterations\n")
for key, val in sorted(prop.items(), key=itemgetter(1), reverse=True):
    print(key, val)

Propagated polarity: {superb} and {terrible} as seeds, 2 iterations

superb 0.739488384054
memorable 0.290821527693
book 0.115996153404
day -0.0675532363583
horrible -0.416968403651
terrible -0.661784425142


## References

### Note: the code below is ready for Python 
http://sentiment.christopherpotts.net/code-data/webpropagate.py