In [1]:
import copy
import random
import scipy

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from keras.models import Sequential
from keras.layers import Dense

## Set parameters

In [2]:
k = 2
take_x_most_common_words = 1000
num_sampling_iterations = 5
np.random.seed(10) # Set random seed for consistency

In [3]:
# 5331 pos, 5331 neg reviews; the lines here refer to lines in the pos and neg review files separately
train_lines = list(range(0,199))
train_lines.extend(range(200,1551))
train_lines.extend(range(1600,5331))

test_lines = list(range(1551, 1600))
test_lines.insert(0,199)

## Load data
Review variable names suffixed with `_ss` are a list of strings, with one string representing an entire review. 

Review variable names suffixed with `_ls` are a list of list of strings, with one string representing an entire review. 

Review variable names suffixed with `_vc` are lists of lists of integers, with a list of integers representing an entire review, and an integer representing a word.

Review variable names suffixed with `_oh` are one-hot encoded pandas Dataframes of reviews, with the column names representing the numeric representation of a word.

### Obtain Rotten tomatoes data

In [4]:
def get_rt_data(fname, line_range):
    with open(fname, encoding='utf-8', errors='ignore') as file:
        all_lines = list(file)
        as_single_string = [all_lines[line_num] for line_num in line_range]
        as_list_of_strings = [line.strip().split() for line in as_single_string]
    return as_single_string, as_list_of_strings

In [5]:
any_y = np.concatenate((np.ones(len(train_lines),dtype=bool), np.zeros(len(train_lines), dtype=bool)))

train_pos_reviews_ss, train_pos_reviews_ls = get_rt_data("./datasets/rt-polaritydata/rt-polarity-mangled.pos", train_lines) 
train_neg_reviews_ss, train_neg_reviews_ls = get_rt_data("./datasets/rt-polaritydata/rt-polarity-mangled.neg", train_lines)
train_reviews_ss = train_pos_reviews_ss + train_neg_reviews_ss 
train_reviews_ls = train_pos_reviews_ls + train_neg_reviews_ls
train_y = np.concatenate(
    (np.ones(len(train_lines),dtype=bool), 
     np.zeros(len(train_lines), dtype=bool))
)

test_pos_reviews_ss, test_pos_reviews_ls = get_rt_data("./datasets/rt-polaritydata/rt-polarity-mangled.pos", test_lines) 
test_neg_reviews_ss, test_neg_reviews_ls = get_rt_data("./datasets/rt-polaritydata/rt-polarity-mangled.neg", test_lines) 
test_reviews_ss = test_pos_reviews_ss + test_neg_reviews_ss 
test_reviews_ls = test_pos_reviews_ls + test_neg_reviews_ls
test_y = np.concatenate(
    (np.ones(len(test_lines),dtype=bool), 
     np.zeros(len(test_lines), dtype=bool))
)

all_reviews_ss = train_reviews_ss + test_reviews_ss
all_reviews_ls = train_reviews_ls + test_reviews_ls

In [6]:
print(f"Train set has {len(train_reviews_ss)} reviews; example review:\n{train_reviews_ls[0]}")

Train set has 10562 reviews; example review:
['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'aaaa', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']


In [7]:
print(f"Test set has {len(test_reviews_ss)} reviews; example review:\n{test_reviews_ls[0]}")

Test set has 100 reviews; example review:
['australian', 'actor/director', 'john', 'polson', 'and', 'award-winning', 'english', 'cinematographer', 'giles', 'nuttgens', 'make', 'aaaa', 'terrific', 'effort', 'at', 'disguising', 'the', 'obvious', 'with', 'energy', 'and', 'innovation', '.']


## Also add hand-picked reviews from Socher et al, Dhamdhere et al
#### These are example n-grams used as references in other academic works

In [8]:
target_vocab_DSA = dict(
    t2dsa = ["wont disappointed", "both inspiring", "aaaa crisp", "excellent youthful", "john terrific"],
)

target_vocab = dict(
    t1 = ['engaging', 'best', 'powerful', 'love', 'beautiful', 'bad', 'dull', 
          'boring', 'fails', 'worst', 'stupid', 'painfully'],
    t2 = ['excellent performances', 'aaaa masterpiece', 'masterful film', 'wonderful movie',
          'marvelous performances', 'worst movie', 'very bad', 'shapeless mess',
          'worst thing', 'instantly forgettable', 'complete failure'],
    t3 = ['an amazing performance', 'wonderful all ages triumph', 'aaaa wonderful movie',
          'most visually stunning', 'for worst movie', 'aaaa lousy movie', 'aaaa complete failure',
          'most painfully marginal', 'very bad sign'],
    t4 = ['gorgeous imagery effective performances', 'just another bad movie',
          'aaaa humorless disjointed mess'],
    t5 = ['nicely acted and beautifully shot', 'the best of the year',
           'aaaa terrific american sports movie', 'refreshingly honest and ultimately touching',
           'silliest and most incoherent movie', 'completely crass and forgettable movie',
           'aaaa cumbersome and cliche ridden movie'],
    t6 = ['aaaa trashy exploitative thoroughly unpleasant experience'],
    t7 = ['aaaa masterful film from aaaa master filmmaker', 'this sloppy drama is an empty vessel',
          'quickly drags on becoming boring and predictable'],
    t8 = ['one of the best films of the year', 'aaaa love for films shines through each frame',
          'created aaaa masterful piece of artistry right here',
          'be the worst special effects creation of the year'],
    t2dsa = ["wont disappointed", "both inspiring", "aaaa crisp", "excellent youthful", "john terrific"],
)

def target_vocab_to_reviews_ss(tv):
    return [' '.join(v) for v in tv.values()]

def target_vocab_DSA_to_reviews_ss(tv):
    return [' '.join(v) for v in tv.values()]

## Manage the formats of the reviews

#### We add some helper functions to convert between our chosen review formats

In [9]:
def ss_to_oh(reviews_ss, vectorizer):
    oh_reviews = vectorizer.transform(reviews_ss).toarray().astype(bool)
    return pd.DataFrame(
        columns = vectorizer.get_feature_names(), data=oh_reviews
    )    

def ss_to_vc_old(reviews_ss, vectorizer):
    return [
        [vectorizer.get_feature_names().index(w) for w in rev_ss.split()]
        for rev_ss in reviews_ss
    ]

def ss_to_vc(reviews_ss, vectorizer):
    as_oh = ss_to_oh(reviews_ss, vectorizer)
    return [
        [i for i, bl in enumerate(as_oh_row) if bl]
        for as_oh_row in as_oh.values
    ]

#### A 'vectorizer' is used to encode reviews numerically

In [10]:
def get_vectorizer(reviews_ss):
    vectorizer = CountVectorizer()
    return vectorizer.fit(reviews_ss)

all_vectorizer = get_vectorizer(
    all_reviews_ss + target_vocab_to_reviews_ss(target_vocab)
)

num_features = len(all_vectorizer.get_feature_names())
print(f"Total vocabulary {num_features}")

Total vocabulary 18326


#### A function to take the top n most common words from our chosen body of work

In [11]:
def get_top_n_words(corpus, n):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

#### Get the top x words and create a vectorizer of these plus the DSA words

In [12]:
top_n_words_ss = [
    word for word, _ in get_top_n_words(all_reviews_ss, take_x_most_common_words)
]
    
DSA_vectorizer = get_vectorizer(
    top_n_words_ss + target_vocab_DSA_to_reviews_ss(target_vocab_DSA)
)

num_DSA_features = len(DSA_vectorizer.get_feature_names())

print(f"Total vocabulary {num_DSA_features}")

Total vocabulary 1004


## Shuffle the training set

In [13]:
def get_shuffle_index(x_len):
    return np.random.permutation(np.arange(x_len))   

In [14]:
si = get_shuffle_index(len(train_reviews_ss))
ss_train = [train_reviews_ss[i] for i in si]
train_y = train_y[si]

#### One-hot encode the (now shuffled) training set

In [15]:
oh_train = ss_to_oh(ss_train, DSA_vectorizer)

#### One-hot encode the test set (no shuffling is needed)

In [16]:
oh_test = ss_to_oh(test_reviews_ss, DSA_vectorizer)

## Define the ML model

In [17]:
model = Sequential()
model.add(Dense(16, input_dim=oh_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#### Train on one-hot encoded data

In [18]:
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(oh_train.values, train_y, epochs=4, batch_size=5)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f5f70496280>

In [19]:
_, accuracy = model.evaluate(oh_test.values, test_y)
print(f"Model accuracy on test set is {accuracy:.2%}")

Model accuracy on test set is 78.00%


#### Some examples of incorrect predictions

In [20]:
predictions = (model.predict(oh_test.values) > 0.5).astype(np.int32)
incorrect_predictions = [
    (rev, predicted[0], actual) 
    for (rev, predicted, actual) in zip(test_reviews_ss, predictions, test_y)
    if predicted[0] != int(actual)
]
for rev, predicted, actual in incorrect_predictions[:3]:
    print(f"{rev}")
    print(f"Predicted {predicted}, actual {int(actual)}\n\n")

this is wild surreal stuff , but brilliant and the camera just kind of sits there and lets you look at this and its like you're going from one room to the next and none of them have any relation to the other . 

Predicted 0, actual 1


it's aaaa demented kitsch mess ( although the smeary digital video does match the muddled narrative ) , but it's savvy about celebrity and has more guts and energy than much of what will open this year . 

Predicted 0, actual 1


aaaa movie with aaaa real anarchic flair . 

Predicted 0, actual 1




#### We need a function to generate a powerset of features; but only up to maximum set size of k

In [32]:
def get_powerset_to_k(seq, k, init=True):
    if init:
        seq = [s if type(s) == frozenset else frozenset([s]) for s in seq]
    if len(seq) <= 1:
        yield seq[0]
        yield frozenset()
    else: 
        for item in get_powerset_to_k(seq[1:], k, False):
            if len(item) <= k - 1:
                yield seq[0].union(item)
            yield item
            
def get_powerset_to_k_ex_emptyset(seq, k):
    gen = get_powerset_to_k(seq, k)
    for item in gen:
        if item != frozenset():
            yield item

In [22]:
def choose(n, k):
    return scipy.special.comb(n, k)

#### The q values allow us to probability-weight our sampled results

In [23]:
def get_q_values_for_k(k, num_local_features):
    q = np.zeros(num_local_features, dtype=np.longdouble) 
    q0den = sum([
        choose(num_local_features, s) for s in range(1, k+1)
    ])
    q[0] = 1/q0den 
    for r in range(1, num_local_features):
        limd = min(k, (num_local_features - r))
        limn = max((r - k), 0)
        qden = sum([
            choose(num_local_features - r, s) for s in range(1, limd+1)
        ])
        qnum = sum([
            choose(r, s) * q[s] for s in range(limn, r)
        ])
        q[r] = qnum / qden
    # Check that the checksum is satisfied
    checksum = sum([
        choose(num_local_features, i) * q[i] 
        for i in range((num_local_features - k), num_local_features)
    ])
    assert(np.isclose(checksum, 1.0))
    return q

_all_q_values = [get_q_values_for_k(_k, num_DSA_features) for _k in range(1, k+1)]
q_values = np.vstack(_all_q_values)

### Take a single sample review

In [24]:
sample_review_oh = oh_test.iloc[0]
print(f'sample review: {test_reviews_ss[0]}')
all_features_set = set(range(1, num_DSA_features + 1))

sample review: australian actor/director john polson and award-winning english cinematographer giles nuttgens make aaaa terrific effort at disguising the obvious with energy and innovation . 



## The Joint Shapley calculation function

In [25]:
def calculate_phi_t(
    review_oh,
    value_function,
    coalition_vc,
    num_features,
    q_values,
    num_sampling_iterations,
):
    coalition_size = len(coalition_vc)
    p_values = np.zeros((num_features - coalition_size + 1), dtype=np.longdouble)
    for i in range(1, num_features - coalition_size + 2):
        q_value = q_values[k-1, i-1]
        num_ft_choose_len = choose((num_features - coalition_size), (i - 1))
        p_values[i - 1] = num_ft_choose_len * q_value
    hat_p_values = p_values / np.sum(p_values)
    estimates = np.zeros(num_sampling_iterations)

    for itr in range(0, num_sampling_iterations):
        U = np.random.uniform(0, 1, 1)
        X = sum(U > hat_p_values.cumsum())
        S = set()
        for w in range(1, X + 1):
            possible_next = set(range(1, num_features + 1)).difference(
                set(coalition_vc).union(S))
            r = random.sample(possible_next, 1)
            S = S.union(set(r))

        z = np.random.randint(0, 2, num_features)
        xminusT = np.copy(review_oh)
        xplusT = np.copy(review_oh)
        for h in set(range(0, num_features)).difference(S):
            xminusT[h] = z[h]
        for h in set(range(0, num_features)).difference(S.union(set(coalition_vc))):
            xplusT[h] = z[h]

        estimates[itr] = value_function(xplusT) - value_function(xminusT)
    return np.mean(estimates) * np.sum(p_values)

#### We use a 'value function' to give us the value of a given coalition; in this case the 'value' is the prediction it receives from our ML model

In [26]:
value_f = lambda oh: model.predict(oh[None, :])[0][0]

#### We look at our target vocabulary, from Socher et al and DSA

In [27]:
# Obtain bigrams used in Dhamdhere et al as strings
t2dsa = target_vocab["t2dsa"]
# Vectorize bigrams
t2dsa_vc = ss_to_vc(t2dsa, DSA_vectorizer)
# Add both bigrams and single words from bigrams to list of coalitions
DSA_coalitions = [cln for cln in t2dsa_vc] + [[word] for cln in t2dsa_vc for word in cln]

#### We can now calculate Joint Shapley values for the Dhamdhere et al coalitions

In [28]:
test_reviews_vc = ss_to_vc(test_reviews_ss, DSA_vectorizer)
num_review = 38 # sets the review to be analysed
sample_review_oh = oh_test.iloc[num_review]
print(f'test_reviews_vc[num_review] = {test_reviews_vc[num_review]}')

num_times_to_loop = 60

with open("results/250528-review-38-convergence.txt", "a") as a_writer:
    a_writer.write(f'test_reviews_vc[num_review] = {test_reviews_vc[num_review]}\n')

    for loop_num in range(num_times_to_loop+1):
        a_writer.write(f'loop number = {loop_num}\n')
        for coalition in DSA_coalitions:
            js = calculate_phi_t(
                sample_review_oh,
                value_f,
                coalition,
                num_DSA_features,
                q_values,
                num_sampling_iterations
            )
            a_writer.write(f'k = {k}, coalition {coalition}: local jShapley = {js}\n')


test_reviews_vc[num_review] = [44, 101, 434, 447, 677, 860, 946]


#### Iterate through all the reviews in the test set, calculating local Joint Shapleys for the DSA features

In [29]:
for n, review_oh in enumerate(oh_test.iloc):
    for coalition in DSA_coalitions:
        js = calculate_phi_t(
            review_oh,
            value_f,
            coalition,
            num_DSA_features,
            q_values,
            num_sampling_iterations
        )
        print(f'k = {k}, review {n}, coalition {coalition}: local jShapley = {js}')

k = 2, review 0, coalition [209, 978]: local jShapley = -1.4536792479434422e-05
k = 2, review 0, coalition [101, 434]: local jShapley = -2.6261929950256325e-05
k = 2, review 0, coalition [3, 174]: local jShapley = -3.5400720296794657e-11
k = 2, review 0, coalition [269, 1003]: local jShapley = -3.3662190923019074e-06
k = 2, review 0, coalition [455, 848]: local jShapley = -7.667796016285722e-08
k = 2, review 0, coalition [209]: local jShapley = 7.586823181546724e-05
k = 2, review 0, coalition [978]: local jShapley = -0.00011701161234277151
k = 2, review 0, coalition [101]: local jShapley = -0.00032251380209730797
k = 2, review 0, coalition [434]: local jShapley = -5.3950716090245555e-06
k = 2, review 0, coalition [3]: local jShapley = -5.924625644474126e-06
k = 2, review 0, coalition [174]: local jShapley = 3.1390724837718088e-06
k = 2, review 0, coalition [269]: local jShapley = -0.0001072492479944602
k = 2, review 0, coalition [1003]: local jShapley = -6.350541801654e-05
k = 2, revie

#### Also obtain Joint Shapley values for the target vocab

In [30]:
target_vocab_DSA_scores = {}
for key, ss_reviews in target_vocab_DSA.items():
    scores = []
    for review in ss_to_vc(ss_reviews, DSA_vectorizer):
        try:
            score = calculate_phi_t(
                sample_review_oh,
                value_f,
                review,
                num_DSA_features,
                q_values,
                num_sampling_iterations
            )
        except:
            score = np.nan
        scores.append(score)
    target_vocab_DSA_scores[key] = scores

In [31]:
target_vocab_DSA_scores

{'t2dsa': [1.0062595743163387325e-06,
  9.0593393299522259e-07,
  8.121869255292608737e-06,
  -6.866559713568269947e-08,
  -3.9334471936654678022e-05]}

#### Obtain local Joint Shapleys for the training set

In [32]:
local_js = []
for review_vc in ss_to_vc(test_reviews_ss, DSA_vectorizer): 
    local_js.append(
        calculate_phi_t(
            sample_review_oh,
            value_f,
            review_vc,
            num_DSA_features,
            q_values,
            num_sampling_iterations
        )
    )

In [33]:
local_js

[-1.1322626647038326893e-09,
 -1.6478246073423960825e-08,
 -5.1716609129494857498e-08,
 1.9939514072793051136e-08,
 4.1499743305214087153e-12,
 0.0,
 0.0,
 -8.93024094242625624e-10,
 -2.8146133094570175849e-12,
 -1.0620973333741364646e-05,
 2.8146133094570175849e-12,
 0.0,
 2.6487579446290687472e-07,
 2.5573004903703270808e-10,
 1.1888262152019613777e-11,
 -1.6837771661740714121e-09,
 -2.1683162717266605812e-12,
 1.3315847490925818429e-06,
 -2.1949567235703324561e-07,
 1.07053090467383769776e-10,
 -9.985989976622805171e-07,
 -3.673195992550957918e-12,
 7.151785185984896279e-12,
 3.0487526738172951637e-10,
 -0.00024773678933588479461,
 2.748689053068451534e-06,
 -6.1340699392805165527e-10,
 9.879449297785135127e-08,
 5.2546118711926692895e-09,
 1.6330464031927411136e-08,
 9.7483749646560824526e-11,
 -1.5621103867486446229e-10,
 -6.7318346241835709894e-10,
 9.703094002763168583e-11,
 1.05033942531624429474e-08,
 -1.2748149033042163715e-08,
 -2.1796058712134280095e-06,
 -6.056190172465875