In [13]:
from pyspark import SparkContext
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import json
import os
# Notebook-wide SparkContext created with default settings (no arguments are
# passed); get_rdd below reads this global to open text files.
sc = SparkContext()

In [14]:
def get_rdd(base, input, num_part):
    """Read a text file of JSON lines into a cached RDD of parsed records.

    Args:
        base: directory that holds the file.
        input: file name (or relative path) joined onto ``base``.
        num_part: forwarded to ``sc.textFile`` as its second positional
            argument (``minPartitions`` in PySpark).
    Returns:
        The RDD obtained by applying ``json.loads`` to every line, with
        ``cache()`` already called on it.
    """
    # Uses the notebook-level SparkContext ``sc``; cache() returns the RDD
    # it was called on, so the pipeline can be returned directly.
    source_path = os.path.join(base, input)
    return sc.textFile(source_path, num_part).map(json.loads).cache()

In [15]:
def tf(tokens):
    """Compute normalized term frequencies for one document.

    Args:
        tokens (list of str): tokens of the document; may be empty.
    Returns:
        dictionary: token -> (occurrences of the token) / len(tokens).
        An empty token list yields an empty dictionary rather than raising
        ZeroDivisionError.
    """
    total = len(tokens)
    if total == 0:
        return {}
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    # Divide each integer count once instead of summing 1/len(tokens) per
    # occurrence, so floating-point rounding error does not accumulate.
    # float() keeps the division true division on Python 2 as well.
    denom = float(total)
    return {tok: count / denom for tok, count in counts.items()}

In [16]:
def idfs(corpus):
    """Compute an inverse-document-frequency weight for every token.

    Args:
        corpus (RDD): records whose element at index 1 is an iterable of
            tokens (e.g. ``(key, tokens)`` pairs).
    Returns:
        RDD: ``(token, N / df)`` pairs, where ``N`` is the number of records
        in ``corpus`` and ``df`` is the number of records containing the
        token. The ratio is returned as-is; no logarithm is applied.
    """
    N = float(corpus.count())
    # set() makes each record contribute at most one count per token, so the
    # reduced sum is a document frequency rather than a raw term count.
    doc_hits = corpus.flatMap(lambda rec: [(tok, 1) for tok in set(rec[1])])
    doc_freqs = doc_hits.reduceByKey(lambda a, b: a + b)
    # Index into the pair instead of using a tuple parameter: that form is
    # valid on both Python 2 and Python 3.
    return doc_freqs.map(lambda pair: (pair[0], N / pair[1]))

In [17]:
def tfidf(tokens, idfs):
    """Weight each token's term frequency by its IDF value.

    Args:
        tokens (list of str): tokens of one document, as accepted by ``tf``.
        idfs (dictionary): token -> IDF value; it is indexed with every
            distinct token, so a token missing from it raises KeyError.
    Returns:
        dictionary: token -> ``idfs[token] * tf(tokens)[token]``.
    """
    weights = {}
    # tf() already holds exactly one entry per distinct token, so walking it
    # covers the same keys as walking the raw token list.
    for term, freq in tf(tokens).items():
        weights[term] = idfs[term] * freq
    return weights

In [18]:
num_part = 4
revs = get_rdd('../data', 'reviews_electronics5000.json', num_part)
# (asin, reviewText) pairs, one per parsed review record.
rev_texts = revs.map(lambda x: (x['asin'], x['reviewText']))
# Gather all review texts sharing an asin into one list; mapValues leaves the
# key untouched and avoids Python-2-only tuple-parameter lambdas.
rev_agg_texts = rev_texts.mapValues(lambda text: [text]).reduceByKey(lambda x, y: x + y)
# Join each asin's texts with single spaces and split the result into tokens.
rev_agg = rev_agg_texts.mapValues(lambda texts: word_tokenize(' '.join(texts)))
# RDDs are immutable, so a bare rev_agg.map(...) without assignment has no
# effect; term frequencies are derived from rev_agg where they are needed.
rev_agg.cache()

PythonRDD[7] at RDD at PythonRDD.scala:43

In [26]:
# Term frequencies: replace each record's token list with its tf() dictionary,
# keeping the asin key in front.
tfs = rev_agg.map(lambda rec: (rec[0], tf(rec[1])))

In [20]:
# IDF weights.
# Use the whole category (every record in rev_agg) as the IDF corpus. The
# result is an RDD of (token, idf) pairs, not a Python dictionary.
idfs_cat = idfs(rev_agg)

In [22]:
# Peek at five (token, idf) pairs as a quick sanity check.
idfs_cat.take(5)

[(u'DVD+R', 299.0),
 (u'1,2', 299.0),
 (u'four', 13.0),
 (u'gag', 299.0),
 (u'recommended.UPDATE', 299.0)]

In [23]:
# Fetch the IDF value(s) stored under the token '1,2'; lookup() returns them
# as a list.
idfs_cat.lookup('1,2')

[299.0]

In [21]:
# Show the first ten (asin, tokens) records. Mapping each pair onto itself is
# an identity transformation, so take() is called on rev_agg directly.
# NOTE(review): the traceback recorded under this cell (SPARK-5063) does not
# match this code -- presumably it was produced by an earlier revision that
# referenced an RDD inside the lambda; re-run the cell to refresh the output.
rev_agg.take(10)

Exception: It appears that you are attempting to broadcast an RDD or reference an RDD from an action or transformation. RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.