In [11]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

In [2]:
# Boilerplate Spark: configure a local master and obtain the SparkContext.
# getOrCreate() returns the already-running context when one exists, so this
# cell can be re-run in the same kernel without raising on a second context.
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext.getOrCreate(conf=conf)

In [46]:
# Read the data set and split every line into its tab-separated fields
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda line: line.split("\t"))
# The field at index 3 is split on single spaces to form each document's word list
documents = fields.map(lambda row: row[3].split(" "))

In [47]:
# NOTE(review): `values` is referenced here without being called, so this cell
# only displays the bound method object, not any data from the RDD.
fields.values

<bound method RDD.values of PythonRDD[52] at RDD at PythonRDD.scala:53>

In [48]:
# Keep the field at index 1 of every row as that document's name
documentNames = fields.map(lambda row: row[1])

In [49]:
# Turn each document's word list into a term-frequency vector by hashing words
# into buckets; the vector size is capped at 100K buckets to save some memory.
hashingTF = HashingTF(numFeatures=100000)
tf = hashingTF.transform(documents)

In [50]:
# Compute the TF*IDF of each term in each document.
# tf is cached (cache() returns the same RDD) because it is consumed twice:
# once to fit the IDF model and once more when the model is applied.
idf = IDF(minDocFreq=2).fit(tf.cache())
tfidf = idf.transform(tf)

In [57]:
# Hash the search term on its own: the sparse vector HashingTF returns for a
# one-word input carries the bucket index that word is mapped to.
word = "Arizona"
hashingTFTransform = hashingTF.transform([word])

In [58]:
# First (and only) bucket index of the search term, converted to a built-in int
wordHashValue = int(hashingTFTransform.indices[0])
# Display the bucket index as the cell output
wordHashValue

39304

In [59]:
# Pull the TF*IDF value stored at the search term's bucket for every document
wordRelevance = tfidf.map(lambda vector: vector[wordHashValue])

# Pair each value with its document name so we can see which is which
zippedResults = wordRelevance.zip(documentNames)

# Report the largest (value, name) pair, i.e. the best-scoring document
print("Best document for {} is:".format(word))
bestMatch = zippedResults.max()
print(bestMatch)

Best document for Arizona is:
(9.893510600012988, 'Apollo program')
