## Implementation of TF-IDF

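Motivation: Spark MLlib's TF-IDF tools are designed to produce feature vectors for ML algorithms. 
With that implementation, it is not straightforward to look up the weight of a particular term in a particular document. This notebook computes TF-IDF with plain DataFrame operations instead, so every (document, term) pair gets an explicit weight.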

In [187]:
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, IntegerType, FloatType

In [162]:
# Toy corpus: each string is one document.
sentences = [
    "one green one white and one pink",
    "no green is blue",
    "no dark blue no light blue and no pink",
]

In [163]:
# Pair every tokenized sentence with an integer id: (doc_id, [tokens...]).
# Materialized as a list on purpose: a bare enumerate() is a one-shot iterator,
# so re-running a later cell that consumes `data` would otherwise see it empty.
data = list(enumerate(s.split() for s in sentences))

In [164]:
# Schema of the input frame: an integer document id and the document's tokens
# as an array of strings. Both fields are nullable.
schema = (
    StructType()
    .add("doc_id", IntegerType(), True)
    .add("document", ArrayType(StringType()), True)
)

In [165]:
# NOTE(review): `spark` is not created anywhere in this notebook; presumably it
# is the SparkSession injected by the kernel/shell. Confirm before a fresh run.
df = spark.createDataFrame(data, schema=schema)

In [166]:
# Print up to 5 rows without truncating the token arrays.
df.show(n=5, truncate=False)

+------+------------------------------------------------+
|doc_id|document                                        |
+------+------------------------------------------------+
|0     |[one, green, one, white, and, one, pink]        |
|1     |[no, green, is, blue]                           |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|
+------+------------------------------------------------+

### Calculate Term Frequency

In [167]:
# explode() emits one row per element of the `document` array; the new `token`
# column holds that element while the other columns are repeated.
unfolded_df = df.withColumn("token", f.explode("document"))

In [168]:
# Print up to 20 exploded rows, untruncated.
unfolded_df.show(n=20, truncate=False)

+------+------------------------------------------------+-----+
|doc_id|document                                        |token|
+------+------------------------------------------------+-----+
|0     |[one, green, one, white, and, one, pink]        |one  |
|0     |[one, green, one, white, and, one, pink]        |green|
|0     |[one, green, one, white, and, one, pink]        |one  |
|0     |[one, green, one, white, and, one, pink]        |white|
|0     |[one, green, one, white, and, one, pink]        |and  |
|0     |[one, green, one, white, and, one, pink]        |one  |
|0     |[one, green, one, white, and, one, pink]        |pink |
|1     |[no, green, is, blue]                           |no   |
|1     |[no, green, is, blue]                           |green|
|1     |[no, green, is, blue]                           |is   |
|1     |[no, green, is, blue]                           |blue |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|no   |
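|2     |[no, dark, blue, no, light, blue, and, no, pink]|dark |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|blue |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|no   |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|light|
|2     |[no, dark, blue, no, light, blue, and, no, pink]|blue |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|and  |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|no   |
|2     |[no, dark, blue, no, light, blue, and, no, pink]|pink |
+------+------------------------------------------------+-----+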

In [194]:
# Term frequency (tf): how many times each token occurs within each document.
df_TF = (
    unfolded_df
    .groupBy("doc_id", "token")
    .agg(f.count("document").alias("tf"))
)

In [170]:
# Print up to 20 (doc_id, token, tf) rows, untruncated.
df_TF.show(n=20, truncate=False)

+------+-----+---+
|doc_id|token|tf |
+------+-----+---+
|0     |and  |1  |
|0     |green|1  |
|1     |no   |1  |
|2     |light|1  |
|2     |no   |3  |
|1     |is   |1  |
|0     |pink |1  |
|2     |and  |1  |
|1     |blue |1  |
|2     |pink |1  |
|2     |dark |1  |
|0     |white|1  |
|2     |blue |2  |
|1     |green|1  |
|0     |one  |3  |
+------+-----+---+

### Calculate Inverse Document Frequency

In [172]:
# Document frequency (df): the number of distinct documents that contain a given token.
df_DF = (
    unfolded_df
    .groupBy("token")
    .agg(f.countDistinct("doc_id").alias("df"))
)

In [173]:
# Print up to 20 (token, df) rows, untruncated.
df_DF.show(n=20, truncate=False)

+-----+---+
|token|df |
+-----+---+
|green|2  |
|one  |1  |
|light|1  |
|white|1  |
|is   |1  |
|pink |2  |
|dark |1  |
|and  |2  |
|no   |2  |
|blue |2  |
+-----+---+

In [176]:
# Total number of documents (rows of `df`), stored as a float.
num_docs = float(df.count())

In [192]:
# Smoothed inverse document frequency: idf = ln((N + 1) / (df + 1)),
# where N is `num_docs` and df is the token's document frequency.
df_IDF = df_DF.withColumn(
    "idf",
    f.log((num_docs + 1) / (f.col("df") + 1)),
)

In [193]:
# Print up to 20 (token, df, idf) rows, untruncated.
df_IDF.show(n=20, truncate=False)

+-----+---+-------------------+
|token|df |idf                |
+-----+---+-------------------+
|green|2  |0.28768207245178085|
|one  |1  |0.6931471805599453 |
|light|1  |0.6931471805599453 |
|white|1  |0.6931471805599453 |
|is   |1  |0.6931471805599453 |
|pink |2  |0.28768207245178085|
|dark |1  |0.6931471805599453 |
|and  |2  |0.28768207245178085|
|no   |2  |0.28768207245178085|
|blue |2  |0.28768207245178085|
+-----+---+-------------------+

### Extract TF-IDF

In [196]:
# Attach each token's idf to its per-document tf rows and multiply: tf_idf = tf * idf.
df_TF_IDF = (
    df_TF
    .join(df_IDF, on="token", how="left")
    .withColumn("tf_idf", f.col("tf") * f.col("idf"))
)

In [197]:
# Inspect the column names, types and nullability of the joined result.
df_TF_IDF.printSchema()

root
 |-- token: string (nullable = true)
 |-- doc_id: integer (nullable = true)
 |-- tf: long (nullable = false)
 |-- df: long (nullable = true)
 |-- idf: double (nullable = true)
 |-- tf_idf: double (nullable = true)

In [198]:
# Print up to 20 rows of the joined tf / df / idf / tf_idf table, untruncated.
df_TF_IDF.show(n=20, truncate=False)

+-----+------+---+---+-------------------+-------------------+
|token|doc_id|tf |df |idf                |tf_idf             |
+-----+------+---+---+-------------------+-------------------+
|green|0     |1  |2  |0.28768207245178085|0.28768207245178085|
|green|1     |1  |2  |0.28768207245178085|0.28768207245178085|
|one  |0     |3  |1  |0.6931471805599453 |2.0794415416798357 |
|light|2     |1  |1  |0.6931471805599453 |0.6931471805599453 |
|white|0     |1  |1  |0.6931471805599453 |0.6931471805599453 |
|is   |1     |1  |1  |0.6931471805599453 |0.6931471805599453 |
|pink |0     |1  |2  |0.28768207245178085|0.28768207245178085|
|pink |2     |1  |2  |0.28768207245178085|0.28768207245178085|
|dark |2     |1  |1  |0.6931471805599453 |0.6931471805599453 |
|and  |0     |1  |2  |0.28768207245178085|0.28768207245178085|
|and  |2     |1  |2  |0.28768207245178085|0.28768207245178085|
|no   |1     |1  |2  |0.28768207245178085|0.28768207245178085|
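|no   |2     |3  |2  |0.28768207245178085|0.8630462173553426 |
|blue |1     |1  |2  |0.28768207245178085|0.28768207245178085|
|blue |2     |2  |2  |0.28768207245178085|0.5753641449035617 |
+-----+------+---+---+-------------------+-------------------+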

## Summarized code

In [203]:
def calc_tf_idf(df, id_col="doc_id", tokens_col="document"):
    """Compute a TF-IDF weight for every (document, token) pair.

    Args:
        df: Spark DataFrame holding a document-id column and a column with an
            array of tokens (the token column is passed to ``explode``).
        id_col: Name of the document-id column.
        tokens_col: Name of the token-array column.

    Returns:
        A DataFrame with the columns ``id_col``, ``token`` and ``tf_idf``,
        where ``tf_idf = tf * ln((N + 1) / (df + 1))``:

        * ``tf`` is the number of occurrences of the token under that id,
        * ``df`` is the number of distinct ids whose tokens include it,
        * ``N`` is the row count of ``df``.

    Note:
        ``N`` counts input rows, whereas ``tf`` and ``df`` are grouped by
        ``id_col``; the two agree when each id appears on exactly one row.
    """
    # Number of input rows, kept as a float.
    n_docs = float(df.count())

    # One row per token occurrence.
    tokens = df.withColumn("token", f.explode(f.col(tokens_col)))

    # Term frequency per (id, token).
    term_freq = (
        tokens
        .groupBy(id_col, "token")
        .agg(f.count(tokens_col).alias("tf"))
    )

    # Document frequency per token, then the smoothed IDF derived from it.
    doc_freq = tokens.groupBy("token").agg(f.countDistinct(id_col).alias("df"))
    inv_doc_freq = doc_freq.withColumn(
        "idf",
        f.log((n_docs + 1) / (f.col("df") + 1)),
    )

    # Combine both measures into the final weight.
    weighted = (
        term_freq
        .join(inv_doc_freq, on="token", how="left")
        .withColumn("tf_idf", f.col("tf") * f.col("idf"))
    )

    return weighted.select(id_col, "token", "tf_idf")

In [204]:
# Run the summarized function on the toy corpus built above.
test = calc_tf_idf(
    df,
    id_col="doc_id",
    tokens_col="document",
)

In [205]:
# Print up to 20 (doc_id, token, tf_idf) rows, untruncated.
test.show(n=20, truncate=False)

+------+-----+-------------------+
|doc_id|token|tf_idf             |
+------+-----+-------------------+
|0     |green|0.28768207245178085|
|1     |green|0.28768207245178085|
|0     |one  |2.0794415416798357 |
|2     |light|0.6931471805599453 |
|0     |white|0.6931471805599453 |
|1     |is   |0.6931471805599453 |
|0     |pink |0.28768207245178085|
|2     |pink |0.28768207245178085|
|2     |dark |0.6931471805599453 |
|0     |and  |0.28768207245178085|
|2     |and  |0.28768207245178085|
|1     |no   |0.28768207245178085|
|2     |no   |0.8630462173553426 |
|1     |blue |0.28768207245178085|
|2     |blue |0.5753641449035617 |
+------+-----+-------------------+