In [1]:
from numpy.random import seed
# Seed NumPy's global random number generator so repeated runs draw the same random numbers.
seed(1010)

## Implementation in Scikit-Learn

![](https://www.evernote.com/l/AAGiYGcKcIxIaJ7sCg97K9JDtUO2dY9mywoB/image.png)

### Raw Text Data

<img src="https://www.evernote.com/l/AAFfAyDQQ1xGPLTIxT2hcUSLrHuQDbYzsuYB/image.png" width=600px>

Here each line of text is a **document**, and the collection of all documents is the **corpus** (literally, the *body* of text).

In [4]:
%sh curl --remote-name-all 'https://joshua-databricks.s3-us-west-2.amazonaws.com/text-data/pagesParsed.json'
# Downloads pagesParsed.json into the current working directory; --remote-name-all keeps the remote file name.

In [5]:
# Copy the downloaded file from the driver's local filesystem ("file:/...") into DBFS ("dbfs:/...").
# NOTE(review): `dbutils` is never defined or imported in this notebook; presumably it is supplied
# by a Databricks runtime. Confirm before running anywhere else.
dbutils.fs.cp("file:/databricks/driver/pagesParsed.json", "dbfs:/FileStore/tmp/pagesParsed.json")

In [2]:
from pyspark.sql.session import SparkSession
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Load the parsed pages from JSON into a Spark DataFrame.
# NOTE(review): this relative path is not the DBFS destination that the `dbutils.fs.cp`
# cell writes to; confirm which location is the current one.
wikiDF = spark.read.json("../data/pagesParsed.json")

# Convert the Spark DataFrame to a pandas DataFrame; later cells read its `text` column.
wiki_df = wikiDF.toPandas()
# display(wiki_df) 

### Document-Term Matrix

<img src="https://www.evernote.com/l/AAFtjaKOjT5CYr5N_NPHKU6vpBWNnBgbWLIB/image.png" width=600px>

The Document-Term Matrix can be created using the `TfidfVectorizer` model [[doc]](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) in Scikit-Learn.

In [3]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def no_number_preprocessor(tokens):
    """Lower-case a piece of text and delete every run of digits from it.

    Intended for use as the ``preprocessor`` callable of a scikit-learn text
    vectorizer.

    Parameters
    ----------
    tokens : str
        The text to clean. NOTE(review): despite the name, the vectorizer is
        expected to pass a whole raw document here, not a token list; confirm
        against the scikit-learn documentation.

    Returns
    -------
    str
        ``tokens`` lower-cased, with all digit characters removed. Characters
        around the removed digits (spaces, punctuation) are left untouched.
    """
    # Raw string: a bare '\d' inside a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    return re.sub(r'\d+', '', tokens.lower())
  
# Passing a custom `preprocessor` replaces scikit-learn's own preprocessing stage,
# so the callable itself is responsible for lower-casing as well as stripping digits.
vectorizer = TfidfVectorizer(
    preprocessor=no_number_preprocessor,
    stop_words='english',
)
# Fit on the `text` column and keep the resulting TF-IDF document-term matrix.
bag_of_words = vectorizer.fit_transform(wiki_df["text"])

In [4]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# The result is kept as a plain list because a later cell calls `feature_names.index(word)`.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [5]:
from sklearn.feature_selection import mutual_info_regression

In [6]:
# Three topics, ten words each. The words are looked up in the fitted
# vocabulary by a later cell.
topics = [
    [
        "american", "born", "died", "pianist", "jazz",
        "league", "baseball", "ball", "composer", "saxophonist",
    ],
    [
        "baseball", "batter", "ball", "pitcher", "base",
        "team", "league", "runner", "game", "home",
    ],
    [
        "jazz", "new", "records", "released", "label",
        "labels", "music", "recordings", "record", "musicians",
    ],
]

In [7]:
# Translate each topic's words into their positions in `feature_names`
# (i.e. column numbers of the document-term matrix). A word that is missing
# from the vocabulary makes `list.index` raise ValueError.
topic_indices = [list(map(feature_names.index, words)) for words in topics]

topic_indices

[[1363, 5290, 12129, 34284, 23275, 25767, 3559, 3191, 9078, 39924],
 [3559, 3705, 3191, 34520, 3558, 45205, 25767, 39240, 17740, 20963],
 [23275, 31070, 37153, 37574, 25246, 25251, 30377, 37149, 37142, 30391]]

In [8]:
import pandas as pd

In [9]:
# Densify the sparse TF-IDF matrix into a DataFrame so columns can be selected by
# integer label. The isinstance guard makes the cell safe to re-run: without it a
# second execution would call `.todense()` on the DataFrame and fail.
# NOTE(review): the dense frame is large (documents x vocabulary); watch memory.
if not isinstance(bag_of_words, pd.DataFrame):
    # `.toarray()` yields a plain ndarray; `.todense()` yields the legacy `np.matrix` type.
    bag_of_words = pd.DataFrame(bag_of_words.toarray())

In [10]:
# Sanity check of the dense matrix size: (number of documents, number of vocabulary terms).
bag_of_words.shape

(1848, 51090)

In [26]:
def PMI(i1, i2, random_state=None):
    """Estimate the mutual information between two columns of ``bag_of_words``.

    Despite the name, this is not a *pointwise* mutual information: it delegates
    to scikit-learn's ``mutual_info_regression``, using column ``i1`` as the
    single feature and column ``i2`` as the target.

    Parameters
    ----------
    i1, i2 : column labels of the module-level ``bag_of_words`` DataFrame
        The two term columns to compare.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_regression``. The default ``None`` keeps the
        original behaviour; pass an int to make a single call repeatable
        regardless of the order in which cells were executed.

    Returns
    -------
    The first (and only) entry of the estimator's result for the single feature.
    """
    feature = bag_of_words[[i1]]  # double brackets keep a 2-D, one-column frame
    target = bag_of_words[i2]
    return mutual_info_regression(feature, target, random_state=random_state)[0]

In [27]:
from itertools import combinations

In [28]:
def coherence(topic_indices, pair_score=None):
    """Sum a pairwise score over every unordered pair of a topic's term indices.

    Parameters
    ----------
    topic_indices : iterable
        Term indices belonging to one topic. Pairs are generated with
        ``itertools.combinations(topic_indices, 2)``, so each unordered pair is
        scored exactly once, in the order the indices appear.
    pair_score : callable(i1, i2) -> number, optional
        Scoring function applied to each pair. Defaults to the module-level
        ``PMI`` function (looked up when ``coherence`` is called).

    Returns
    -------
    The sum of the pair scores; ``0`` when there are fewer than two indices.
    """
    if pair_score is None:
        pair_score = PMI
    # Use the built-in sum() rather than a local accumulator named `sum`,
    # which would shadow the builtin inside this function.
    return sum(pair_score(first, second) for first, second in combinations(topic_indices, 2))

In [29]:
# Coherence score of the first topic (`topic_indices[0]`).
coherence(topic_indices[0])

5.0211433118614295

In [30]:
# Coherence score of the second topic (`topic_indices[1]`).
coherence(topic_indices[1])

5.825909846748949

In [31]:
# Coherence score of the third topic (`topic_indices[2]`).
coherence(topic_indices[2])

6.91645245305084