In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

import pandas as pd
import numpy as np
from IPython.display import display

from collections import defaultdict, Counter
from sklearn import metrics

In [2]:
savm = sqlContext.sql("select * from ignite.savm_parsed").repartition(100).cache()

In [3]:
grouped = savm.map(lambda x : (x.sales_acct_id, x.tokenized_name)).reduceByKey(lambda x, y : x + y).toDF(['sales_acct_id', 'words'])

In [4]:
count_vectorizer = CountVectorizer(inputCol = 'words', outputCol = 'tf', vocabSize = 1 << 20).fit(grouped)
builder = count_vectorizer.transform(grouped)
tfidf = IDF(inputCol="tf", outputCol="idf").fit(builder).transform(builder).cache()

In [5]:
def normalize(idf_vector, words):
    normalizing_factor = len(words) + 1
    values = []
    for i in range(len(idf_vector.indices)):
        values.append(idf_vector.values[i] / normalizing_factor)
    return Vectors.sparse(idf_vector.size, idf_vector.indices, values)

def true_idf(tf_vector, idf_vector):
    indices = tf_vector.indices
    values = []
    for i in range(len(indices)):
        values.append(idf_vector.values[i] / tf_vector.values[i])
    return Vectors.sparse(len(indices), indices, values)

normalized_tfidf = tfidf.withColumn('normalized_tfidf', F.udf(normalize, VectorUDT())(F.col('idf'), F.col('words'))).repartition(100).cache()
normalized_tfidf = normalized_tfidf.withColumn('true_idf', F.udf(true_idf, VectorUDT())(F.col('tf'), F.col('idf')))

In [6]:
normalized_tfidf.take(5)

[Row(sales_acct_id=254312596.0, words=[u'trung', u'tam', u'cong', u'nghe', u'thong', u'tin', u'thua', u'thien', u'hue'], tf=SparseVector(480948, {2393: 1.0, 5884: 1.0, 6216: 1.0, 9079: 1.0, 10399: 1.0, 13458: 1.0, 21438: 1.0, 36618: 1.0, 136344: 1.0}), idf=SparseVector(480948, {2393: 6.2787, 5884: 7.7897, 6216: 8.2171, 9079: 8.1808, 10399: 8.079, 13458: 8.4219, 21438: 9.0794, 36618: 9.9443, 136344: 11.513}), normalized_tfidf=SparseVector(480948, {2393: 0.6279, 5884: 0.779, 6216: 0.8217, 9079: 0.8181, 10399: 0.8079, 13458: 0.8422, 21438: 0.9079, 36618: 0.9944, 136344: 1.1513}), true_idf=SparseVector(9, {2393: 6.2787, 5884: 7.7897, 6216: 8.2171, 9079: 8.1808, 10399: 8.079, 13458: 8.4219, 21438: 9.0794, 36618: 9.9443, 136344: 11.513})),
 Row(sales_acct_id=253403496.0, words=[u'public', u'servants', u'social', u'security', u'agency'], tf=SparseVector(480948, {101: 1.0, 177: 1.0, 192: 1.0, 224: 1.0, 43900: 1.0}), idf=SparseVector(480948, {101: 4.1425, 177: 5.2936, 192: 5.5032, 224: 5.931, 4

In [9]:
normalized_tfidf.write.saveAsTable('ignite.topic_modeling_savm_tfidf', mode = 'overwrite')

In [39]:
indexed_vocab = sc.parallelize([([v, v], ) for v in count_vectorizer.vocabulary]).toDF(['words']).cache()

In [46]:
indexed_vocab.take(1)

[Row(words=[u'at&t', u'at&t'])]

In [49]:
count_vectorizer.transform(indexed_vocab).take(5)

[Row(words=[u'at&t', u'at&t'], tf=SparseVector(480948, {0: 2.0})),
 Row(words=[u'inc', u'inc'], tf=SparseVector(480948, {1: 2.0})),
 Row(words=[u'corporation', u'corporation'], tf=SparseVector(480948, {2: 2.0})),
 Row(words=[u'services', u'services'], tf=SparseVector(480948, {3: 2.0})),
 Row(words=[u'llc', u'llc'], tf=SparseVector(480948, {4: 2.0}))]

In [53]:
def get_index(tf_vector):
    return tf_vector.indices[0]

indexed = count_vectorizer.transform(indexed_vocab).map(lambda x : (x.words[0], int(x.tf.indices[0]))).toDF(['word', 'index']).cache()

In [12]:
def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area

def norm_max(values):
    return float(max(values) / np.linalg.norm(values))

norm_max([1, 3, 3, 3])

0.5669467095138409

In [13]:
def flatten_tfidfs(tfidf_vector):
    tfidf_pairs = []
    for i in range(len(tfidf_vector.indices)):
        tfidf_pairs.append((int(tfidf_vector.indices[i]), float(tfidf_vector.values[i])))
    return tfidf_pairs

In [14]:


gini_coefficients = normalized_tfidf.flatMap(lambda x : flatten_tfidfs(x.normalized_tfidf)).groupByKey().mapValues(lambda x : norm_max([row for row in x])).toDF(['index', 'norm_max'])
normalized_tfidf_per_instance = normalized_tfidf.flatMap(lambda x : flatten_tfidfs(x.normalized_tfidf)).toDF(['index', 'value'])
min_instance_tfidf = normalized_tfidf_per_instance.groupby(['index']).agg({'value' : 'min'})
avg_instance_tfidf = normalized_tfidf_per_instance.groupby(['index']).agg({'value' : 'avg'})
max_instance_tfidf = normalized_tfidf_per_instance.groupby(['index']).agg({'value' : 'max'})
count_instance_tfidf = normalized_tfidf_per_instance.groupby(['index']).agg({'value' : 'count'})

In [61]:
builder = gini_coefficients.join(min_instance_tfidf.select(['index', F.col('min(value)').alias('min_tfidf')]), 'index')
builder = builder.join(avg_instance_tfidf.select(['index', F.col('avg(value)').alias('avg_tfidf')]), 'index')
builder = builder.join(max_instance_tfidf.select(['index', F.col('max(value)').alias('max_tfidf')]), 'index')
builder = builder.join(count_instance_tfidf.select(['index', F.col('count(value)').alias('count_docs')]), 'index')
per_word_index_stats = builder.join(indexed, 'index').cache()

In [57]:
per_word_index_stats.where(F.col('word') == 'comcast').take(10)

[Row(index=32, norm_max=0.25742810898080287, min_tfidf=0.00011422164154389107, avg_tfidf=1.1433796711068294, max_tfidf=3.0837341218988197, count_docs=47, word=u'comcast')]

In [62]:
per_word_index_stats.write.saveAsTable('ignite.topic_modeling_per_word', mode = 'overwrite')

In [60]:
per_word_index_stats = sqlContext.sql("select * from ignite.topic_modeling_per_word")

In [58]:
per_word_index_stats.where(F.col('word') == 'noritz').take(1)

[Row(index=60956, norm_max=0.8944271824028374, min_tfidf=0.0009096348640121267, avg_tfidf=2.9343304654924522, max_tfidf=5.8680545077422295, count_docs=3, word=u'noritz')]

In [59]:
normalized_tfidf.rdd.filter(lambda x : x.normalized_tfidf[60956] > 1).take(10)

[Row(sales_acct_id=271403401.0, words=[u'shanghai', u'noritz', u'coltd', u'shanghai', u'noritz', u'company', u'ltd', u'shanghai', u'noritz', u'co', u'ltd.', u'shanghai', u'noritz', u'company', u'ltd'], tf=SparseVector(480948, {9: 2.0, 13: 2.0, 43: 1.0, 68: 1.0, 100: 1.0, 248: 4.0, 60956: 4.0}), idf=SparseVector(480948, {9: 3.2285, 13: 4.0939, 43: 3.2408, 68: 3.9098, 100: 3.4836, 248: 17.3948, 60956: 46.9444}), normalized_tfidf=SparseVector(480948, {9: 0.2018, 13: 0.2559, 43: 0.2026, 68: 0.2444, 100: 0.2177, 248: 1.0872, 60956: 2.934}), true_idf=SparseVector(7, {9: 1.6143, 13: 2.0469, 43: 3.2408, 68: 3.9098, 100: 3.4836, 248: 4.3487, 60956: 11.7361})),
 Row(sales_acct_id=276965243.0, words=[u'noritz', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation', u'noritz', u'corporation'], tf=SparseVector(480948, {2: 9.0, 60956: 10.0})