In [1]:
import pandas as pd
import numpy as np

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

import spacy
import string

import re
import gensim
from gensim import corpora

import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

In [4]:
book_review_df = spark.read.json('data/reviews_Books.json')
comics_df = spark.read.json('data/comic_reviews_wtitle.json')

In [12]:
comics_asins = [asin[0] for asin in comics_df.select('asin').dropDuplicates().collect()]

In [15]:
book_review_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [18]:
# Filtering review text for comic books used from all book reviews
comic_review_text  = book_review_df.select(['asin',
                                            'reviewText',
                                            'summary'
                                           ]).filter(col("asin").isin(comics_asins)).collect()

In [23]:
# Converting individual reviews to dictionary of ASIN with all review text.
review_text = {}
for review in comic_review_text:
    if review[0] not in review_text.keys():
        review_text[review[0]] = review[1] + review[2]
    else:
        review_text[review[0]] += review[1] + review[2]

In [2]:
# Temporarily saving to CSV
reviews_df = pd.read_csv('data/review_text.csv', index_col=0)

In [3]:
reviews_df.head()

Unnamed: 0,asin,text
0,316107255,PENGUIN DREAMS AND STRANGER THINGS is an engag...
1,345507460,"Well, I'm sorry there's people who didn't enjo..."
2,345529375,beautiful story with bryan's wonderful and awk...
3,375424148,"The drawings are somtimes almost psychedelic, ..."
4,375424334,"If you like Chris Ware this is a must, the usu..."


In [57]:
# Functions to clean and tokenize text
def replace_punct_and_numbers(text):
    """Remove punctuation from document"""
    punct = [punc for punc in string.punctuation]
    num = list(range(10))
    clean_text = "".join([letter for letter in text if (letter not in punct) and (letter not in num)])
    return clean_text

def remove_ner(text):
    """Remove NER words from text"""
    doc_nlp = nlp(text)
    ner_words = [word.text for word in doc_nlp.ents]
    ner_regex = re.compile("|".join(ner_words))
    doc_ner_clean = ner_regex.sub('', text)
    return doc_ner_clean

def clean(doc):
    """Process text and return tokenized"""
    punct_free = replace_punct_and_numbers(doc)
    doc_ready = nlp(remove_ner(punct_free))
    tokens = [token.lemma_.lower() for token in doc_ready 
                  if (not token.is_stop) and (token.text.strip() != "")]
    return tokens

In [31]:
texts = [clean(doc) for doc in reviews_df['text'].tolist()]

In [33]:
dictionary = corpora.Dictionary(texts)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in texts]

In [33]:
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)

In [63]:
print(ldamodel.print_topics(num_topics=5, num_words=10))

[(0, '0.001*"odst" + 0.000*"calender" + 0.000*"greenyellow" + 0.000*"index" + 0.000*"handbook" + 0.000*"ilver" + 0.000*"kindergartner" + 0.000*"adventures34" + 0.000*"ballet" + 0.000*"dorrie"'), (1, '0.024*"story" + 0.024*"book" + 0.014*"read" + 0.014*"not" + 0.012*"s" + 0.010*"like" + 0.010*"comic" + 0.009*"good" + 0.009*"great" + 0.008*"character"'), (2, '0.020*"story" + 0.017*"s" + 0.014*"not" + 0.012*"character" + 0.011*"book" + 0.011*"good" + 0.010*"like" + 0.008*"issue" + 0.008*"read" + 0.008*"series"'), (3, '0.003*"dale" + 0.003*"album" + 0.002*"lion" + 0.002*"immigrant" + 0.002*"que" + 0.001*"de" + 0.001*"ic" + 0.001*"gunslinger" + 0.001*"katet" + 0.001*"la"'), (4, '0.002*"di" + 0.001*"e" + 0.001*"che" + 0.001*"il" + 0.001*"nd" + 0.001*"tht" + 0.001*"una" + 0.001*"un" + 0.001*"del" + 0.000*"tp"')]


In [45]:
ldamodel.save('data/lda.model')

In [52]:
ldamodel.get_document_topics(doc_term_matrix[1])

[(1, 0.8621955), (2, 0.13542761)]

In [49]:
Lda = gensim.models.ldamodel.LdaModel
ldamodel_3topics = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)

In [64]:
print(ldamodel_3topics.print_topics(num_topics=3, num_words=10))

[(0, '0.022*"story" + 0.018*"book" + 0.014*"not" + 0.014*"s" + 0.011*"read" + 0.010*"good" + 0.010*"like" + 0.010*"character" + 0.008*"great" + 0.007*"series"'), (1, '0.010*"vampire" + 0.002*"dale" + 0.002*"chef" + 0.002*"food" + 0.002*"album" + 0.001*"sushi" + 0.001*"rman" + 0.001*"di" + 0.001*"rgirl" + 0.001*"mandias"'), (2, '0.001*"immigrant" + 0.001*"de" + 0.001*"que" + 0.001*"la" + 0.001*"nd" + 0.001*"en" + 0.001*"los" + 0.000*"el" + 0.000*"threeboote" + 0.000*"es"')]


In [59]:
ldamodel_3topics.get_document_topics(doc_term_matrix[1])

[(0, 0.9998981)]

In [61]:
lsi_model = gensim.models.LsiModel(doc_term_matrix, num_topics=3, id2word=dictionary)

Since the model didn't give me a useful enough combination of words, I am going to try a couple other ways of using the text:
* Leaving in the NER items
* Using just the NER items

In [4]:
doc = nlp(reviews_df.iloc[0]['text'])

In [26]:
def get_adjectives(text):
    doc = nlp(text)
    adj = [token.text for token in doc if token.tag_ == "JJ"]
    return adj

def get_ner(text):
    doc = nlp(text)
    ents = [token.text for token in doc.ents]
    return ents

In [18]:
text_adj = [get_adjectives(text) for text in reviews_df.text.tolist()]

In [27]:
text_ents = [get_ner(text) for text in reviews_df.text.tolist()]

In [21]:
dictionary_adj = corpora.Dictionary(text_adj)
adj_term_matrix = [dictionary_adj.doc2bow(doc) for doc in text_adj]

In [23]:
Lda_adj = gensim.models.ldamodel.LdaModel
ldamodel_adj = Lda_adj(adj_term_matrix, num_topics = 3, id2word = dictionary_adj, passes=50)

In [25]:
ldamodel_adj.print_topics(num_topics=3, num_words=20)

[(0,
  '0.042*"comic" + 0.042*"great" + 0.033*"first" + 0.022*"original" + 0.018*"good" + 0.017*"early" + 0.015*"other" + 0.012*"white" + 0.012*"black" + 0.011*"classic" + 0.011*"many" + 0.010*"Great" + 0.010*"worth" + 0.010*"few" + 0.009*"old" + 0.009*"nice" + 0.008*"same" + 0.007*"Dead" + 0.007*"amazing" + 0.007*"different"'),
 (1,
  '0.037*"good" + 0.032*"great" + 0.032*"new" + 0.022*"first" + 0.022*"other" + 0.015*"comic" + 0.012*"little" + 0.012*"interesting" + 0.012*"bad" + 0.011*"many" + 0.010*"much" + 0.009*"few" + 0.009*"old" + 0.009*"big" + 0.009*"own" + 0.008*"different" + 0.008*"last" + 0.008*"same" + 0.007*"main" + 0.007*"next"'),
 (2,
  '0.035*"graphic" + 0.026*"comic" + 0.017*"great" + 0.016*"good" + 0.015*"first" + 0.014*"other" + 0.012*"many" + 0.011*"Superman" + 0.011*"little" + 0.009*"interesting" + 0.008*"real" + 0.008*"own" + 0.008*"different" + 0.007*"same" + 0.007*"much" + 0.007*"beautiful" + 0.007*"young" + 0.007*"few" + 0.006*"amazing" + 0.006*"dark"')]

In [28]:
dictionary_ent = corpora.Dictionary(text_ents)
ent_term_matrix = [dictionary_ent.doc2bow(doc) for doc in text_ents]

In [33]:
Lda_ent = gensim.models.ldamodel.LdaModel
ldamodel_ent = Lda_ent(ent_term_matrix, num_topics = 2, id2word = dictionary_ent, passes=50)

In [34]:
ldamodel_ent.print_topics(num_topics=2, num_words=25)

[(0,
  '0.048*"Batman" + 0.025*"Superman" + 0.023*"first" + 0.021*"DC" + 0.017*"one" + 0.012*"two" + 0.010*"Joker" + 0.006*"Morrison" + 0.006*"second" + 0.006*"1" + 0.005*"2" + 0.005*"Robin" + 0.005*"Fables" + 0.005*"Wonder Woman" + 0.004*"three" + 0.004*"Earth" + 0.004*"Green Lantern" + 0.003*"Gotham" + 0.003*"One" + 0.003*"JLA" + 0.003*"Aquaman" + 0.003*"3" + 0.003*"Flash" + 0.003*"Geoff Johns" + 0.003*"Bruce"'),
 (1,
  '0.029*"first" + 0.020*"one" + 0.019*"Marvel" + 0.014*"two" + 0.007*"Thor" + 0.007*"Wolverine" + 0.007*"Spider-Man" + 0.007*"second" + 0.006*"1" + 0.006*"2" + 0.006*"Avengers" + 0.005*"three" + 0.004*"Peter" + 0.004*"Captain America" + 0.004*"One" + 0.004*"Iron Man" + 0.004*"Spidey" + 0.004*"3" + 0.004*"Bendis" + 0.004*"Daredevil" + 0.003*"Cap" + 0.003*"5" + 0.003*"Earth" + 0.003*"four" + 0.003*"Spider"')]

In [39]:
ldamodel_ent.get_document_topics(ent_term_matrix[0])

[(0, 0.990969)]