In [2]:
import pandas as pd
import numpy as np

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords

import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities

from scipy.stats import entropy

import time
import re

In [3]:
# Start a Spark session in local mode, or reuse one that is already running.
session_builder = pyspark.sql.SparkSession.builder.master("local")
spark = session_builder.getOrCreate()

In [4]:
# Load both review datasets from JSON into Spark DataFrames; no schema is
# supplied, so Spark infers the columns from the files.
json_reader = spark.read
book_review_df = json_reader.json('data/reviews_Books.json')
comics_df = json_reader.json('data/comic_reviews_wtitle.json')

In [12]:
# Distinct ASINs found in the comics dataset, collected to the driver as a
# plain Python list.
distinct_asin_rows = comics_df.select('asin').dropDuplicates().collect()
comics_asins = [row['asin'] for row in distinct_asin_rows]

In [15]:
# Show the column names and types Spark inferred for the book reviews.
book_review_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [18]:
# Keep only the book reviews whose ASIN is in the comics list, and collect
# the ASIN, review body and summary of each one to the driver.
is_comic_review = col("asin").isin(comics_asins)
comic_review_text = (
    book_review_df
    .select('asin', 'reviewText', 'summary')
    .where(is_comic_review)
    .collect()
)

In [23]:
# Converting individual reviews to dictionary of ASIN with all review text.
#
# - reviewText and summary are nullable in the schema, so missing (None) or
#   empty fields are skipped instead of raising TypeError on `None + str`.
# - Pieces are joined with a single space so the last word of one field is
#   not fused with the first word of the next, which would corrupt
#   tokenization and entity recognition downstream.
# - Parts are accumulated in lists and joined once per ASIN, avoiding
#   repeated string concatenation.
review_parts = {}
for asin, body, summary in comic_review_text:
    review_parts.setdefault(asin, []).extend(
        part for part in (body, summary) if part
    )
review_text = {asin: ' '.join(parts) for asin, parts in review_parts.items()}

In [36]:
# Temporary checkpoint: one row per ASIN with its combined review text,
# written to CSV.
review_text_df = pd.DataFrame(list(review_text.items()), columns=['asin', 'text'])
review_text_df.to_csv('data/review_text.csv')

In [76]:
import spacy
from spacy import displacy
from collections import Counter
from pprint import pprint
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline; calling `nlp(text)` returns a Doc
# whose `.ents` holds the recognized named entities.
nlp = en_core_web_sm.load()

In [82]:
# Run the spaCy pipeline over the combined review text of one comic and list
# every entity it tagged as a person.
doc = nlp(review_text['0316107255'])
# Labels are compared with `==`, not `is`: `is` tests object identity, which
# only matches a string literal when the interpreter happens to intern both
# strings, and it emits a SyntaxWarning on Python 3.8+.
pprint([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'PERSON'])

[("Ronald Reagan's", 'PERSON'),
 ('Calvin', 'PERSON'),
 ('Hobbs', 'PERSON'),
 ('Bill Watterson', 'PERSON'),
 ('Gary Larson', 'PERSON'),
 ("Scott Adams'", 'PERSON'),
 ('Dilbert', 'PERSON'),
 ('Get Fuzzy', 'PERSON'),
 ('Darby Conley', 'PERSON'),
 ('Tatsuya Ishid', 'PERSON'),
 ('Opus', 'PERSON'),
 ("Gary Trudeau's Doonesbury", 'PERSON'),
 ('Breathed', 'PERSON'),
 ('Doonesbury', 'PERSON'),
 ('Trudeau', 'PERSON'),
 ('Hunter S. Thompson', 'PERSON'),
 ('Oliver Wendell Holmes', 'PERSON'),
 ('Douglas Adams', 'PERSON'),
 ('Berke Breathed', 'PERSON'),
 ('Ronald Reagan', 'PERSON'),
 ('Walter Mondale', 'PERSON'),
 ('Michael Jackson', 'PERSON'),
 ('Steve Dallas', 'PERSON'),
 ('Steve Dallas', 'PERSON'),
 ('Steve Dallas', 'PERSON'),
 ('Opus', 'PERSON'),
 ('Bill the Cat', 'PERSON')]
