In [1]:
import pandas as pd
import numpy as np

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist

import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities

from scipy.stats import entropy

import time
import re

In [2]:
# Obtain a SparkSession configured with master "local" via the session
# builder's getOrCreate().
_session_builder = pyspark.sql.SparkSession.builder
spark = _session_builder.master("local").getOrCreate()

In [3]:
# Read the JSON file into a Spark DataFrame; no explicit schema is passed.
books_df = spark.read.format('json').load('data/goodreads_books_comics_graphic.json')

In [4]:
# Print the DataFrame's columns with their types and nullability as a tree.
books_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_ebook: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- kindle_asin: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- link: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- pub

In [5]:
# Project four columns and collect the result into a pandas DataFrame.
books = (
    books_df
    .select('book_id', 'popular_shelves', 'title', 'description')
    .toPandas()
)

In [15]:
# Show the popular_shelves entry for the row labelled 2.
books['popular_shelves'].loc[2]

[Row(count='493', name='to-read'),
 Row(count='113', name='graphic-novels'),
 Row(count='102', name='comics'),
 Row(count='97', name='marvel'),
 Row(count='36', name='captain-america'),
 Row(count='35', name='graphic-novel'),
 Row(count='32', name='comic-books'),
 Row(count='31', name='currently-reading'),
 Row(count='23', name='superheroes'),
 Row(count='22', name='favorites'),
 Row(count='20', name='marvel-comics'),
 Row(count='20', name='comics-graphic-novels'),
 Row(count='19', name='superhero'),
 Row(count='18', name='comic'),
 Row(count='17', name='fiction'),
 Row(count='12', name='graphic-novels-comics'),
 Row(count='11', name='owned'),
 Row(count='10', name='comics-and-graphic-novels'),
 Row(count='10', name='comics-read'),
 Row(count='7', name='read-in-2014'),
 Row(count='5', name='comics-marvel'),
 Row(count='5', name='read-in-2015'),
 Row(count='5', name='marvel-unlimited'),
 Row(count='5', name='ebook'),
 Row(count='5', name='read-comics'),
 Row(count='5', name='super-hero'