In [1]:
from pyspark.sql.functions import lit

# Load the three review tables, tagging every row with the category it came from
df1 = spark.sql("select * from default.video_games_5").withColumn(
    'category', lit("video_games")
)
df2 = spark.sql("select * from default.home_and_kitchen_5_small").withColumn(
    'category', lit("home_and_kitchen")
)
df3 = spark.sql("select * from default.books_5_small").withColumn(
    'category', lit("books")
)

# Stack the three tables into one DataFrame.
# NOTE(review): union() matches columns by position, not by name, so this
# assumes all three tables share the same column order -- confirm.
df = df1.union(df2)
df = df.union(df3)

# Take a 15% sample without replacement (useful for code development purposes).
# The fixed seed keeps the sample repeatable; the result is cached because the
# cells below query it repeatedly.
df = df.sample(withReplacement=False, fraction=0.15, seed=0).cache()

# (row count, column count) of the sampled data
print((df.count(), len(df.columns)))

In [2]:
# Print the column names and data types of the sampled DataFrame
df.printSchema()

In [3]:
# Let's look at some quick summary statistics for the columns of the sample
# (describe() builds the statistics table, show() prints it as text)
df.describe().show()

In [4]:
# Number of sampled rows per category, ordered by category name
display(
    df.groupBy("category")
      .count()
      .orderBy("category")
)

category,count
books,149471
home_and_kitchen,300223
video_games,73082


In [5]:
from pyspark.sql.functions import col
# Number of sampled rows per value of the `overall` column, ordered by that value
display(
    df.groupBy("overall")
      .count()
      .orderBy("overall")
)

overall,count
1.0,26813
2.0,22995
3.0,43537
4.0,91646
5.0,337785


In [6]:
# The 50 most common product IDs (`asin`), most frequent first.
# head(50) returns the top rows to the driver as a list before display.
display(
    df.groupBy("asin")
      .count()
      .orderBy(col("count").desc())
      .head(50)
)

asin,count
0007420412,2919
000711835X,2847
0007548672,2345
0007350899,1172
0007444117,1164
B000YGEVMI,1029
0007378033,930
B0015TMHSI,892
006017322X,780
0007350783,733


In [7]:
# Number of sampled rows per value of the `label` column, ordered by label
display(df.groupBy("label").count().orderBy("label"))

label,count
0,428479
1,94297


In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import functions as f

# We'll tokenize the text using a simple RegexTokenizer: "reviewText" is split
# on non-word characters (pattern \W) and the tokens go into the "words" column
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")

# Remove standard stopwords from "words", writing the result to "filtered"
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

pipeline = Pipeline(stages=[tokenizer, stopwordsRemover])

# Make the cell safe to re-run: `df` is rebound to the transformed frame below,
# and transform() raises if an output column already exists. drop() ignores
# names that are not present, so the first run behaves exactly as before.
df = df.drop("words", "filtered")

pipelineFit = pipeline.fit(df)
df = pipelineFit.transform(df)

In [9]:
# Token frequencies over the stopword-filtered reviews, most frequent first.
# explode() yields one row per token; collect() then brings every
# (token, count) row back to the driver as a list.
counts = (
    df.select(f.explode('filtered').alias('col'))
      .groupBy('col')
      .count()
      .sort(f.desc('count'))
      .collect()
)
display(counts)

col,count
one,187045
book,184569
game,152816
great,146841
like,144142
good,118181
well,104851
read,99594
time,92205
get,89471
