# Trusted user groups - Amazon Instant Video

## Loading data

In [1]:
# Read the gzipped JSON review dump into a Spark DataFrame.
reviews = (spark
    .read
    .json('./data/raw_data/reviews_Amazon_Instant_Video_5.json.gz'))

## Extracting ranking components

In [4]:
# One row per reviewer with the number of reviews they wrote (column `count`).
reviews_per_reviewer = (reviews
    .groupBy('reviewerID')
    .count())

In [2]:
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import DoubleType

# UDF turning one review's `helpful` value into a usefulness score:
#     helpful[0] / (helpful[1] + 1)
# Presumably `helpful` is a two-element [useful_votes, total_votes] array --
# verify against the dataset schema.
# The +1 keeps the denominator non-zero when helpful[1] is 0.
#
# The argument is indexed explicitly instead of being unpacked in the lambda
# signature: tuple-parameter unpacking (`lambda (a, b): ...`) was removed from
# the language by PEP 3113 and is a SyntaxError on Python 3.
usefulness_ratio = udf(
    lambda helpful: helpful[0] / float(helpful[1] + 1),
    returnType=DoubleType())

# Score every review individually, then average the scores per reviewer.
per_review_usefulness = reviews.select(
    'reviewerID',
    usefulness_ratio(col('helpful')).alias('usefulness'))

usefulness = (per_review_usefulness
    .groupBy('reviewerID')
    .agg(avg(col('usefulness')).alias('usefulness')))

## Computing rankings & visualizing the good and bad reviews from the most trusted users

In [5]:
# A reviewer's rank is their average usefulness multiplied by how many
# reviews they wrote; only reviewers with rank > 1 are kept.
rank_score = (col('usefulness') * col('count')).alias('rank')

rankings = (usefulness
    .join(reviews_per_reviewer, on='reviewerID')
    .select('reviewerID', rank_score)
    .where(col('rank') > 1))

In [13]:
# Product id (`asin`) with the largest number of reviews. The value is read by
# column name rather than by position so it does not depend on column order.
most_reviewed_product = (reviews
    .groupBy('asin')
    .count()
    .sort('count', ascending=False)
    .take(1)[0]['asin'])

# Reviews of that product written by ranked reviewers, with all review columns
# still present so they can be filtered on `overall` below.
ranked_product_reviews = (reviews
    .filter(col('asin') == most_reviewed_product)
    .join(rankings, 'reviewerID'))

best_reviewers = ranked_product_reviews.select('rank', 'summary')

# Split on the star rating *before* projecting down to (rank, summary).
# Filtering the projected frame on `overall` would reference a column that is
# no longer in its schema and only works when Spark's analyzer pulls the
# missing column back in.
good_reviews = (ranked_product_reviews
    .filter(col('overall') > 3)
    .select('rank', 'summary')
    .sort('rank', ascending=False))

bad_reviews = (ranked_product_reviews
    .filter(col('overall') <= 3)
    .select('rank', 'summary')
    .sort('rank', ascending=False))

In [18]:
# Limit in Spark before converting, so only the 10 displayed rows are
# collected to the driver instead of the whole sorted DataFrame.
good_reviews.limit(10).toPandas()

Unnamed: 0,rank,summary
0,21.467122,Heronymous
1,9.129708,Get Filming ~ We Can't Wait!
2,5.085658,Titus Welliver is perfect as Bosch!
3,5.020119,Grown up entertainment
4,4.49986,Looks like a great cop show
5,4.260714,Bosch
6,3.883211,Woah!
7,3.528111,As A Harry Bosch Fan in Michael Connolly Books...
8,3.51278,Best of the new Amazon Originals
9,3.455723,A Total Surprise - Very Well Done


In [19]:
# Limit in Spark before converting, so only the 10 displayed rows are
# collected to the driver instead of the whole sorted DataFrame.
bad_reviews.limit(10).toPandas()

Unnamed: 0,rank,summary
0,4.707372,NO
1,3.416667,Okay
2,3.274638,Liked it but would not look for it
3,3.190476,Very dissapointing
4,2.880392,"THIS BOSCH POWER TOOL""s BATTERY NEEDS CCHARGING"
5,2.766667,"Slow, boring, cliched"
6,2.119048,"Meh, part good, part bad"
7,2.058974,"Liked it , Just way way too many cop shows alr..."
8,2.0,Typical Detective Show
9,1.890756,"The books are compelling, but this series isn't."
