# Trusted user groups - Babies!

## Loading data

In [1]:
# Load the review dump into a DataFrame. `spark` is not created anywhere in
# this notebook -- presumably the SparkSession injected by the PySpark
# kernel; confirm before running on a plain Python kernel.
babies = (spark.read
    .format('json')
    .load('./data/raw_data/reviews_Baby_5.json.gz'))

## Extracting ranking components

In [2]:
# One row per reviewer, with the number of reviews they wrote in a column
# named `count` (the name Spark gives the result of GroupedData.count()).
reviews_per_reviewer = (babies
    .groupBy('reviewerID')
    .count())

In [3]:
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import DoubleType

def get_ratio(helpful):
    """Build the helpfulness ratio of a review as a native Column expression.

    Args:
        helpful: a Column (or column name) holding a two-element array.
            Presumably [helpful votes, total votes] -- confirm against the
            dataset description.

    Returns:
        A Column equal to element 0 divided by (element 1 + 1). The +1 keeps
        the denominator non-zero when element 1 is 0.

    This replaces a Python UDF: the expression is evaluated inside Spark
    instead of shipping every row to a Python worker, and a null `helpful`
    value now produces a null ratio (ignored by `avg`) rather than raising
    inside the lambda and failing the job.
    """
    # The previous UDF also accepted a bare column name, so keep doing so.
    if isinstance(helpful, str):
        helpful = col(helpful)
    # `/` on integral columns yields a double in Spark SQL, which matches the
    # float division the UDF performed.
    return helpful.getItem(0) / (helpful.getItem(1) + 1)


# Average helpfulness ratio per reviewer, stored in a column named `coeff`.
usefulness = (babies
  .select('reviewerID', get_ratio(col('helpful')).alias('coeff'))
  .groupBy('reviewerID')
  .agg(avg(col('coeff')).alias('coeff')))

## Computing rankings

In [4]:
# A reviewer's rank is their average helpfulness coefficient multiplied by
# the number of reviews they wrote. Only reviewers whose rank is strictly
# greater than 1 are kept.
rankings = (
    usefulness
    .join(reviews_per_reviewer, on='reviewerID')
    .select('reviewerID', (col('coeff') * col('count')).alias('rank'))
    .where(col('rank') > 1)
)

In [5]:
# ASIN of the product whose reviews are inspected below.
product = '097293751X'

# Reviews of that product joined to the reviewer rankings. The default
# (inner) join drops reviews whose author is not present in `rankings`.
# Every review column is kept here so that `overall` is still available
# when the reviews are split into good and bad.
product_reviews = (babies
   .filter(col('asin') == product)
   .join(rankings, 'reviewerID'))

# Rank and summary of every ranked reviewer's review of the product.
best_reviewers = product_reviews.select('rank', 'summary')

# Previously the split filtered `best_reviewers` on `overall`, a column that
# had already been projected away; that only worked because Spark's analyzer
# implicitly re-adds missing columns. Filtering before the projection makes
# the dependency explicit and produces the same rows and columns.

# Reviews rated above 3, highest-ranked reviewer first.
good_reviews = (product_reviews
   .filter(col('overall') > 3)
   .select('rank', 'summary')
   .sort('rank', ascending=False))

# Reviews rated 3 or below, highest-ranked reviewer first.
bad_reviews = (product_reviews
   .filter(col('overall') <= 3)
   .select('rank', 'summary')
   .sort('rank', ascending=False))

In [6]:
# Print the positive reviews without truncating long summary strings.
good_reviews.show(truncate=False)

+------------------+---------------------------------------------+
|rank              |summary                                      |
+------------------+---------------------------------------------+
|4.914285714285715 |Best for Tracking!                           |
|2.439598997493734 |Helpful Reminder                             |
|1.9666666666666666|Perfect for the working mom                  |
|1.7999999999999998|Should be required for all new parents!      |
|1.25              |Great for newborns                           |
|1.1666666666666665|Compact and Easy way to record the milestones|
+------------------+---------------------------------------------+



In [7]:
# Print the negative reviews without truncating long summary strings.
bad_reviews.show(truncate=False)

+------------------+----------------------------------------------------------------+
|rank              |summary                                                         |
+------------------+----------------------------------------------------------------+
|6.332768361581921 |Expensive and Somewhat Limited Format                           |
|1.4333333333333331|This is fine ... but I haven't used it.                         |
|1.1666666666666665|Needs clearer AM & PM                                           |
|1.1333333333333333|It's ok, but I liked a regular weekly planner better for a baby.|
+------------------+----------------------------------------------------------------+

