### Import Amazon Luxury Beauty Product Reviews
Import the review and product JSON files, then explore and clean the data (restricting to certain categories - see the 1a file for the initial investigation). Finally, combine each product with its reviews into a single MongoDB document.

In [1]:
## Import from command line
# (metis) Jocelyns-MacBook-Pro:project 4 Jocelyn$ gunzip amazon_luxury_beauty/Luxury_Beauty.json.gz
# (metis) Jocelyns-MacBook-Pro:project 4 Jocelyn$ mongoimport --db amazon_lb --collection product_reviews --file amazon_luxury_beauty/Luxury_Beauty.json

In [1]:
from pymongo import MongoClient
from pprint import pprint

import pandas as pd
import re
import nltk

In [2]:
from bson.objectid import ObjectId

In [3]:
# Connect to the MongoDB server on localhost via an explicit connection string.
# MongoClient() with no arguments would target the same default host and port;
# a remote deployment (e.g. on AWS) needs its own connection string here.
client = MongoClient("mongodb://localhost:27017")

In [4]:
# Handle to the `amazon_lb` database; naming it `db` mirrors the mongo shell.
db = client.amazon_lb

In [5]:
# Show which collections currently exist in the amazon_lb database.
db.list_collection_names()

['skin_care_face_reviews',
 'product_reviews',
 'skin_care_face_rev_agg',
 'skin_care_face_products',
 'skin_care_face_prod_rev',
 'product_metadata']

In [6]:
# Fetch a single review document, excluding its `_id` field from the result.
# NOTE(review): despite the name, find_one returns one document rather than a
# cursor, and `cursor` is rebound to the product metadata list further down.
cursor = db.product_reviews.find_one({}, {'_id':0,})

In [7]:
# Peek at one raw product-metadata document to see which fields it carries.
list(db.product_metadata.find().limit(1))

[{'_id': ObjectId('5dc881769f9b98109203bc99'),
  'description': ["Rich, black mineral mud, harvested from the banks of the Dead Sea, is comprised of layer upon layer of sedimentary clay formed over thousands of years. Captured within is an extremely high concentration of minerals, scientifically proven to be essential in maintaining healthy skin. Ahava Black Mineral Mud works deep to clean, purify and restore the skin's natural moisture balance, leaving it smooth, radiant and revitalized.",
   '',
   ''],
  'title': 'AHAVA Dead Sea Mineral Mud, 8.5 oz, Pack of 4',
  'image': ['https://images-na.ssl-images-amazon.com/images/I/41O1luEZuHL._SX50_SY65_CR,0,0,50,65_.jpg'],
  'rank': '1,806,710inBeautyamp;PersonalCare(',
  'details': '\n  <div class="content">\n\n\n\n\n\n\n\n<ul>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                                                                \n\n\n    <li><b>\n    Product Dimensions: \n    </b>\n    5.1 x 3 x 5.5 inches ; 2.48 pounds\n    </li>\n

In [8]:
# Peek at one raw review document to see which fields it carries.
list(db.product_reviews.find().limit(1))

[{'_id': ObjectId('5dc881afaf3db5220c957b7f'),
  'overall': 5.0,
  'verified': True,
  'reviewTime': '04 18, 2018',
  'reviewerID': 'A2EM03F99X3RJZ',
  'asin': 'B00004U9V2',
  'style': {'Size:': ' 3.5 oz.'},
  'reviewerName': 'Maureen G',
  'reviewText': 'Great hand lotion',
  'summary': 'Five Stars',
  'unixReviewTime': 1524009600}]

### restrict to facial skincare

In [10]:
# Category counts below were computed in the second notebook and pasted here
# for reference; they motivate which level-3 categories get consolidated.
# Cat 2 (level-2 category counts)
# Counter({'Treatments & Masks': 717,
#          'Exfoliators, Polishes & Scrubs': 91,
#          'Creams & Moisturizers': 785,
#          'Toners & Astringents': 132,
#          'Cleansers': 461,
#          'Sets & Kits': 89,
#          'Other': 41})
# Cat 3 (level-3 category counts)
# ['Other': 1088,
#          'Exfoliators': 29,
#          'Masks': 238,
#          'Serums': 257,
#          'Face Moisturizers': 223,
#          'Scrubs': 60,
#          'Gels': 90,
#          'Night Creams': 78,
#          'Acids & Peels': 49,
#          'Washes': 114,
#          'Face Oil': 29,
#          'Bars': 4,
#          'Microdermabrasion': 11,
#          'Tinted Moisturizers': 13,
#          'Face Mists': 3,
#          'Cloths & Towelettes': 13,
#          'Neck & Dcollet': 14,
#          'Pore Cleansing Strips': 2,
#          'Polishes': 1})]

# Low-count level-3 categories that are relabelled as 'Other' when the face
# product collection is built. The strings must match the scraped category text
# exactly (including the mangled 'Neck & Dcollet').
cat3_list = ['Bars','Microdermabrasion','Face Mists','Cloths & Towelettes','Neck & Dcollet',
             'Pore Cleansing Strips','Polishes']

In [43]:
# Load every product-metadata document into memory as a list of dicts.
cursor = list(db.product_metadata.find())

In [44]:
# Rebuild the `skin_care_face_products` collection from the raw product metadata
# held in `cursor`: keep only products whose category breadcrumb contains both
# 'Skin Care' and 'Face', clean their fields, and insert each distinct product
# (by ASIN and by title) exactly once.

CLEANUP_TITLE_MIN_LEN = 400  # titles longer than this are treated as scraped page junk
MAX_TITLE_LEN = 1000         # titles still this long after cleanup are not inserted
RATING_MARKER = 'out of 5 stars'


def _parse_categories(details_html):
    """Return the stripped category breadcrumb from the details HTML ([] if absent)."""
    start = details_html.find('<span class="zg_hrsr_ladder">')
    if start == -1:
        return []
    ladder = details_html[start:]
    end = ladder.find('</span>\n')
    if end != -1:
        ladder = ladder[:end]
    # Drop the tags, then split the breadcrumb on its HTML-escaped '>' separators.
    return [part.strip() for part in re.sub('<[^<]+?>', '', ladder).split('&gt;')]


def _extract_rating(details_html):
    """Return the three characters preceding ' out of 5 stars', or 'Not listed'."""
    idx = details_html.find(RATING_MARKER)
    if idx == -1:
        # Without this guard the negative index would slice junk off the end.
        return 'Not listed'
    return details_html[max(idx - 4, 0):max(idx - 1, 0)]


def _clean_title(title):
    """Strip scraped page fragments out of over-long titles; short titles pass through."""
    if len(title) <= CLEANUP_TITLE_MIN_LEN:
        return title
    # Keep the text before the '=====' divider (whole title if there is none) ...
    head = title.split('=====', 1)[0]
    # ... and, when a '-->' marker is present, only what follows it.
    _, marker, tail = head.partition('-->')
    if marker:
        head = tail
    return re.sub(r'\n|/|Amazon\.com:|: Luxury Beauty', '', head).strip()


def _category_at(categories, level):
    """Return the breadcrumb entry at `level` with '&amp;' unescaped, or 'Not listed'."""
    if level < len(categories):
        return categories[level].replace("&amp;", "&")
    return 'Not listed'


def _build_face_product(raw_product):
    """Return a cleaned copy of a face skin-care product, or None for other products.

    Raises KeyError/TypeError/AttributeError when a required field ('details',
    'title') is missing or is not a string.
    """
    details_html = raw_product['details']
    categories = _parse_categories(details_html)
    if 'Skin Care' not in categories or 'Face' not in categories:
        return None

    # Work on a copy so the documents in `cursor` stay raw and this cell can be
    # re-run without first re-loading the metadata.
    product = dict(raw_product)
    product['title'] = _clean_title(product['title'])
    product['categories'] = categories
    product['rating'] = _extract_rating(details_html)
    product['details'] = re.sub('{[^{]+?}|<[^<]+?>|\n|\t', '', details_html)
    product['category1'] = 'Face'
    # Breadcrumb positions 3 and 4 are used as the level-2 / level-3 categories.
    product['category2'] = _category_at(categories, 3)

    category3 = _category_at(categories, 4)
    if category3 in cat3_list:
        category3 = 'Other'
    elif category3 == 'Tinted Moisturizers':
        category3 = 'Face Moisturizers'
    product['category3'] = category3

    try:
        product['description'] = ' '.join(product['description'])
    except (KeyError, TypeError):
        product['description'] = ''
    return product


# drop_collection is a no-op when the collection does not exist, so no guard is needed.
db.drop_collection('skin_care_face_products')

asin_list = []
title_list = []
seen_asins = set()   # set mirrors of the lists above for O(1) duplicate checks
seen_titles = set()
missed_counter = 0

for raw_product in cursor:
    try:
        product = _build_face_product(raw_product)
        if product is None:
            continue
        asin = product['asin']
        title = product['title']
    except (KeyError, TypeError, AttributeError):
        # Malformed metadata (missing or non-string fields): count it and move on.
        missed_counter += 1
        continue

    # Skip repeats of an ASIN or title already inserted, and unusable titles.
    if asin in seen_asins or title in seen_titles or len(title) >= MAX_TITLE_LEN:
        continue

    # Database errors are deliberately not swallowed here.
    db.skin_care_face_products.insert_one(product)
    asin_list.append(asin)
    title_list.append(title)
    seen_asins.add(asin)
    seen_titles.add(title)

print('missed',missed_counter)
print('asin_list length',len(asin_list))
print('title_list length',len(title_list))

missed 3
asin_list length 2241
title_list length 2241


In [45]:
# Read the filtered face products back from MongoDB and show how many there are.
products_face = list(db.skin_care_face_products.find())
len(products_face)

2241

In [46]:
# Spot-check the title of the first stored product.
products_face[0]['title']

'DERMAdoctor Calm, Cool & Corrected anti-redness tranquility cream - 1.7 Oz'

In [47]:
# Spot-check the level-3 category assigned to one arbitrary product.
products_face[142]['category3']

'Face Moisturizers'

In [48]:
# ASINs of every retained face product, in collection order.
product_id_list = [product['asin'] for product in products_face]

In [49]:
# Spot-check the ASIN list.
# NOTE(review): a cell only displays its last expression, so the len(...) result
# on the next line is computed but never shown.
len(product_id_list)
product_id_list[0:5]

['B0000Y3NO6', 'B00012C5RS', 'B0001EKTTC', 'B0001EL5Q8', 'B0001EL5JA']

### grab reviews for these products

In [50]:
# Rebuild `skin_care_face_reviews`: the subset of all reviews whose ASIN belongs
# to one of the retained face products.
rev_cursor = list(db.product_reviews.find())

# drop_collection is a no-op when the collection does not exist, so no guard is needed.
db.drop_collection('skin_care_face_reviews')

# Set membership keeps the filter linear in the number of reviews.
face_asins = set(product_id_list)

face_reviews = []
for review in rev_cursor:
    if review['asin'] not in face_asins:
        continue
    try:
        len(review['reviewText'])
    except (KeyError, TypeError):
        # Review text is missing or is not a sized value: store a placeholder.
        review['reviewText'] = 'N/A'
    face_reviews.append(review)

# One bulk insert instead of a round trip per review; insert_many rejects an
# empty list, hence the guard.
if face_reviews:
    db.skin_care_face_reviews.insert_many(face_reviews)

In [51]:
# Read the filtered reviews back from MongoDB and show how many there are.
reviews_face = list(db.skin_care_face_reviews.find())
len(reviews_face)

81038

In [52]:
# Spot-check one stored review document.
reviews_face[1]

{'_id': ObjectId('5dc881afaf3db5220c9582ef'),
 'overall': 5.0,
 'vote': '18',
 'verified': True,
 'reviewTime': '03 1, 2007',
 'reviewerID': 'A2DKQ5CLJ2KWM3',
 'asin': 'B0000Y3NO6',
 'reviewerName': 'Jessi Hope',
 'reviewText': "After trying all the drugstore rosacea products to moisturize my skin and calm the redness and itching down, I gave up hope that anything would work. I have moderate rosacea that sometimes becomes severely inflamed across my cheeks, nose, and above my nose. Some products would give me relief from symptoms for a week or so, and then stop working. I tried creams, natural and vitamin supplements, mineral creams, yoga, everything I remotely thought might help. This product is the only thing that has EVER worked, and it works quickly (drastically reduces redness within an hour usually- two hours at my worst inflammation), leaves no greasiness, and doesn't mess with my allergies. It is the most expensive beauty product I've ever bought, so it was risky, but I was des

### create aggregated review collection

In [53]:
# Group the face reviews by product: each result document is keyed by ASIN and
# carries the full review documents plus a count of them.
sc_revs = db.skin_care_face_reviews.aggregate([
    {
        '$group': {
            '_id': "$asin",
            "reviews": {'$push': "$$ROOT"},
            "count": {'$sum': 1},
        },
    },
])

In [54]:
# Materialise the aggregation results; this consumes the `sc_revs` cursor.
rev_groups = list(sc_revs)

In [55]:
# Spot-check the first per-product review group.
rev_groups[0]

{'_id': 'B000YTA4RG',
 'reviews': [{'_id': ObjectId('5dc881b3af3db5220c975db1'),
   'overall': 5.0,
   'vote': '5',
   'verified': False,
   'reviewTime': '05 15, 2011',
   'reviewerID': 'A1ST5O7M8PFNOC',
   'asin': 'B000YTA4RG',
   'reviewerName': 'A. Resch',
   'reviewText': "I love this stuff!!  I noticed a difference in my skin right away.  It does sting a bit when applied but then again beauty can be painful...still the benefits are worth it.  This is my second purchase and will continue to purchase it as long as it's available.  I only wish it wasn't so expensive.  I'm military and have a limited budget.",
   'summary': 'AimHighFlyGirl',
   'unixReviewTime': 1305417600},
  {'_id': ObjectId('5dc881b3af3db5220c975db3'),
   'overall': 5.0,
   'verified': False,
   'reviewTime': '12 15, 2013',
   'reviewerID': 'A2PRIDNZVTY6K3',
   'asin': 'B000YTA4RG',
   'reviewerName': 'S. Pawson',
   'reviewText': "Amazing results.  This product may be a bit expensive, but it's worth every penny. 

In [56]:
# Replace `skin_care_face_rev_agg` with the per-product review groups.
# Check there is something to insert BEFORE dropping, so an empty `rev_groups`
# (e.g. built from an already-consumed cursor) cannot wipe the collection.
if not rev_groups:
    raise ValueError("rev_groups is empty; re-run the $group aggregation cells first")

# drop_collection is a no-op when the collection does not exist, so no guard is needed.
db.drop_collection('skin_care_face_rev_agg')

db.skin_care_face_rev_agg.insert_many(rev_groups)

<pymongo.results.InsertManyResult at 0x1a745e5908>

### add reviews to product dict

In [57]:
# Join every face product to its reviews on ASIN; the matching review documents
# are attached to each product as a `reviews` array.
prod_revs = db.skin_care_face_products.aggregate([
    {'$lookup': {
        'from': "skin_care_face_reviews",
        'localField': "asin",
        'foreignField': "asin",
        'as': "reviews",
    }},
])

In [58]:
# Materialise the product+reviews join and store it as `skin_care_face_prod_rev`.
prod_revs1 = list(prod_revs)

# `prod_revs` is a one-shot cursor: re-running this cell alone yields an empty
# list. Stop before dropping so the existing collection is not destroyed.
if not prod_revs1:
    raise ValueError("prod_revs yielded no documents; re-run the $lookup aggregation cell first")

# drop_collection is a no-op when the collection does not exist, so no guard is needed.
db.drop_collection('skin_care_face_prod_rev')

db.skin_care_face_prod_rev.insert_many(prod_revs1)

<pymongo.results.InsertManyResult at 0x1a81fb86c8>

In [59]:
# Spot-check one combined product-with-reviews document.
prod_revs1[2]

{'_id': ObjectId('5dc881769f9b98109203bcb9'),
 'description': 'Promotes a supple texture and bright tone Glytone, transforming solutions for beautiful and healthy looking skin',
 'title': 'Glytone Rejuvenating Mask, 3 oz.',
 'also_buy': ['B002D48QIE',
  'B002D48QI4',
  'B001MTYR8Y',
  'B000AJ23EC',
  'B002D48QRA',
  'B0176U0IM6',
  'B003H8EYUE',
  'B002D48QNE',
  'B004KIB688',
  'B01IG2KLZC',
  'B0037O5920',
  'B002D48QIY',
  'B002D48QR0',
  'B01IG8K4L2',
  'B01JH4ACB6',
  'B002D48QJ8'],
 'image': ['https://images-na.ssl-images-amazon.com/images/I/411mybce8lL._SX50_SY65_CR,0,0,50,65_.jpg',
  'https://images-na.ssl-images-amazon.com/images/I/31wL2rHBRvL._SX50_SY65_CR,0,0,50,65_.jpg'],
 'rank': '96,424inBeautyPersonalCare(',
 'also_view': ['B002D48QJ8',
  'B00LPK7UY8',
  'B000AJ23EC',
  'B002D48QR0',
  'B01IG2KLZC',
  'B002D48QRA',
  'B07L8W53DT',
  'B002D48QRU',
  'B07JLZHPRR',
  'B004L99UEI',
  'B01DV9E38G',
  'B002D48QIY',
  'B0176U0IM6',
  'B009150ZNW',
  'B0037O5920',
  'B003H8EYUE'

In [60]:
# Confirm the derived skin_care_face_* collections are present in the database.
db.list_collection_names()

['product_reviews',
 'skin_care_face_rev_agg',
 'product_metadata',
 'skin_care_face_reviews',
 'skin_care_face_prod_rev',
 'skin_care_face_products']