In [25]:
import os 
import random 
random.seed(6806) 
random_state = 6806

import pandas as pd
pd.set_option("display.max_colwidth", None)

from utils import *

In [26]:
csv_files = [f"data/{file}" for file in os.listdir("data") if file.endswith(".csv")]

In [27]:
csv_files

['data/luxury_beauty.csv',
 'data/software.csv',
 'data/arts_crafts_and_sewing.csv',
 'data/prime_pantry.csv',
 'data/industrial_and_scientific.csv',
 'data/gift_cards.csv',
 'data/all_beauty.csv',
 'data/magazine_subscriptions.csv',
 'data/digital_music.csv',
 'data/appliances.csv',
 'data/musical_instruments.csv',
 'data/amazon_fashion.csv']

In [28]:
category_to_df = {} 

for path in csv_files:
    df = pd.read_csv(path, index_col=[0])
    original_length = len(df)

    df = df.drop_duplicates()
    df = df[(df["reviewText"].str.split().str.len() > 0) & (df["reviewText"].str.split().str.len() <= 100)] # Exclude reviews that are too long

    category = path.split("/")[-1][:-4]
    category_to_df[category] = df 
    
    print(category, original_length, len(df))

luxury_beauty 34278 14910
software 12805 4996
arts_crafts_and_sewing 494485 330242
prime_pantry 137788 96007
industrial_and_scientific 77071 51382
gift_cards 2972 1920
all_beauty 5269 1128
magazine_subscriptions 2375 1542
digital_music 169781 86578
appliances 2277 127
musical_instruments 231392 157809
amazon_fashion 3176 426


In [29]:
sum([len(df) for df in category_to_df.values()])

747067

In [30]:
for category, df in category_to_df.items():
    if len(df) > 10000:
        df = df.sample(10000, random_state=random_state)
        category_to_df[category] = df
        
    print(f"\033[1m{category}\033[0m, length={len(df)}")
    display(df.sample(5, random_state=random_state))
    print()

[1mluxury_beauty[0m, length=10000


Unnamed: 0,reviewText,overall
4769,Expensive & I think it only works with the Acne Gel (also by PCA) but the two together do a good job at keeping my skin clear.,5.0
26558,Great product for my skin with acne.,5.0
15248,"In my personal opinion, color is good to be used as base for another one. Didn't like it using it alone-very solid color, no glossy or pearl like finish.",2.0
6881,Best moisturizer I have ever used. Skin looks softer and smoother and younger and brighter :-),5.0
842,"This foundation is a very good match to my complexion. I am very pale, and the color doesn't stand out. I need a light coverage foundation, and this is light and makes my skin look dewy and glowing. A very nice product, even for older skin.",5.0



[1msoftware[0m, length=4996


Unnamed: 0,reviewText,overall
12542,Found the firewall either blocks too much or does not block enough. Antivirus on a mac is over kill and it slowed my system down.,1.0
9008,"Office 365 is what I've been using for my email and documents for over a year now. It's great to have access to all of my files, with the ability to edit and share them, from wherever I go. Getting a subscription card just makes it that much easier, so you don't have to worry about reoccurring payments - it's a great gift for a college student!",5.0
10922,"Very easy to install; no computer skills required.\n\nMcAfee is my preferred choice of protection. I've never had issues\n\nOn my daughter's laptop, I was able to install parental controls. On my desktop and laptop, it was easy to skip the installation of parental controls.\n\nHighly recommend!",5.0
6435,Windows 7 still the best Operating System Microsoft has released blows windows 8 out of the water and loved the price.,5.0
350,"I don't have this exact model, I have Zoom's Model #3048, which is nearly identical. I've had it for just over five months, (got it at Staples), and it's a terrific modem. Very reliable and stable. If you're in the market for an external modem, I highly recommend this one.",5.0



[1marts_crafts_and_sewing[0m, length=10000


Unnamed: 0,reviewText,overall
222695,"These work so much nicer than pins in some cases ... when working with Vinyl you can't use pins as holes will be left behind , this eliminates that problem ...",5.0
413917,"Very pleased with this purchase. Great value for a full set of hooks with a few smaller ones you normally don't find. All felt smooth and I did try a few before writing this review. They were fine to work with and had my preferred ""Boye-type"" hook shape rather than the Susan Bates. Sizes were marked in mm and they came in a ziplock bag. Great set and would purchase again if the need arose.",5.0
164061,always love ab,5.0
359681,LOVE THEM NOW I AM NOT STABBING MYSELF WITH A METAL NEEDLE!,5.0
200313,The product came on time and the colors are very vivid. I'm gonna order some more. At the price it is a good deal. No breakage or tangling or anything. Love it.,5.0



[1mprime_pantry[0m, length=10000


Unnamed: 0,reviewText,overall
89790,My son doesn't like minty toothpaste so I was excited to try this flavor. HE LOVES IT! He has an expander and braces so good dental hygiene is very important now more than ever. I am very pleased we found this flavor!,5.0
23356,I wanted these for a healthier snack but I don't like them at all. The flavor isn't too bad but the texture is terrible. I know they aren't claiming to be gummies but that was closer to my expectation for the texture. I was disappointed.,2.0
46538,A must have for all bathrooms especially when regular toilet paper just won't do. You can use it to wipe off make-up too. Refillable. I keep one in each bathroom.,5.0
51266,I buy this for tuna salad sandwiches...it's already in bits.,4.0
105718,Favorite of all the Folgers. Too bad they they decreased the amt and increased the cost... Have to wait for sale to buy again,5.0



[1mindustrial_and_scientific[0m, length=10000


Unnamed: 0,reviewText,overall
68294,Exactly as Described. Fast Shipping. Good Price. Really Easy to use. Good Adhesion. No Cuts on hands. Seems Durable. Couldn't be Happier. Thank you!,5.0
8506,"Very solid. I have not yet been able to put it to use, but it appears to be very well made.",5.0
20722,"Lets you draw clear and accurate lines on metal. Dries fast enough and comes of easel enough. Has a great appearance, not that this matters in a practical sense. Always wanted something like this as I make steel gate hardware and it is nice to do it in a professional way.",5.0
57260,"great for gasoline fuel lines! I replaced the return line in my car using this hose. Make sure to get hose clamps, preferably stainless steel.\n\nhttp://www.instructables.com/id/Replacing-a-fuel-return-line/",5.0
33514,It arrived but haven't used it yet. Im sure its fine.,5.0



[1mgift_cards[0m, length=1920


Unnamed: 0,reviewText,overall
745,Was a great gift. They loved it.,5.0
741,great card given for xmas.,5.0
1384,great gift and the tin is very nice,5.0
2171,Very nice gift presentation.,5.0
2554,food was ok,4.0



[1mall_beauty[0m, length=1128


Unnamed: 0,reviewText,overall
261,Awesome product,5.0
709,This body wash is so concentrated and fragrant. It lathers beautifully and moisturizes.,5.0
739,This gel is so fragrant and good lather. Very luxurious!,5.0
5267,Ok this eye gel is good stuff.,5.0
361,It's all my son uses and great price and delivery,5.0



[1mmagazine_subscriptions[0m, length=1542


Unnamed: 0,reviewText,overall
626,"Got this one as a free read on the Fire and was very surprised to even see it listed, as I am a print subscriber and I didn't even know there was a Kindle edition. Still haven't been able to find out how/if print subscribers could get the Kindle edition included with their subscription. Mag looks nice on the Fire though.",3.0
1485,I love this magazine. However every since I moved I have not received my magazines and I put in my new address.,5.0
1168,"In an age where ""People"" magazine and ""Enquirer"" are bestsellers, this lovely magazine is a treasure. I actually appreciate the new glossy look. There's no reason why a magazine can't look beautiful and still be a useful product and informative. The gardening articles are always the best but the other material is good too.\n\nThe articles are always good and I usually read the magazine from cover to cover when I get it. A great choice for people who love the natural world.",5.0
450,love it...super cheap and easy to view on my kindle fire hd,5.0
2095,What a fun magazine to receive every month! I love the recipes that are included each month and reading about the TV personalities who prepared them.,5.0



[1mdigital_music[0m, length=10000


Unnamed: 0,reviewText,overall
166885,I picked this song because I liked the sound of the beat.,3.0
93404,"this is a catchy, upbeat song. Love it.",5.0
134794,"Love all of willies songs, this is another favorite of mine and it's nice to be able to pick the ones that are special to me.",5.0
144282,one of the greats from my childhood--associated with lots of good memories and filled with old time worship feelings that touch the soul,5.0
81013,I like a few of Miguel's songs. This is one of them. The quality is good. It was easy to find and purchase. I am a satisfied customer.,5.0



[1mappliances[0m, length=127


Unnamed: 0,reviewText,overall
20,A spare thermofuse. Two years ago my dryer spun but there was no heat. I got a service tech undid the back and changed it in seconds. So I decided for the price I ordered a spare just in case.,4.0
109,Good Job,5.0
14,Great product,5.0
15,Did the job for fixing our Maytag dryer,5.0
21,"Since the motor on my humidifier still works, all I had to do was run vinegar through it twice and replace the filter. It's operating good as new, at least until it doesn't.\n\nThe filter is important as it reduces the really fine white dust just as advertised. There's no way to look at the old used filter to know it should be replaced, but there's no way one of these can be used for two seasons.",5.0



[1mmusical_instruments[0m, length=10000


Unnamed: 0,reviewText,overall
69414,"It's perfect for what I wanted, the bad thing is that it does not have a power button, it also has a small buzz, but that with all the connected instruments disappears.",4.0
81332,must have itim,4.0
28616,"too noisy to use, I wanted to retire my 1978 model for this....Dunlop just can't get this right like my old one.",1.0
192087,Great value! An excellent buy for a small pedal board. Comes with everything you need. Very happy with the quality and all the accessories. So good I bought a second one!,5.0
8204,Great reeds.,5.0



[1mamazon_fashion[0m, length=426


Unnamed: 0,reviewText,overall
352,Great fit so comfortable love them!!!!!,5.0
443,Took other reviewers advice and went up half size and it was too large. I thought this was a more sock-like fit but it had a tongue that came up high on my ankle not at all sock like. Nice looking and very light weight with a rounded square toe that left plenty of room front foot and toes. These just weren't what I was looking for.,3.0
331,Very comfortable sneakers and I also like the way it fits.,4.0
216,She loves them,5.0
334,"I love these shoes my second pair of 40 plus $ shoes and they both have certainly out done the $100 K-swiss that I'm looking to return . The shoes are extremely comfortable, all my clients love them, just wish they came in more colors.",5.0





In [31]:
sum([len(df) for df in category_to_df.values()])

70139

In [32]:
source_domains = get_all_source_domains()
sum([len(category_to_df[domain]) for domain in source_domains]) / sum([len(df) for df in category_to_df.values()])

0.42772209469767175

In [34]:
sum([len(df[df["overall"] <= 3]) for df in category_to_df.values()]) / sum([len(df) for df in category_to_df.values()])

0.13655740743381