In [102]:
import os 
import random 
random.seed(6806) 
random_state = 6806

import pandas as pd
pd.set_option("display.max_colwidth", None)

from utils import *

In [103]:
csv_files = [f"data/{file}" for file in os.listdir("data") if file.endswith(".csv")]

In [104]:
csv_files

['data/luxury_beauty.csv',
 'data/software.csv',
 'data/arts_crafts_and_sewing.csv',
 'data/prime_pantry.csv',
 'data/industrial_and_scientific.csv',
 'data/gift_cards.csv',
 'data/all_beauty.csv',
 'data/magazine_subscriptions.csv',
 'data/digital_music.csv',
 'data/appliances.csv',
 'data/musical_instruments.csv',
 'data/amazon_fashion.csv']

In [105]:
category_to_df = {} 

for path in csv_files:
    df = pd.read_csv(path, index_col=[0])
    original_length = len(df)

    df = df.drop_duplicates()
    df = df[df["overall"] != 3] # Exclude neutral/undetermined reviews
    df = df[(df["reviewText"].str.split().str.len() > 0) & (df["reviewText"].str.split().str.len() <= 250)] # Exclude reviews that are too long

    category = path.split("/")[-1][:-4]
    category_to_df[category] = df 
    
    print(category, original_length, len(df))

luxury_beauty 34278 19709
software 12805 7206
arts_crafts_and_sewing 494485 327780
prime_pantry 137788 92019
industrial_and_scientific 77071 53450
gift_cards 2972 1922
all_beauty 5269 1262
magazine_subscriptions 2375 1627
digital_music 169781 87916
appliances 2277 131
musical_instruments 231392 168305
amazon_fashion 3176 390


In [106]:
sum([len(df) for df in category_to_df.values()])

761717

In [107]:
for category, df in category_to_df.items():
    if len(df) > 30000:
        df = df.sample(30000, random_state=random_state)
        category_to_df[category] = df
        
    print(f"\033[1m{category}\033[0m, length={len(df)}")
    display(df.sample(5, random_state=random_state))
    print()

[1mluxury_beauty[0m, length=19709


Unnamed: 0,reviewText,overall
31991,"Pros:\nGoes on easy - especially on slightly damp skin\nIs all natural\nMy skin seems to love it - especially on the back of my hands - REALLY!\n\nCons\nThe scent - this has a very strong sort of citrus-y scent that really lasts. I can't find that it's got any added scent on the packaging so it must be the combo of some of the oils.\nThe ridiculous price\n\nThoughts: I will report back when I'm done with the bottle, I keep it in the refrigerator along with my vitamin C serum. If this last 6 months, it might be worth the price. I like this better than Argon oil on my face, waaaay better. If you don't balk at $72 for a half ounce of product, than buy this right now and don't worry about how long it lasts.",5.0
15877,"I have very sensitive skin, and I am happy to report that this eye treatment did not irritate my skin at all. It is not a miracle product, and it did not make me look 20 years younger overnight, but I do see some improvement. My skin around eyes looks smoother and more even.\nI do not like the applicator, the pump works inconsistently - sometimes I need to push on it several times, and then too much product comes out, but sometimes it works just fine. You can use the round tip to apply it around your eyes, but I like it better to use my finger.\nIn about 2-3 seconds after application, the area starts to feel heat, and it lasts for a few minutes, then it feels cold for a few more minutes. It is quite puzzling, but I kind of started to like it. It does not make eye area red and just feels warm and then cool. It absorbs very well, and does not feel greasy or sticky afterwards.",4.0
26821,"I tried Dermablend some years ago and didn't like it because it looked really cakey and thick. Things have changed! This foundation goes on smoothly, doesn't cake around fine lines and wrinkles, and smooths out skin tone very nicely.\n\nYou can apply this foundation with a sponge or with your fingertips (as I do) and it goes on smoothly. after waiting a few minutes, it's a good thing to set this foundation with a light brush of powder. The coverage is very good and this foundation stays in place for about 8 hours before it needs touching up. It looks quite natural in daylight and office lighting, I think.\n\nI have very sensitive skin and this product did not cause breakouts or irritation--it's a very clean foundation with no oils to clog pores. One thing: the color runs a bit darker than expected, so I would recommend purchasing a shade lighter than you might ordinarily buy.\n\nAll in all, I'm very happy with this product.",5.0
22550,"It's from Jack Black -- of course, it is great!!",5.0
24301,"My wife's favorite moisturizer:\nGreat heavy-duty moisturizer that is highly concentrated. Its consistency is thick and almost pastey; you only need a little bit to get the job done. It has a really nice aroma, but is not perfumeylike many other moisturizers. You won't be smelling it all day long. You will LOVE how your skin feels after this sits on your skin. I like to use it at night and use the Hydrate Facial Moisturizer during the day. Both great products, but this one has an extra kick if you have drier skin. Highly recommended!!!",5.0



[1msoftware[0m, length=7206


Unnamed: 0,reviewText,overall
1992,"I have found that after using ""Turbo-Tax"" That they really have a flaw in their calculations! Let me say one more time! ""They screwed me up""!\n\nThey allowed me to change a iteration, Without a problem! This caused me to pay a fine for for failure to pay the Damn-Tax!\n\nI will be contacting ""Intuit"" for their total dis regard of support the customer!",2.0
12463,"Great software, including MS Word!",5.0
6682,"First, unlike many reviewers here, I am new to Quickbooks. I do have a business, and have tried some cheaper software (Peachtree, for one) without much success. Since my business is small, I could get by with a paper ledger, check register and Word--but it wasn't pretty and the record keeping and invoicing was clumsy and really inefficient\n\nSo, for me, as a small business owner with a currently unsatisfactory accounting system and no previous Quickbooks experience, this was really great. Looking at the home page, it's pretty clear how well organized the categories are. The tutorial was also helpful. Best of all, for me, were two things: (1) how much time it saved to have vendors' accounts entered ONCE and then access them for invoices, etc without retyping/formatting; (2) how well organized and accurate my records got with very little effort. And, (3) it all looked professional--a big difference from my Staples invoice forms or Word documents.\n\nI'm no Quickbooks expert, but for me, it was useful and improved my business organization, presentation and performance exponentially.",4.0
6355,"I have used Norton anti-virus and security products ever since I first became involved with computers in the late 1990s. For the most part, Norton products have served me well over the years, aside from a glitch or two along the way. I have installed this version on three machines, one running Windows 7, one running Windows Vista, and the third equipped with Windows XP. The installation on the old XP machine gave me the most trouble. I had to stop the installation, restart the machine -- which had frozen -- and then finish the installation. Everything went smoothly on the two newer operating systems, and I have been pleased with the performance of this product since installation.",5.0
10113,good product I have used it for 7 years,5.0



[1marts_crafts_and_sewing[0m, length=30000


Unnamed: 0,reviewText,overall
361797,Good price for these hard to find cutting pads for the newest Big Shot.,5.0
164932,"I must admit I was a little disappointed when these arrived and they did not stick. I had to apply more glue to the back to make them stay flat and because they are so fragile, many broke in the process.\nWould not recommend this to anyone living in a humid climate. They don't work.",1.0
20329,"I love this sewing machine. It was so easy to set up right out of the box and the moment I set it up, I was able to start sewing right away. It has tons of cute stitches and you can sit it right on top of your table and work from there",5.0
370154,great! Looking forward to my project.,5.0
5988,It's beautiful! Can't wait to get started on it. It has to get in line behind all my other ones.. ^_^,5.0



[1mprime_pantry[0m, length=30000


Unnamed: 0,reviewText,overall
30066,"Great Toothpaste !!\nAlways buy fluoride free !! For more info google ""the fluoride deception""",5.0
105703,"Yummy. Great for long bike rides as a quick, energy snack.",5.0
75493,Good quick side,5.0
81024,Do yourself a favor and get this coffee. The Brooklyn Bean company has some of the best coffee. I love this single cup for Keurig. Nice having such a variety.,5.0
106560,My new favorite granola... Very nice with yogurt!,5.0



[1mindustrial_and_scientific[0m, length=30000


Unnamed: 0,reviewText,overall
669,"works great, priced right",5.0
63259,Works so well on my Printrbot I never have problems printing with Hatchbox,5.0
6360,"Working on circuits first time in over a decade, so this has been very useful, testing circuits before hooking them up -- works well.",5.0
46307,Looks bigger in pictures. Kinda tiny but i can still get good use out of it,5.0
1869,I used this for my table saw (Rigid Hybrid). I love it because I can use this and my adapter to hook up my shopvac. Great and solid piece.,5.0



[1mgift_cards[0m, length=1922


Unnamed: 0,reviewText,overall
1759,"Great to purchase the card without having to ""shop"".",5.0
109,Always have a few on hand for last minute birthday gifts,5.0
1883,A+++,5.0
1128,"yum,yum",5.0
430,"Bought several for my pre-teen granddaughter who is enjoying shopping for her new clothes. Put the cards in a darling ""change-style"" purse that she can continue to use.",5.0



[1mall_beauty[0m, length=1262


Unnamed: 0,reviewText,overall
645,Fine,5.0
4724,"I bought this as I was really motivated by the commercials for this ""Optic White"" mouthwash. I was a victim of the promise. Maybe it does work, but seriously, this is NASTY stuff. NASTY. I would force myself to use it (white teeth=life's problems solved!), but then end up spitting it out short of ten seconds. I hate that I bought a four pack and am stuck with two unopened bottles.\n\nI'm not a wimp with flavors, but this is gross. Hate.",1.0
552,"For some reason, nowhere near me sells this. Amazon fixed this problem, and I'm now a happy fox.",5.0
220,what can I say my favorite hair product at a great price. I had to buy it!,5.0
193,All of the different colognes and the price.,5.0



[1mmagazine_subscriptions[0m, length=1627


Unnamed: 0,reviewText,overall
1912,"I am giving Esquire four stars because it is so cheap, and occasionally features well written award winning articles. In general, each issue is filled with ads that serve to subsidize the cost of the magazine to subscribers. Which is why I am getting this magazine at less than $0.40 an issue, which doesn't even cover postage. I subscribe to Esquire for the occasional article that is insightful and illuminating. You get what you pay for.\n\nPros\n- Cheap\n- Literary Tradition\n- Occasionally features well written articles\n\nCons\n- Full of Ads\n- Most articles are boring",4.0
1608,Great for the money,5.0
693,"I just started to subscribe to Writer's Digest again after a few years of letting my subscription lapse. I'm glad I did. Although there are a lot of advertisements and some of the ads are for some pretty bad vanity presses, I understand that magazines need ad revenue to stay in business. I get it.\n\nThe magazine is really targeted towards traditional publishing, but even self-published authors (like me) can learn something here. And some of the interviews are really great, even if just to gape at how some lucky authors get to live.\n\nThe first issue I received was the August 2009 issue, and it had a pretty good ""Publishing Survival Guide"" in it. The interview was with Anne Tyler, and that alone was worth the price of the magazine. There's also a good interview with Rick Steves (I'm a huge fan). Interestingly, though, they decided to put Anne Tyler on the cover instead of Rick Steves. I thought that Rick would have been a better cover than Anne Tyler from a marketing standpoint.\n\nAnyway, the ads are a bother but the magazine is still worth it.\n\nRecommended.",4.0
2226,"This review is on the Family Circle magazine and not the Amazon subscription program.\n\nFamily Circle is an all around good magazine. It offers well written articles, doable recipes and easy to read stories. Yes, it's chalk full of advertising, but in this day and age, anybody not expecting that ought to look elsewhere for entertainment.\n\nFamily Circle is guaranteed not to offend anyone. It is mainstream America.",4.0
138,LOVE THIS! LOVE THIS! LOVE THIS!,5.0



[1mdigital_music[0m, length=30000


Unnamed: 0,reviewText,overall
131945,"This song begins with some great guitar chords and driving drums (that remind me a bit of The Who, although just briefly). The vocal comes in and, in typical Cars fashion, seems a bit in the background. I am not a super fan of this vocalist but he can drop into a very nice Bowie-like baritone from time to time that gives a nice contrast to his more alto default style. There is some nice synthesizer throughout with some contrasting drum rhythms here and there with a solid bass line. The lyrics are unobtrusive. A classic Cars tune.",5.0
12634,Timeless classic from a music genius.,5.0
81943,"I'm not really a fan, but I like good music. This song shows growth and class. People can be up to date with her personal life all they want."" her and Chris brown"". I really don't give a crap .it's not my business and neither should it be anyone else's business. But I do know this song is awesome, its sound great. Written by Sia Furler together with Benjamin ""Benny Blanco"" Levin, it shows an artistic way of expressing love",5.0
64047,Great group and great song,5.0
77049,Awesome song! ! Very uplifting and encouraging to the soul.,5.0



[1mappliances[0m, length=131


Unnamed: 0,reviewText,overall
1,good item,5.0
2175,"ALL WAS GREAT, NO PROBLEM WORKS FINE.",5.0
2236,Worked fine -easy to install,5.0
2232,Common failure part on Whirlpool built top load washers. This is an easy replacement and the customer was up and running in no time.,5.0
27,"see original remarks, made the mistake of buying it again from same dealer same problems different machine. Save your money !!",1.0



[1mmusical_instruments[0m, length=30000


Unnamed: 0,reviewText,overall
192329,"great quality and the fit is great, good stuff for the money...",5.0
176132,They actually work. Put one on each of my guitars.,5.0
212747,This thing is a tank. And the motion is smoooooth.,5.0
157769,"Great product for a great price. My only complaint is the really strong smell... Whether it's the glue/plastic, or what... it's not very pleasant.",4.0
130594,"Amazing choice for the price.\nThe quality is good ! Good clean, dirty and fx sounds.\nClean channel is very good. Raising the gain level you don't get OD but a nice valve compress simulation. The clean sound is really good and the eq controls give you a good configurations range.\nThe 3 OD channels are great and you can get configurations for an extensive range: from a light blues, a dirty blues, hard rock and havier distortion.\nIt has a good selection of effects and offers good choices of predetermined configuration options. It's OK if you don't expect has the possibility of change the fx's parameters.\nThe option of save each channel's configuration is very useful.\nI don't like the hard-wired force cable\nIn my opinion it is a good choice for a small amp.",5.0



[1mamazon_fashion[0m, length=390


Unnamed: 0,reviewText,overall
167,"Great shoe. I've had Nike's before and have always been pleased with the comfort, performance, and quality. Though I will say this was the first time I ordered a pair online without trying them on first, so I was nervous. But this turned out to be a great choice. Breathable, lightweight but sturdy, and comfortable during all my workouts. The built-in arch support is great and I've had no discomfort after 2 weeks of use. As far as the sole thickness and cushioning goes, I would consider these a 'medium' build (not thin, not overly thick) so if you're into heavy running outdoors then take that into consideration. Everyone is different in their preference for cushioning, but I think for light runs on a treadmill they would be sufficient. Love these, highly recommend!",5.0
310,I've not even had them a month and the bottom is coming off. I'm not happy!,1.0
182,Love these lightweight shoes! Absolute favorite. Always order half a size bigger with nike shoes than with every other shoe in my closet.,5.0
431,"i bought a pair from DSW for 50$ and they are very comfortable but i bought a size 6.5 when I normally where a size 7. and even the 6.5 feels a little roomy. I have wide flat feet btw. im not sure if I will keep them even though i love their look and comfort. I've been having feet problems while taking a gym class and I think they won't give me enough support when running. they're great for wearing at home or gym and doing exercises in one place- extremely light weight and flexible, but i don't know if they'll last long if you use them everywhere else. :( i really like them but i cant just buy 2 different shoes.\nalso, i really like the strings inside the lining on the side of the shoes to adjust and make the shoe fit snug. it makes me feel secure :D",4.0
219,c,5.0





In [108]:
sum([len(df) for df in category_to_df.values()])

182247

In [110]:
source_domains = get_all_source_domains()
sum([len(category_to_df[domain]) for domain in source_domains]) / sum([len(df) for df in category_to_df.values()])

0.4938352894697855