In [1]:
import os 
import random 
random.seed(6806) 
random_state = 6806

import pandas as pd
pd.set_option("display.max_colwidth", None)

from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
csv_files = [f"data/{file}" for file in os.listdir("data") if file.endswith(".csv")]

In [3]:
csv_files

['data/luxury_beauty.csv',
 'data/software.csv',
 'data/arts_crafts_and_sewing.csv',
 'data/prime_pantry.csv',
 'data/industrial_and_scientific.csv',
 'data/gift_cards.csv',
 'data/all_beauty.csv',
 'data/magazine_subscriptions.csv',
 'data/digital_music.csv',
 'data/appliances.csv',
 'data/musical_instruments.csv',
 'data/amazon_fashion.csv']

In [4]:
category_to_df = {} 

for path in csv_files:
    df = pd.read_csv(path, index_col=[0])
    original_length = len(df)

    df = df.drop_duplicates()
    df = df[df["overall"] != 3] # Exclude neutral/undetermined reviews
    df = df[(df["reviewText"].str.split().str.len() > 0) & (df["reviewText"].str.split().str.len() <= 100)] # Exclude reviews that are too long

    category = path.split("/")[-1][:-4]
    category_to_df[category] = df 
    
    print(category, original_length, len(df))

luxury_beauty 34278 13303
software 12805 4515
arts_crafts_and_sewing 494485 309209
prime_pantry 137788 88505
industrial_and_scientific 77071 48230
gift_cards 2972 1891
all_beauty 5269 1097
magazine_subscriptions 2375 1399
digital_music 169781 82875
appliances 2277 123
musical_instruments 231392 146607
amazon_fashion 3176 380


In [6]:
sum([len(df) for df in category_to_df.values()])

698134

In [8]:
for category, df in category_to_df.items():
    if len(df) > 10000:
        df = df.sample(10000, random_state=random_state)
        category_to_df[category] = df
        
    print(f"\033[1m{category}\033[0m, length={len(df)}")
    display(df.sample(5, random_state=random_state))
    print()

[1mluxury_beauty[0m, length=10000


Unnamed: 0,reviewText,overall
21503,Great hair serum! Makes my hair healthy and shiny! Smells great too! This is the new Loma Pearatin so its now called Loma Organics and they changed their packaging to green bottles.,5.0
24250,"The usual high standards of Crabtree and Evelyn show in the Lily of the Valley scented hand cream. The Shea and Macadamia nut butters make my hands feel and look soft. The lovely Lily of the Valley scent is subtle but a hint lingers for several hours. This size tube is perfect for a purse, tote bag, or backpack. I have purchased Crabtree and Evelyn body lotion from the company web site for years, and now it is available at Amazon.",5.0
15553,"It's like a medium brown. Coffee with a little cream. I believe it would look great on any skin tone. It's a good color for thin application, no streaking. I love CND!!",5.0
26777,"This is an excellent foundation, and I love how it includes a strong SPF for summer use. It lasts all day and provides complete coverage and a beautiful canvas on which to apply other makeup. It blends very well, and I think it is the perfect texture. It does not smell like sunscreen (or really anything else, either). The tube is smaller than I was expecting (although I received a full-size product), so it's expensive. Great product, though.",5.0
5558,"I like the precision of this pencil. It actually allows you to make little hair lines that fill in the brow, rather than just a generic color-in effect. The color is perfect. I still prefer Anastasia's, but this is a better price.",4.0



[1msoftware[0m, length=4515


Unnamed: 0,reviewText,overall
8157,The price of the product was nearly trippled since I bought Norton last time. Since 2015 all antivirus prices got increased except cheap McAfee.,5.0
1742,"BLESS Acronis!!!!!!!!!!!!!!!!! Clone software that works and without giving you\na frustration fueled foul mouth. The software saved my hide twice so far by allowing me to clone entire hard drives, OS included. Which allowed me to switch hard drives and continue work on my laptop when an older hard drive decided to quit as I knew it would sooner or later. Then there was the disk with all those video files I use repeatedly. Only days after cloning that disk, the original died.\n\nIf you value your data, get Acronis.",5.0
7159,"WinZip does what it's supposed to, is easy to use, and reasonably priced. It compares well in features and function to StuffIt- These days I use them pretty much interchangeably. StuffIt has a prettier interface though...",5.0
10382,Not bad but not much better,2.0
9107,Good,2.0



[1marts_crafts_and_sewing[0m, length=10000


Unnamed: 0,reviewText,overall
178154,"a convenient package containing 3 colors, perfect for making bracelets, my daughters found them very easy to work with and loved them",4.0
61839,"THese are amazing!! Learn to Needlepoint! So easy, and everything is included!",5.0
27031,Apply it to a blank canvas as a prep or use it in acrylic paint as a tint. Many different uses. Use your imagination!,4.0
461906,Great fabric organizers they work out so well to organize trims and ribbons.,5.0
299595,"I had a little trouble figuring out the wire arm at first (I didn't pull it out hard enough), but now I love this thing. I crochet and use yarn bowls and this winds all different kinds of yarn perfectly for my use. A great decision for my little business.",5.0



[1mprime_pantry[0m, length=10000


Unnamed: 0,reviewText,overall
136999,Kids really liked it.,4.0
129084,"It works as it should. It's mascara, I really cannot gush.",5.0
102299,really great,5.0
112986,This is the only brand I buy anymore. Does.the.job.,5.0
129768,"I thought it would mix with coffee, agave, and a splash of creamer to make a ""Mocha"", but it didn't (lies on the bottom, rather . I will try again, and add it when the concoction starts fo cool. It states clearly that it mixes well with milk, so maybe that means cold only. I'm sure it's a good product.",4.0



[1mindustrial_and_scientific[0m, length=10000


Unnamed: 0,reviewText,overall
26084,These are great for containing moisture in sensitive storage areas. Work great so far!,5.0
51929,Nice.,5.0
45159,"I use these in my tool drawers in my shop to help prevent rust. I like the 50 Pack, I put some in all the drawers for complete coverage.",5.0
12174,Excellent for soldering / de-soldering support !! I love how I can clamp onto a 2 pin / 3 pin component and just let the weight of the tweezers pull the component out as I de-solder the leads !!! High quality and reasonable price !!,5.0
52772,"Correct bed adhesion is challenging on larger parts for obvious reasons. Skookum as frig prints though, really excellent.",5.0



[1mgift_cards[0m, length=1891


Unnamed: 0,reviewText,overall
1378,everyone lived this for christmas better thqn just a plain naked card to give them,5.0
1041,Great gift card to use shopping for affordable cloths at JCPenny. 20% off only available on Amazon. What a great Thanksgiving purchase.,5.0
2154,pizzzza,5.0
2012,"This is the 1st time to get as an email, as I was going to see a movie tonight. It worked just fine. :)",5.0
2591,A birthday present my niece loved.,5.0



[1mall_beauty[0m, length=1097


Unnamed: 0,reviewText,overall
4,"If you ever want to feel pampered by a shampoo this one is the one. It smells like a wonderful perfume and cleans your hair until it shines plus adding a fullness that most other shampoo's don't give you. It is expensive, but worth it!",5.0
4934,favorite soap since it has only a slight scent. It has shea butter in it. It does not leave a soap residue,5.0
753,"I really enjoy this product. They started selling it at our local store so I don't need to order it online any more. I am allergic to many products, so I was happy to find this one.",5.0
81,"This and the matching conditioner saved my scalp. My hair was falling out due to a very dry scalp. Nothing else I tried would cure my issue. My head literally hurt. After one use, my scalp had real relief. My hair looks beautiful after just two washes. I'm definitely going to buy this again. The smell is not great, but you'll look past it to save your scalp. It is one of the only products that is 100% truly organic. No added chemicals. Very gentle and effective. I highly recommend this product.",5.0
781,"I was so happy that I was able to purchase my favorite Bath & Body Works scent that had been discontinued in the store. This was a great price for the three bottles, and if this person/company has more of this scent I am going to buy more!",5.0



[1mmagazine_subscriptions[0m, length=1399


Unnamed: 0,reviewText,overall
1628,"This is my favorite 'family' magazine of all times. Infor and articles on houses, cleaning, recipes, family activities, family health, organizing, different towns/cities, vacation ideas, etc. It is always fresh, family appropriate, with lots of find more info links too if something catches your interest.\n\nThe best all around family magazine in my opinion!",5.0
10,I love glamour mag. I have read it for over 25 years. The articles are most always good and some of the clothes are beautiful.,5.0
1870,"I like this magazine. The recipes are seasonal which makes keeping an interesting menu plan easy. Most of the recipes are accessible and don't require a ton of special or pricey ingredients. My only complaint is that the focus is on eating ""light"" which isn't always eating ""healthy.""",4.0
956,This is a great magazine to get. Lots of health tips on a variety of subjects and its smaller size I can stick it in my purse\nwhen I anticipate having to wait somewhere Always learn something in each issue,5.0
1674,Enjoy reading their human interest stories.,5.0



[1mdigital_music[0m, length=10000


Unnamed: 0,reviewText,overall
23673,he sings those notes like he is playing a killer lead guitar,5.0
123640,"love it, one of their best songs beautifully sung.",4.0
116392,I like the feel of the song. Anfeemlyrics not too crazy about.,5.0
111934,I love this song! It's so upbeat and the perfect addition to any summer road trip mix.,5.0
100944,"Wow! Wonderful, beautiful, 5 stars all the way.",5.0



[1mappliances[0m, length=123


Unnamed: 0,reviewText,overall
2241,First time I used this brand. But will buy it again. Worked just like all the others I have bought in the past,5.0
2234,Filter works just like the more expensive filters,5.0
14,Great product,5.0
15,Did the job for fixing our Maytag dryer,5.0
21,"Since the motor on my humidifier still works, all I had to do was run vinegar through it twice and replace the filter. It's operating good as new, at least until it doesn't.\n\nThe filter is important as it reduces the really fine white dust just as advertised. There's no way to look at the old used filter to know it should be replaced, but there's no way one of these can be used for two seasons.",5.0



[1mmusical_instruments[0m, length=10000


Unnamed: 0,reviewText,overall
68029,"great practice pad for new drummers, could make a better pad, I love it, I can play on this anywhere...",5.0
143083,"For a musician on a budget, this bag met all of my requirements: backpack straps, protective padding, and plenty of storage compartments. I'm able to carry everything I need along with the guitar: picks, accessories, cord, music, and music stand. Although the pocket that is intended to hold music (papers, a folder, etc.) is a bit small, it gets the job done. And the backpack straps have always been my favorite feature when it comes to cases, as I can have two free hands when transporting my guitars. Great product for the price.",5.0
59424,"Great guitar strap. Hooks on well. Great value! Does what it needs to, it is simple. I use it for my guitar and play as a hobby. Great for hobbyists and performers alike!",5.0
34418,Unfortunately it fits so perfect that only this guitar will fit but oh well thats all I need.,5.0
130754,"I bought 2 of these to fool around with my electric guitar, but I wouldn't recommend 'em. guess sometimes you simply get what you paid for...",1.0



[1mamazon_fashion[0m, length=380


Unnamed: 0,reviewText,overall
392,"Good quality, perfect fit. It's a Nike!",5.0
468,Love these! I have 3 pairs...they're so comfortable - I'm on my feet all day & my feet actually don't hurt at the end of the day when I wear these :)\nI've learned to get these 1/2 size smaller....I'm normally a size 7 and the 6 1/2 fit perfectly,5.0
383,Good fit and comfy. Not so cushiony so definetly better for training and not for running or aerobics. Bought size 8 which is my usual size.,5.0
183,My knees were bothering me during HIIT workouts so i purchased these shoes after researching many styles. Fit true to size and are lightweight. My knees feel better so so hard so good. They are cute in too!!,5.0
421,There very nice,5.0





In [9]:
sum([len(df) for df in category_to_df.values()])

69405

In [10]:
source_domains = get_all_source_domains()
sum([len(category_to_df[domain]) for domain in source_domains]) / sum([len(df) for df in category_to_df.values()])

0.4322455154527772