### Organize imports

In [1]:
import json
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import random
import re
import string

### Set parameters

In [2]:
# Maximum number of rows kept from each helpfulness class when the dataset
# is pruned (good / bad / neither).
num_good = 50000
num_bad = 100000
num_neutral = 50000
# Input file; read_dataset() parses it one JSON object per line.
dataset_src_fn = 'data/amazon_dataset_deskewed.json'
# Output path for the normalized frame. Note the file is written with '|' as
# the field separator despite the .csv extension.
final_df_name = 'data/full_cols_bad.csv'

### Load full dataset

In [3]:
# Product category name -> integer id. Ids are assigned by position in the
# list below, so the order must not change.
category_map = dict(
    (name, idx) for idx, name in enumerate([
        "Amazon_Instant_Video",         # 0
        "Apps_for_Android",             # 1
        "Automotive",                   # 2
        "Baby",                         # 3
        "Beauty",                       # 4
        "Books",                        # 5
        "CDs_and_Vinyl",                # 6
        "Cell_Phones_and_Accessories",  # 7
        "Clothing_Shoes_and_Jewelry",   # 8
        "Digital_Music",                # 9
        "Electronics",                  # 10
        "Grocery_and_Gourmet_Food",     # 11
        "Health_and_Personal_Care",     # 12
        "Home_and_Kitchen",             # 13
        "Kindle_Store",                 # 14
        "Movies_and_TV",                # 15
        "Musical_Instruments",          # 16
        "Office_Products",              # 17
        "Patio_Lawn_and_Garden",        # 18
        "Pet_Supplies",                 # 19
        "Sports_and_Outdoors",          # 20
        "Tools_and_Home_Improvement",   # 21
        "Toys_and_Games",               # 22
        "Video_Games",                  # 23
    ])
)

# Column order of every dataframe built in this notebook.
col_names = [
    "reviewer_id",
    "asin",
    "review_text",
    "overall",
    "category",
    "good",
    "bad",
]

def read_dataset(fn):
    """Load a JSON-lines review file into a DataFrame.

    Each non-blank line of `fn` is parsed as one JSON object. The fields
    read are "helpful", "reviewerID", "asin", "summary", "reviewText",
    "overall" and "category".

    "helpful" is unpacked as a pair (pf, tf) and the helpfulness score is
    pf / tf -- presumably helpful votes over total votes; confirm against
    the dataset description. A review is flagged `good` when the score is
    above 0.80 and `bad` when it is at most 0.20.

    Records with tf == 0 have no defined score and are skipped rather than
    raising ZeroDivisionError. Blank lines are skipped as well.

    Args:
        fn: path of the JSON-lines file.

    Returns:
        pandas.DataFrame with the columns listed in `col_names`, one row
        per retained review, in file order.

    Raises:
        KeyError: if a record lacks one of the fields above or its
            category is not in `category_map`.
    """
    data = []
    with open(fn) as f:
        for line in f:
            if not line.strip():
                # A trailing newline or blank separator is not a record.
                continue
            d = json.loads(line)
            pf, tf = d["helpful"]
            if tf == 0:
                # No votes -> the ratio is undefined; the review cannot be
                # placed in any helpfulness class.
                continue
            score = float(pf) / tf
            row = [d["reviewerID"],
                   d["asin"],
                   # Summary and body are treated as a single text.
                   d["summary"] + ' ' + d["reviewText"],
                   d["overall"],
                   category_map[d["category"]],
                   int(score > 0.80),
                   int(score <= 0.20)]
            data.append(row)
    return pd.DataFrame(data, columns=col_names)

print('Creating dataframe...')
# Load every review, then shuffle the rows and renumber the index from 0.
df = read_dataset(dataset_src_fn).sample(frac=1).reset_index(drop=True)
df.head()

Creating dataframe...


Unnamed: 0,reviewer_id,asin,review_text,overall,category,good,bad
0,AA5ZS75RGRYUN,B001CWT4JI,Amazing and Disgusting I bought this product o...,5.0,12,1,0
1,A2EAG9GXCYXLBZ,0310212472,A Beautifully Written Legend Several years ago...,5.0,5,0,0
2,A18O7HJIJNTO46,0553270257,Not her best but not bad This book is a pretty...,3.0,5,0,1
3,A1JBBR4MNGQ70G,1419819100,This train doesn't seem to know where its goi...,2.0,15,0,0
4,A1P7S9FE1QPK31,B002NPY7GS,Is Quality Control Slipping? What Is Going On ...,1.0,15,0,0


### Prune dataset

In [4]:
def _shuffled_head(frame, limit):
    """Shuffle the rows of `frame`, renumber them from 0, keep the first `limit`."""
    return frame.sample(frac=1).reset_index(drop=True).head(limit)

# One capped random sample per helpfulness class. The neutral class is made
# of the reviews flagged neither good nor bad.
good_mask = df['good'] == 1
df_good = _shuffled_head(df.loc[good_mask], num_good)

bad_mask = df['bad'] == 1
df_bad = _shuffled_head(df.loc[bad_mask], num_bad)

neutral_mask = (df['good'] == 0) & (df['bad'] == 0)
df_neutral = _shuffled_head(df.loc[neutral_mask], num_neutral)

print('%d %d %d' % (len(df_good), len(df_bad), len(df_neutral)))

50000 100000 50000


In [5]:
# Stack the three class samples under a fresh 0..n-1 index, then shuffle so
# the classes are interleaved.
df_min = pd.concat([df_good, df_bad, df_neutral], ignore_index=True)
df_min = df_min.sample(frac=1).reset_index(drop=True)
del df, df_good, df_bad, df_neutral # Free memory

# Sanity check: class sizes in the combined frame.
is_good = df_min['good'] == 1
is_bad = df_min['bad'] == 1
is_neutral = (df_min['good'] == 0) & (df_min['bad'] == 0)
print("Number of entries: %d" % len(df_min))
print("Good count: %d" % is_good.sum())
print("Bad count: %d" % is_bad.sum())
print("Neutral count: %d" % is_neutral.sum())
df_min.head()

Number of entries: 200000
Good count: 50000
Bad count: 100000
Neutral count: 50000


Unnamed: 0,reviewer_id,asin,review_text,overall,category,good,bad
0,AH55IQMRBY2Y0,0762418133,Excellent book!!! Thomas Paine was the most im...,5.0,5,0,0
1,A1G5Q9HBN0EGDV,0780626702,"""I Was Afraid You'd Stop Loving Me"" Joan Fonta...",5.0,15,1,0
2,A3QVAKVRAH657N,0061741361,Early and excellent Elmore Leonard In 1992 the...,5.0,5,1,0
3,A1AFXJ8U72MD6L,068481594X,Great methods here if you stick to the formula...,4.0,5,0,0
4,A2YM6JTQIBZ8YC,B00008DDWT,Scumbags Skating on and destroying other peopl...,1.0,15,0,1


### Normalize review text

In [6]:
# A set gives O(1) membership tests; the NLTK stop-word list would otherwise
# be scanned linearly for every token of every review.
stop_words = set(stopwords.words('english'))
word_pattern = re.compile("[A-Za-z]+")
n_entries = len(df_min)

def normalize_review_text(text):
    """Reduce a review to its lower-cased content words.

    The text is lower-cased and split into maximal runs of ASCII letters;
    digits, punctuation and whitespace act as separators. Tokens that are
    English stop words or have two characters or fewer are dropped.

    Args:
        text: the raw review string.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    tokens = word_pattern.findall(text.lower())
    return ' '.join(w for w in tokens if len(w) > 2 and w not in stop_words)

# Normalize into a plain list and attach it in one step. Writing rows one at
# a time into a pre-allocated frame is far slower and leaves every column
# with object dtype.
norm_texts = []
for idx, text in enumerate(df_min['review_text']):
    norm_texts.append(normalize_review_text(text))
    if idx % 10000 == 0 or idx + 1 == n_entries:
        print('Entry ' + str(idx + 1) + '/' + str(n_entries) + '.')

# Same columns, order and 0..n-1 index as before; only review_text changes.
df_norm = (df_min[col_names]
           .reset_index(drop=True)
           .assign(review_text=norm_texts))

print("Finished pre-processing review text.")

Entry 1/200000.
Entry 10001/200000.
Entry 20001/200000.
Entry 30001/200000.
Entry 40001/200000.
Entry 50001/200000.
Entry 60001/200000.
Entry 70001/200000.
Entry 80001/200000.
Entry 90001/200000.
Entry 100001/200000.
Entry 110001/200000.
Entry 120001/200000.
Entry 130001/200000.
Entry 140001/200000.
Entry 150001/200000.
Entry 160001/200000.
Entry 170001/200000.
Entry 180001/200000.
Entry 190001/200000.
Entry 200000/200000.
Finished pre-processing review text.


In [7]:
# Persist the normalized frame as '|'-separated text (index column included).
df_norm.to_csv(final_df_name, sep='|')
print("Saved to disk!")

Saved to disk!
