### Organize imports

In [2]:
import json
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import random
import re
import string

### Set parameters

In [3]:
# Maximum number of rows kept from each helpfulness class when the dataset
# is pruned (good / bad / neither).
num_good = 100000
num_bad = 50000
num_neutral = 50000
# Input file, read line by line as one JSON object per line.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset_src_fn = '/Users/kaushiksurikuchi/Downloads/deskewed_shuffled_dataset.json'
# Destination of the final '|'-separated CSV.
final_df_name = 'data/full_cols_good.csv'

### Load full dataset

In [4]:
# Maps each product-category name found in a record's "category" field to the
# integer id stored in the dataframe's `category` column.
category_map = {
    "Amazon_Instant_Video" : 0,
    "Apps_for_Android" : 1,
    "Automotive" : 2,
    "Baby" : 3,
    "Beauty" : 4,
    "Books" : 5,
    "CDs_and_Vinyl" : 6,
    "Cell_Phones_and_Accessories" : 7,
    "Clothing_Shoes_and_Jewelry" : 8,
    "Digital_Music" : 9,
    "Electronics" : 10,
    "Grocery_and_Gourmet_Food" : 11,
    "Health_and_Personal_Care" : 12,
    "Home_and_Kitchen" : 13,
    "Kindle_Store" : 14,
    "Movies_and_TV" : 15,
    "Musical_Instruments" : 16,
    "Office_Products" : 17,
    "Patio_Lawn_and_Garden" : 18,
    "Pet_Supplies" : 19,
    "Sports_and_Outdoors" : 20,
    "Tools_and_Home_Improvement" : 21,
    "Toys_and_Games" : 22,
    "Video_Games" : 23
}

# Column names, in order, of the dataframe returned by read_dataset.
col_names = ["reviewer_id", "asin", "review_text", "overall", "category",
             "good", "bad"]

def read_dataset(fn):
    """Load a JSON-lines review file into a labelled dataframe.

    Each non-blank line of `fn` is parsed as one JSON object with the keys
    "reviewerID", "asin", "summary", "reviewText", "overall", "category" and
    "helpful". The "helpful" value is unpacked as a pair and its ratio
    (first / second) is the helpfulness score used for labelling:

    * good = 1 when score >  0.80
    * bad  = 1 when score <= 0.20

    Records whose second "helpful" value is 0 have no defined score and are
    skipped (previously they raised ZeroDivisionError). Blank lines are
    skipped as well.

    Parameters
    ----------
    fn : str
        Path of the JSON-lines file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per retained record, with the columns listed in `col_names`.
        `review_text` is the summary and the review body joined by a space;
        `category` is the integer id from `category_map`.

    Raises
    ------
    KeyError
        If a record lacks an expected key or has a category that is not in
        `category_map`.
    """
    data = []
    with open(fn, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            d = json.loads(line)
            # presumably (helpful votes, total votes) — verify against the data source
            helpful_votes, total_votes = d["helpful"]
            if total_votes == 0:
                continue  # ratio undefined: the record cannot be labelled
            score = helpful_votes / total_votes
            data.append([d["reviewerID"],
                         d["asin"],
                         d["summary"] + ' ' + d["reviewText"],
                         d["overall"],
                         category_map[d["category"]],
                         int(score > 0.80),
                         int(score <= 0.20)])
    return pd.DataFrame(data, columns=col_names)

print('Creating dataframe...')
df = read_dataset(dataset_src_fn)
# Randomize entry order. The fixed random_state makes this shuffle return the
# same order every time it is applied to the same loaded frame.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Creating dataframe...


Unnamed: 0,reviewer_id,asin,review_text,overall,category,good,bad
0,AI0OAQ6E2O8VF,B002M36R1O,Criterion's most unique release so far This DV...,4.0,15,0,0
1,A2W2GPF65X51SF,B007Y2PMJQ,Extremely disappointing This is one of the fir...,1.0,5,1,0
2,A2WE1FKSL1I38D,B001AQTWF2,Hard to Look Past the Cracks & Strain In Whit'...,1.0,6,0,0
3,A11XUKQIBVXY77,B0015LPS1E,"A little tired, but... I enjoyed this one. I'...",3.0,15,0,0
4,AF3X7J0XC391L,0750933372,How did sheep survive among such wolves? As Je...,4.0,5,1,0


### Prune dataset

In [5]:
def _shuffled_head(frame, limit, seed=42):
    """Shuffle `frame`, renumber its index from 0, and keep at most `limit` rows.

    The fixed `seed` makes the selection repeatable for a given input frame.
    """
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True).head(limit)

# One capped, shuffled subset per helpfulness class.
df_good = _shuffled_head(df.loc[df['good'] == 1], num_good)
df_bad = _shuffled_head(df.loc[df['bad'] == 1], num_bad)
# "Neutral" rows are those flagged as neither good nor bad.
df_neutral = _shuffled_head(df.loc[(df['good'] == 0) & (df['bad'] == 0)], num_neutral)

print(len(df_good), len(df_bad), len(df_neutral))

100000 50000 50000


In [7]:
# Stack the three class subsets into one frame with a fresh 0..n-1 index.
df_min = pd.concat([df_good, df_bad, df_neutral], axis=0, join='outer', ignore_index=True)
# Interleave the classes; the fixed random_state makes this shuffle repeatable
# for a given concatenated frame.
df_min = df_min.sample(frac=1, random_state=42).reset_index(drop=True)
del df, df_good, df_bad, df_neutral # Free memory

# Count class members directly from boolean masks instead of materialising
# filtered copies of the frame.
is_good = df_min['good'] == 1
is_bad = df_min['bad'] == 1
is_neutral = (df_min['good'] == 0) & (df_min['bad'] == 0)
print("Number of entries:", len(df_min))
print("Good count:", int(is_good.sum()))
print("Bad count:", int(is_bad.sum()))
print("Neutral count:", int(is_neutral.sum()))
df_min.head()

Number of entries: 200000
Good count: 100000
Bad count: 50000
Neutral count: 50000


Unnamed: 0,reviewer_id,asin,review_text,overall,category,good,bad
0,A23Y9RGNUBONZK,158314045X,I LOVED IT LOVED IT LOVED IT LOVED IT I am not...,5.0,5,1,0
1,A32NXKGY4Z9O4C,B001BX4NR6,One of the better cameras in this price range ...,5.0,10,1,0
2,AEBOD21JROJ4G,B0000CEUM7,Now I use my computer instead of the other way...,5.0,10,1,0
3,A24N1BAS3CU27H,B001KL3GZE,At last!!! I first came across Irish Singer So...,5.0,6,1,0
4,A2QWMT9F3LUSHC,B0000C3I4A,Did I get a different version than everyone he...,5.0,6,1,0


### Normalize review text

In [None]:
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests; the list above keeps its
# original name and type.
stop_word_set = frozenset(stop_words)
word_pattern = re.compile("[A-Za-z]+")
n_entries = len(df_min)

def normalize_review_text(text):
    """Return `text` reduced to its informative lowercase words.

    The text is lowercased and split into maximal runs of ASCII letters
    (`word_pattern`); tokens of one or two letters and tokens found in the
    English stop-word list are dropped. The survivors are joined with
    single spaces.
    """
    # findall on the precompiled pattern yields the same tokens as
    # nltk.regexp_tokenize(text, word_pattern) without building a tokenizer
    # object for every review.
    tokens = word_pattern.findall(text.lower())
    return ' '.join(w for w in tokens if len(w) > 2 and w not in stop_word_set)

# Normalise every review, reporting progress every 10000 entries and on the
# last entry.
norm_texts = []
for idx, text in enumerate(df_min['review_text']):
    norm_texts.append(normalize_review_text(text))
    if idx % 10000 == 0 or idx + 1 == n_entries:
        print('Entry ' + str(idx + 1) + '/' + str(n_entries) + '.')

# Build the result in one step instead of writing 200k rows one at a time
# into a preallocated object-dtype frame. All columns other than review_text
# are carried over unchanged and keep df_min's dtypes.
df_norm = df_min[col_names].assign(review_text=norm_texts).reset_index(drop=True)

print("Finished pre-processing review text.")

Entry 1/200000.
Entry 10001/200000.
Entry 20001/200000.
Entry 30001/200000.
Entry 40001/200000.
Entry 50001/200000.
Entry 60001/200000.


In [11]:
# Persist the normalised dataframe as a '|'-separated CSV (index included).
df_norm.to_csv(final_df_name, sep='|')
print("Saved to disk!")

Saved to disk!
