## Imports

In [1]:
import numpy as np
import pandas as pd
import re
import json
import time
import datetime

## Data

In [3]:
%%time
data = "dataset/Cell_Phones_and_Accessories_5.json.gz"
df_phones = pd.read_json(data, lines = True, compression = "gzip")
df_phones.head()

Wall time: 12.6 s


Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,7508492919,,5,Looks even better in person. Be careful to not...,"08 4, 2014",A24E3SXTC62LJI,Claudia Valdivia,{'Color:': ' Bling'},Can't stop won't stop looking at it,1407110400,True,
1,7508492919,,5,When you don't want to spend a whole lot of ca...,"02 12, 2014",A269FLZCB4GIPV,sarah ponce,,1,1392163200,True,
2,7508492919,,3,"so the case came on time, i love the design. I...","02 8, 2014",AB6CHQWHZW4TV,Kai,,Its okay,1391817600,True,
3,7508492919,,2,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,"02 4, 2014",A1M117A53LEI8,Sharon Williams,,CASE,1391472000,True,
4,7508492919,,4,"I liked it because it was cute, but the studs ...","02 3, 2014",A272DUT8M88ZS8,Bella Rodriguez,,Cute!,1391385600,True,


### Load other datasets

In [None]:
%%time
data = "dataset/Video_Games_5.json.gz"
df_videogames = pd.read_json(data, lines = True, compression = "gzip")
df_videogames.head()

In [None]:
%%time
data = "dataset/Automotive_5.json.gz"
df_automotive = pd.read_json(data, lines = True, compression = "gzip")
df_automotive.head()

In [None]:
%%time
data = "dataset/Grocery_and_Gourmet_Food_5.json.gz"
df_grocery = pd.read_json(data, lines = True, compression = "gzip")
df_grocery.head()

## Combine Dataframes

In [None]:
df_videogames.shape

In [None]:
df_phones.shape

In [None]:
df_automotive.shape

In [None]:
df_grocery.shape

In [None]:
df = df_videogames.append(df_phones, sort=True)
df = df.append(df_automotive, sort=True)
df = df.append(df_grocery, sort=True)
df.head()

In [4]:
df.shape

(1128437, 12)

Remove rows with missing values from `reviewText`:

In [5]:
df = df[df.reviewText.notnull()].reindex()

Remove rows associated with non-verified reviews

In [None]:
df = df[df.verified].reindex()

In [6]:
df.shape

(1127672, 12)

Definition of `X` and `y` (class to predict).

In [7]:
X = df.reviewText.values
y = df.overall.values > 3

## Preprocessing/Normalization

#### Emoticons Dictionary

In [8]:
emoticon_repl = {
    # positive emoticons
    r":-?d+": " good ", # :D, :-D, :DD, ecc. in lower case
    r":[- ]?\)+": " good ", # :-), :), :-)), :)), ecc
    r";-?\)+": " good ", # ;). ;)). ;-), ecc.
    r"\(+-?:": " good ", # (:, (-:, ecc.
    r"=\)+" : " good ",
    r"\b<3\b" : " good ",    
    # negative emoticons
    r"[\s\r\t\n]+:/+": " bad ", # :/
    r":\\+": " bad ", # :\
    r"[\s\r\t\n]+\)-?:": " bad ",  # ):, )):, )-:, ecc.
    r":-?\(+": " bad ", # :(, :-(, :((, ecc.
    r"[\s\t\r\n]+d+-?:": " bad "
}
# TODO: add other emoticons?

We have used the following function to check if an emoticon is present in some review:

In [9]:
# Check if re is present in some review
def check_presence(regex, stop = 1, verbose = True):
    j = 0
    tv = False
    for i in range(len(X)):
        if re.search(regex, X[i].lower()):
            tv = True
            j += 1
            sentiment = "positive" if y[i] else "negative"
            if verbose:
                print("Review #",i)
                print("Sentiment: {}".format(sentiment))
                print("Occurrences of the regex {} :".format(regex), re.findall(regex, X[i].lower()))
                print("Review text:", X[i]+"\n")
            if j == stop:
                break
    return tv

In [10]:
check_presence(r":-?d+")

Review # 3248
Sentiment: positive
Occurrences of the regex :-?d+ : [':d']
Review text: Great!!! Love to use this to change my voice to a squeaker :D



True

In [11]:
check_presence(r"[\s\r\t\n]+\)-?:")

Review # 94758
Sentiment: negative
Occurrences of the regex [\s\r\t\n]+\)-?: : [' ):']
Review text: This product was fantastic when I first got it. I could easily transfer my music from my Macbook Pro to my Samsung Galaxy Tab 2.
My SD card became corrupted (not related to the product) so I had to go put everything back on my tablet and this adapter no longer works. ): It seems like it's too loose for the input.



True

Function for checking if an emoticon corresponds to the expected sentiment:

In [12]:
from scipy.stats import norm
from scipy.stats import chi2_contingency

In [13]:
def check_sentiment(regex):
    # https://online.stat.psu.edu/stat414/node/268/
    pos = 0
    neg = 0
    lx = len(X)
    
    for i in range(lx):
        if re.search(regex, X[i].lower()):
            if y[i]:
                pos += 1
            else:
                neg += 1
                
    tot_reviews = len(y)
    tot_pos = np.sum(y)
    tot_neg = tot_reviews - tot_pos
                
    n1 = pos + neg
    n2 = tot_reviews - n1
    
    if emoticon_repl[regex] in [" good ", " great "]:
        test1 = pos
        test2 = tot_pos - pos
        sent = "positive"        
    else:
        test1 = neg
        test2 = tot_neg - neg
        sent = "negative"

        
    # One tailed proportion test    
    p1 = test1/n1
    p2 = test2/n2
    p = (test1 + test2)/(n1+n2)
    num = (p1-p2)
    denom = np.sqrt(p*(1-p)*(1/n1 + 1/n2))
    prop_zstat = num/denom
    prop_pvalue = norm.sf(abs(prop_zstat))
    
    
    # Chisq test
    obs = np.array([[pos, neg],
                    [(tot_pos-pos), (tot_neg-neg)]])
    chi_zstat, chi_pvalue, _, _ = chi2_contingency(obs)
    
    cont_table = "\n{0:>16s} | No {0}\nPos {1:12d} | {2:d}\nNeg {3:12d} | {4:d}\n"\
    .format(regex, pos,tot_pos-pos, neg, tot_neg-neg)
    
    #"Positive reviews containing regex {} : {}; p1 = {:.2f}\n".format(regex, pos, pos/n1) + \
    #"Positive reviews not containing regex {} : {}; p2 = {:.2f}\n".format(regex, tot_pos-pos, (tot_pos-pos)/n2) + \
    print("Contingency table:\n" + \
          cont_table + \
          "\np1: #({} reviews) / #(reviews containing {}) = {:.2f}\n".format(sent, regex, test1/n1) + \
          "p2: #({} reviews) / #(reviews not containing {}) = {:.2f}\n".format(sent, regex, test2/n2) + \
          "H_0 proportion test: p1 - p2 <= 0\n" + \
          "Proportion test p-value = {}\n".format(prop_pvalue) + \
          
          "Chisq test p-value = {}".format(chi_pvalue))

For instance, we would expect that the proportion of positive reviews containing ":)" and similar is larger than the proportion of positive reviews that do not contain ":)".

Similarly, we would expect that the proportion of negative reviews containing ":(" and similar is larger than the proportion of negative reviews that do not contain ":(".

In order to check this, we test the significance of this difference for each emoticon in the dictionary:

In [14]:
# Commented out for slowness and long output, interesting cases follow
# for k in emoticon_repl:
#     print(k)
#     # check_sentiment_chisq(k)
#     check_sentiment(k)
#     print("\n")

Example:

In [15]:
check_sentiment(r":[- ]?\)+")

Contingency table:

       :[- ]?\)+ | No :[- ]?\)+
Pos         8851 | 881935
Neg          588 | 236298

p1: #(positive reviews) / #(reviews containing :[- ]?\)+) = 0.94
p2: #(positive reviews) / #(reviews not containing :[- ]?\)+) = 0.79
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 1.131102566014455e-274
Chisq test p-value = 3.545347943822289e-274


There are more negative reviews containing "):" and similar than positive ones, even though the difference is not that big:

In [16]:
check_sentiment(r"[\s\r\t\n]+\)-?:")

Contingency table:

[\s\r\t\n]+\)-?: | No [\s\r\t\n]+\)-?:
Pos            8 | 890778
Neg           13 | 236873

p1: #(negative reviews) / #(reviews containing [\s\r\t\n]+\)-?:) = 0.62
p2: #(negative reviews) / #(reviews not containing [\s\r\t\n]+\)-?:) = 0.21
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 2.1032753153533324e-06
Chisq test p-value = 1.4705261732223621e-05


The emoticon is very rare, so we delete it from the list:

In [17]:
del emoticon_repl[r"[\s\r\t\n]+\)-?:"]

The "D:" is very rare, and the difference in proportions is not significant.

In [18]:
#check_sentiment("d+-?:")
check_sentiment(r"[\s\t\r\n]+d+-?:")

Contingency table:

[\s\t\r\n]+d+-?: | No [\s\t\r\n]+d+-?:
Pos            7 | 890779
Neg            4 | 236882

p1: #(negative reviews) / #(reviews containing [\s\t\r\n]+d+-?:) = 0.36
p2: #(negative reviews) / #(reviews not containing [\s\t\r\n]+d+-?:) = 0.21
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 0.10558609928990187
Chisq test p-value = 0.3787168023915991


In [19]:
del emoticon_repl[r"[\s\t\r\n]+d+-?:"]

#### Slang and abbreviations/acronyms Dictionary

Old approach: download list of internet acronyms and slang from http://www.netlingo.com/acronyms.php. Resulted in too many terms, with many corresponding to common words.

Manual definition of a dictionary:

In [20]:
slang_repl = {
    "tbh" : "to be honest", "afaik" : "as far as i know",
    "ama" : "ask me anything", "b4" : "before",
    r"baa+d" : " bad ", "brb" : "be right back",
    "btaim" : "be that as it may", "bts" : "behind the scenes",
    "btw" : "by the way", "dyk" : "did you know",
    "eli5" : "explain like i am five", "fomo" : "fear of missing out",
    "ftfy" : "fixed that for you", "ftw" : "for the win",
    "fyi" : "for your information", "g2g" : "got to go",
    r"gooo+d": " good ", "gtg" : "got to go",
    "gg" : "good game", r"gr8t*": " great ",
    "gtr" : "got to run", "hmb" : "hit me back",
    "hmu" : "hit me up", "hth" : "happy to help",
    "icymi" : "in case you missed it", "idc" : "i do not care",
    "idk" : "i do not know", "ikr" : "i know right",
    "ily" : "i love you", "imho" : "in my humble opinion",
    "imo" : "in my opinion", "irl" : "in real life",
    "jk" : "just kidding", "lmao" : "laughing my ass off",
    "lmk" : "let me know", "lol" : "laughing out loud",
    "mfw" : "my face when", "nbd" : "no big deal",
    "nm" : "not much", "nvm" : "never mind",
    "omw" : "on my way", "op" : "original poster",
    "ppl" : "people", "rofl" : "rolling on the floor laughing",
    "roflmao" : "rolling on the floor laughing my ass off", "smh" : "shaking my head",
    "tbbh" : "to be brutally honest", "tfw" : "that feeling when",
    "til" : "today i learned", "tmi" : "too much information",
    "wbu" : "what about you", "yolo" : "you only live once"  
}

In [21]:
# from tqdm.notebook import tqdm
# 
# slang_keys = list(slang_repl.keys())
# 
# present_slang = [k for k in tqdm(slang_keys) if check_presence(r"\b"+k+r"\b", verbose = False)]

In [22]:
# slang_repl_subset = {k : slang_repl[k] for k in present_slang}

In [23]:
# with open('slang_subset_manual.json', 'w') as fid:
#     json.dump(slang_repl_subset, fid)

In [24]:
with open('dataset/slang_subset_manual.json', 'r') as fid:
    slang_repl_subset = json.load(fid)

In [25]:
set(slang_repl.keys())-set(slang_repl_subset.keys())

{'btaim',
 'dyk',
 'eli5',
 'ftfy',
 'gtr',
 'hmb',
 'hmu',
 'icymi',
 'ikr',
 'ily',
 'mfw',
 'omw',
 'roflmao',
 'tbbh',
 'tfw',
 'wbu'}

In [26]:
slang_repl = slang_repl_subset

#### Contracted forms

We will be interested in combinations of "not" + other terms and similar.

In [27]:
contracted_repl = {
    # casi particolari
    r"won\'t" : "will not", r"won\'" : "will not",
    r"can\'t": "can not", r"shan\'t": "shall not",
    r"shan\'": "shall not", r"ain\'t": "is not",
    r"ain\'": "is not",
    # casi generali
    r"n\'t": " not", r"\'t": " not",
    r"n\'": " not",
    r"\'s": " is",
    r"\'ve": " have", 
    r"\'re": " are", 
    r"\'ll": " will", # Might also be "shall", in any case both will be considered stop words
    r"\'d": " would", # Might also be "had", in any case both will be considered stop words
}

### Complete preprocessing function

Complete preprocessing function, including:

- Lowering the text;
- Removing video html part from reviews including a video;
- Removing URLs;
- Substituting emoticons with associated sentiment;
- Translating slang and acronyms;
- Expanding contracted forms;
- Deleting non-alphanumeric characters (excluding the ones useful for tokenization);
- Deleting terms consisting of digits only.

Stop words removal and spelling correction are done after tokenization.

In [28]:
def preprocess(sent, translate_slang = True):
    
    sent = sent.lower()
    sent = re.sub(r'^<div id="video.*>&nbsp;', '', sent) # Video-review part
    sent = re.sub('https?://[A-Za-z0-9./]+', '', sent) # URLs
    
    for k in emoticon_repl:
        sent = re.sub(k, emoticon_repl[k], sent)

    if translate_slang:
        for k in slang_repl:
            sent = re.sub(r"\b"+re.escape(k)+r"\b", slang_repl[k], sent)
            # sent = sent.replace(k, slang_repl[k])
        
    for k in contracted_repl:
        sent = re.sub(k, contracted_repl[k], sent)
    
    sent = re.sub('[/]+', ' ', sent) # word1/word2 to word1 word2
    # Remove non-alphanumeric characters (but not - and _, might be useful for tokenization)
    sent = re.sub('[^A-Za-z0-9-_ ]+', '', sent)

    # Remove words that are digits only
    sent = re.sub('\b\d+\b', '', sent)
    
    return sent

Example:

In [29]:
X[1272]

'<div id="video-block-R2V9ODY7BN8VJ2" class="a-section a-spacing-small a-spacing-top-mini video-block"></div><input type="hidden" name="" value="https://images-na.ssl-images-amazon.com/images/I/B1juE9-rP9S.mp4" class="video-url"><input type="hidden" name="" value="https://images-na.ssl-images-amazon.com/images/I/61S9PJpIfhS.png" class="video-slate-img-url">&nbsp;Great little Usb Hub.  7 ports.  3 on each side, 1 on the front.  Powered by AC or usb.  Lights remain on at all times regardless of what are plugged into the ports.  A must buy if you need more usb ports for your comp.  Works on windows 7.  Have had almost a year with zero issues.'

In [30]:
preprocess(X[1272])

'great little usb hub  7 ports  3 on each side 1 on the front  powered by ac or usb  lights remain on at all times regardless of what are plugged into the ports  a must buy if you need more usb ports for your comp  works on windows 7  have had almost a year with zero issues'

In [31]:
# X[1697]
X[27]

"crystals fell off as nothing :( that's why I really didn't like it but as soon as I saw it I liked it but the stones"

In [32]:
preprocess(X[27])

'crystals fell off as nothing  bad  that is why i really did not like it but as soon as i saw it i liked it but the stones'

In [33]:
%%time
df["reviewTextPreprocessed"] = df["reviewText"].apply(preprocess)

Wall time: 5min 30s


In [34]:
df["reviewText"].loc[1272]

'<div id="video-block-R2V9ODY7BN8VJ2" class="a-section a-spacing-small a-spacing-top-mini video-block"></div><input type="hidden" name="" value="https://images-na.ssl-images-amazon.com/images/I/B1juE9-rP9S.mp4" class="video-url"><input type="hidden" name="" value="https://images-na.ssl-images-amazon.com/images/I/61S9PJpIfhS.png" class="video-slate-img-url">&nbsp;Great little Usb Hub.  7 ports.  3 on each side, 1 on the front.  Powered by AC or usb.  Lights remain on at all times regardless of what are plugged into the ports.  A must buy if you need more usb ports for your comp.  Works on windows 7.  Have had almost a year with zero issues.'

In [35]:
df["reviewTextPreprocessed"].loc[1272]

'great little usb hub  7 ports  3 on each side 1 on the front  powered by ac or usb  lights remain on at all times regardless of what are plugged into the ports  a must buy if you need more usb ports for your comp  works on windows 7  have had almost a year with zero issues'

Delete rows where reviews are empty after preprocessing them:

In [36]:
df = df[~df.reviewTextPreprocessed.str.contains(r"^\s*$")]

39 reviews are empty after preprocessing:

In [41]:
df.shape

(1127633, 13)

In [37]:
len(X)-len(df)

39

Save cleaned dataset:

In [43]:
df[["reviewTextPreprocessed", "overall", "verified"]].to_csv("dataset/preprocessed_dataset.csv.zip",
                                                             index = False, compression='zip')