## Imports

In [1]:
import numpy as np
import pandas as pd
import re
import json
import time
import datetime

## Data

In [2]:
%%time
data = "dataset/Cell_Phones_and_Accessories_5.json.gz"
df_phones = pd.read_json(data, lines = True, compression = "gzip")
df_phones.head()

Wall time: 19.5 s


### Load other datasets

In [3]:
%%time
data = "dataset/Video_Games_5.json.gz"
df_videogames = pd.read_json(data, lines = True, compression = "gzip")
df_videogames.head()

Wall time: 12.4 s


In [4]:
%%time
data = "dataset/Automotive_5.json.gz"
df_automotive = pd.read_json(data, lines = True, compression = "gzip")
df_automotive.head()

Wall time: 25.3 s


In [5]:
%%time
data = "dataset/Grocery_and_Gourmet_Food_5.json.gz"
df_grocery = pd.read_json(data, lines = True, compression = "gzip")
df_grocery.head()

Wall time: 19.1 s


## Combine Dataframes

In [6]:
df_videogames.shape

(497577, 12)

In [7]:
df_phones.shape

(1128437, 12)

In [8]:
df_automotive.shape

(1711519, 12)

In [9]:
df_grocery.shape

(1143860, 12)

In [11]:
df = df_videogames.append(df_phones)
df = df.append(df_automotive)
df = df.append(df_grocery)
df.head()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,700026657,,5,"This game is a bit hard to get the hang of, bu...","10 17, 2015",A1HP7NVNPFMA4N,Ambrosia075,,but when you do it's great.,1445040000,True,
1,700026657,,4,I played it a while but it was alright. The st...,"07 27, 2015",A1JGAP0185YJI6,travis,,"But in spite of that it was fun, I liked it",1437955200,False,
2,700026657,,3,ok game.,"02 23, 2015",A1YJWEXHQBWK2B,Vincent G. Mezera,,Three Stars,1424649600,True,
3,700026657,,2,"found the game a bit too complicated, not what...","02 20, 2015",A2204E1TH211HT,Grandma KR,,Two Stars,1424390400,True,
4,700026657,,5,"great game, I love it and have played it since...","12 25, 2014",A2RF5B5H74JLPE,jon,,love this game,1419465600,True,


In [12]:
df.shape

(4481393, 12)

Remove rows with missing values from `reviewText`:

In [13]:
df = df[df.reviewText.notnull()].reindex()

Remove rows associated with non-verified reviews

In [14]:
df = df[df.verified].reindex()

In [15]:
df.shape

(3942028, 12)

Definition of `X` and `y` (class to predict).

In [16]:
X = df.reviewText.values
y = df.overall.values > 3

## Preprocessing/Normalization

#### Emoticons Dictionary

In [17]:
emoticon_repl = {
    # positive emoticons
    r":-?d+": " good ", # :D, :-D, :DD, ecc. in lower case
    r":[- ]?\)+": " good ", # :-), :), :-)), :)), ecc
    r";-?\)+": " good ", # ;). ;)). ;-), ecc.
    r"\(+-?:": " good ", # (:, (-:, ecc.
    r"=\)+" : " good ",
    r"\b<3\b" : " good ",    
    # negative emoticons
    r"[\s\r\t\n]+:/+": " bad ", # :/
    r":\\+": " bad ", # :\
    r"[\s\r\t\n]+\)-?:": " bad ",  # ):, )):, )-:, ecc.
    r":-?\(+": " bad ", # :(, :-(, :((, ecc.
    r"[\s\t\r\n]+d+-?:": " bad "
}
# TODO: add other emoticons?

We have used the following function to check if an emoticon is present in some review:

In [18]:
# Check if re is present in some review
def check_presence(regex, stop = 1, verbose = True):
    j = 0
    tv = False
    for i in range(len(X)):
        if re.search(regex, X[i].lower()):
            tv = True
            j += 1
            sentiment = "positive" if y[i] else "negative"
            if verbose:
                print("Review #",i)
                print("Sentiment: {}".format(sentiment))
                print("Occurrences of the regex {} :".format(regex), re.findall(regex, X[i].lower()))
                print("Review text:", X[i]+"\n")
            if j == stop:
                break
    return tv

In [19]:
check_presence(r":-?d+")

Review # 161
Sentiment: positive
Occurrences of the regex :-?d+ : [':d']
Review text: easy to put on and take off, good grip, cheap price, just don't rip it apart to see how strong it is and write a review about it and it's be fine :D



True

In [20]:
check_presence(r"[\s\r\t\n]+\)-?:")

Review # 5841
Sentiment: negative
Occurrences of the regex [\s\r\t\n]+\)-?: : [' ):']
Review text: this product was not as described and  I returned it now they are giving me the run around on getting my money back. ):



True

Function for checking if an emoticon corresponds to the expected sentiment:

In [21]:
from scipy.stats import norm
from scipy.stats import chi2_contingency

In [22]:
def check_sentiment(regex):
    # https://online.stat.psu.edu/stat414/node/268/
    pos = 0
    neg = 0
    lx = len(X)
    
    for i in range(lx):
        if re.search(regex, X[i].lower()):
            if y[i]:
                pos += 1
            else:
                neg += 1
                
    tot_reviews = len(y)
    tot_pos = np.sum(y)
    tot_neg = tot_reviews - tot_pos
                
    n1 = pos + neg
    n2 = tot_reviews - n1
    
    if emoticon_repl[regex] in [" good ", " great "]:
        test1 = pos
        test2 = tot_pos - pos
        sent = "positive"        
    else:
        test1 = neg
        test2 = tot_neg - neg
        sent = "negative"

        
    # One tailed proportion test    
    p1 = test1/n1
    p2 = test2/n2
    p = (test1 + test2)/(n1+n2)
    num = (p1-p2)
    denom = np.sqrt(p*(1-p)*(1/n1 + 1/n2))
    prop_zstat = num/denom
    prop_pvalue = norm.sf(abs(prop_zstat))
    
    
    # Chisq test
    obs = np.array([[pos, neg],
                    [(tot_pos-pos), (tot_neg-neg)]])
    chi_zstat, chi_pvalue, _, _ = chi2_contingency(obs)
    
    cont_table = "\n{0:>16s} | No {0}\nPos {1:12d} | {2:d}\nNeg {3:12d} | {4:d}\n"\
    .format(regex, pos,tot_pos-pos, neg, tot_neg-neg)
    
    #"Positive reviews containing regex {} : {}; p1 = {:.2f}\n".format(regex, pos, pos/n1) + \
    #"Positive reviews not containing regex {} : {}; p2 = {:.2f}\n".format(regex, tot_pos-pos, (tot_pos-pos)/n2) + \
    print("Contingency table:\n" + \
          cont_table + \
          "\np1: #({} reviews) / #(reviews containing {}) = {:.2f}\n".format(sent, regex, test1/n1) + \
          "p2: #({} reviews) / #(reviews not containing {}) = {:.2f}\n".format(sent, regex, test2/n2) + \
          "H_0 proportion test: p1 - p2 <= 0\n" + \
          "Proportion test p-value = {}\n".format(prop_pvalue) + \
          
          "Chisq test p-value = {}".format(chi_pvalue))

For instance, we would expect that the proportion of positive reviews containing ":)" and similar is larger than the proportion of positive reviews that do not contain ":)".

Similarly, we would expect that the proportion of negative reviews containing ":(" and similar is larger than the proportion of negative reviews that do not contain ":(".

In order to check this, we test the significance of this difference for each emoticon in the dictionary:

In [23]:
# Commented out for slowness and long output, interesting cases follow
# for k in emoticon_repl:
#     print(k)
#     # check_sentiment_chisq(k)
#     check_sentiment(k)
#     print("\n")

Example:

In [24]:
check_sentiment(r":[- ]?\)+")

Contingency table:

       :[- ]?\)+ | No :[- ]?\)+
Pos        26114 | 3290422
Neg         1525 | 623967

p1: #(positive reviews) / #(reviews containing :[- ]?\)+) = 0.94
p2: #(positive reviews) / #(reviews not containing :[- ]?\)+) = 0.84
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 0.0
Chisq test p-value = 0.0


There are more negative reviews containing "):" and similar than positive ones, even though the difference is not that big:

In [25]:
check_sentiment(r"[\s\r\t\n]+\)-?:")

Contingency table:

[\s\r\t\n]+\)-?: | No [\s\r\t\n]+\)-?:
Pos           18 | 3316518
Neg           30 | 625462

p1: #(negative reviews) / #(reviews containing [\s\r\t\n]+\)-?:) = 0.62
p2: #(negative reviews) / #(reviews not containing [\s\r\t\n]+\)-?:) = 0.16
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 4.675277707690704e-19
Chisq test p-value = 5.376302524112476e-18


The emoticon is very rare, so we delete it from the list:

In [26]:
del emoticon_repl[r"[\s\r\t\n]+\)-?:"]

The "D:" is very rare, and the difference in proportions is not significant.

In [27]:
#check_sentiment("d+-?:")
check_sentiment(r"[\s\t\r\n]+d+-?:")

Contingency table:

[\s\t\r\n]+d+-?: | No [\s\t\r\n]+d+-?:
Pos           28 | 3316508
Neg           12 | 625480

p1: #(negative reviews) / #(reviews containing [\s\t\r\n]+d+-?:) = 0.30
p2: #(negative reviews) / #(reviews not containing [\s\t\r\n]+d+-?:) = 0.16
H_0 proportion test: p1 - p2 <= 0
Proportion test p-value = 0.007214829921059048
Chisq test p-value = 0.02574679471755444


In [28]:
del emoticon_repl[r"[\s\t\r\n]+d+-?:"]

#### Slang and abbreviations/acronyms Dictionary

Old approach: download list of internet acronyms and slang from http://www.netlingo.com/acronyms.php. Resulted in too many terms, with many corresponding to common words.

Manual definition of a dictionary:

In [29]:
slang_repl = {
    "tbh" : "to be honest", "afaik" : "as far as i know",
    "ama" : "ask me anything", "b4" : "before",
    r"baa+d" : " bad ", "brb" : "be right back",
    "btaim" : "be that as it may", "bts" : "behind the scenes",
    "btw" : "by the way", "dyk" : "did you know",
    "eli5" : "explain like i am five", "fomo" : "fear of missing out",
    "ftfy" : "fixed that for you", "ftw" : "for the win",
    "fyi" : "for your information", "g2g" : "got to go",
    r"gooo+d": " good ", "gtg" : "got to go",
    "gg" : "good game", r"gr8t*": " great ",
    "gtr" : "got to run", "hmb" : "hit me back",
    "hmu" : "hit me up", "hth" : "happy to help",
    "icymi" : "in case you missed it", "idc" : "i do not care",
    "idk" : "i do not know", "ikr" : "i know right",
    "ily" : "i love you", "imho" : "in my humble opinion",
    "imo" : "in my opinion", "irl" : "in real life",
    "jk" : "just kidding", "lmao" : "laughing my ass off",
    "lmk" : "let me know", "lol" : "laughing out loud",
    "mfw" : "my face when", "nbd" : "no big deal",
    "nm" : "not much", "nvm" : "never mind",
    "omw" : "on my way", "op" : "original poster",
    "ppl" : "people", "rofl" : "rolling on the floor laughing",
    "roflmao" : "rolling on the floor laughing my ass off", "smh" : "shaking my head",
    "tbbh" : "to be brutally honest", "tfw" : "that feeling when",
    "til" : "today i learned", "tmi" : "too much information",
    "wbu" : "what about you", "yolo" : "you only live once"  
}

In [30]:
# from tqdm.notebook import tqdm
# 
# slang_keys = list(slang_repl.keys())
# 
# present_slang = [k for k in tqdm(slang_keys) if check_presence(r"\b"+k+r"\b", verbose = False)]

In [31]:
# slang_repl_subset = {k : slang_repl[k] for k in present_slang}

In [32]:
# with open('slang_subset_manual.json', 'w') as fid:
#     json.dump(slang_repl_subset, fid)

In [33]:
with open('dataset/slang_subset_manual.json', 'r') as fid:
    slang_repl_subset = json.load(fid)

In [34]:
set(slang_repl.keys())-set(slang_repl_subset.keys())

{'btaim',
 'dyk',
 'eli5',
 'ftfy',
 'gtr',
 'hmb',
 'hmu',
 'icymi',
 'ikr',
 'ily',
 'mfw',
 'omw',
 'roflmao',
 'tbbh',
 'tfw',
 'wbu'}

In [35]:
slang_repl = slang_repl_subset

#### Contracted forms

We will be interested in combinations of "not" + other terms and similar.

In [36]:
contracted_repl = {
    # casi particolari
    r"won\'t" : "will not", r"won\'" : "will not",
    r"can\'t": "can not", r"shan\'t": "shall not",
    r"shan\'": "shall not", r"ain\'t": "is not",
    r"ain\'": "is not",
    # casi generali
    r"n\'t": " not", r"\'t": " not",
    r"n\'": " not",
    r"\'s": " is",
    r"\'ve": " have", 
    r"\'re": " are", 
    r"\'ll": " will", # Might also be "shall", in any case both will be considered stop words
    r"\'d": " would", # Might also be "had", in any case both will be considered stop words
}

### Complete preprocessing function

Complete preprocessing function, including:

- Lowering the text;
- Removing video html part from reviews including a video;
- Removing URLs;
- Substituting emoticons with associated sentiment;
- Translating slang and acronyms;
- Expanding contracted forms;
- Deleting non-alphanumeric characters (excluding the ones useful for tokenization);
- Deleting terms consisting of digits only.

Stop words removal and spelling correction are done after tokenization.

In [37]:
def preprocess(sent, translate_slang = True):
    
    sent = sent.lower()
    sent = re.sub(r'^<div id="video.*>&nbsp;', '', sent) # Video-review part
    sent = re.sub('https?://[A-Za-z0-9./]+', '', sent) # URLs
    
    for k in emoticon_repl:
        sent = re.sub(k, emoticon_repl[k], sent)

    if translate_slang:
        for k in slang_repl:
            sent = re.sub(r"\b"+re.escape(k)+r"\b", slang_repl[k], sent)
            # sent = sent.replace(k, slang_repl[k])
        
    for k in contracted_repl:
        sent = re.sub(k, contracted_repl[k], sent)
    
    sent = re.sub('[/]+', ' ', sent) # word1/word2 to word1 word2
    # Remove non-alphanumeric characters (but not - and _, might be useful for tokenization)
    sent = re.sub('[^A-Za-z0-9-_ ]+', '', sent)

    # Remove words that are digits only
    sent = re.sub('\b\d+\b', '', sent)
    
    return sent

Example:

In [38]:
X[1272]

'A really neat and hard to find game that is different from the usual'

In [39]:
preprocess(X[1272])

'a really neat and hard to find game that is different from the usual'

In [40]:
# X[1697]
X[27]

"Crashed in Vista.  Codemasters told me they don't support it in Windows 8.  Couldn't get it to work even after looking on the Internet."

In [41]:
preprocess(X[27])

'crashed in vista  codemasters told me they do not support it in windows 8  could not get it to work even after looking on the internet'

In [42]:
%%time
df["reviewTextPreprocessed"] = df["reviewText"].apply(preprocess)

Wall time: 16min 1s


In [43]:
df["reviewText"].loc[1272]

1272                           Horrible...game was broken
1272                                               Works!
1272    This is our favorite flavor of all the differe...
Name: reviewText, dtype: object

In [44]:
df["reviewTextPreprocessed"].loc[1272]

1272                              horriblegame was broken
1272                                                works
1272    this is our favorite flavor of all the differe...
Name: reviewTextPreprocessed, dtype: object

Delete rows where reviews are empty after preprocessing them:

In [45]:
df = df[~df.reviewTextPreprocessed.str.contains(r"^\s*$")]

98 reviews are empty after preprocessing:

In [46]:
df.shape

(3941930, 13)

In [47]:
len(X)-len(df)

98

Save cleaned dataset:

In [50]:
df[["reviewTextPreprocessed", "overall", "verified"]].to_csv("dataset/preprocessed_dataset.csv",
                                                             index = False)