# Data Analysis

In [99]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import re
import sklearn

In [48]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook.
# NOTE(review): presumably needed because columns are later added to
# `negative_reviews`, which is a filtered selection of `reviews` - confirm.
pd.options.mode.chained_assignment = None

In [49]:
# Path to the review dataset, relative to the notebook's working directory.
# It is read with pd.read_json(..., lines=True), i.e. one JSON record per line.
REVIEWS = './data/cellphone_reviews.json'

## Utils

In [79]:
def flatten(lol):
    """
    Concatenates the items of every inner iterable of `lol` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for inner in lol:
        flat.extend(inner)
    return flat

## Pre-processing

In [50]:
# Load the line-delimited JSON dump and derive the working columns in one chain.
# `helpful` arrives as a two-element sequence per review
# (presumably [helpful votes, total votes] - TODO confirm against the dataset docs).
reviews = (
    pd.read_json(REVIEWS, lines=True)
    .assign(
        # Second element minus first; evaluated before `helpful` is overwritten.
        unhelpful=lambda df: df['helpful'].apply(lambda votes: votes[1] - votes[0]),
        # Keep only the first element of the pair.
        helpful=lambda df: df['helpful'].apply(lambda votes: votes[0]),
        # Lower-case the review body so later token comparisons are case-insensitive.
        reviewText=lambda df: df['reviewText'].str.lower(),
    )
    # Remove rows that are identical across every column.
    .drop_duplicates()
)
reviews

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,unhelpful
0,120401325X,0,4,they look good and stick good! i just don't li...,"05 21, 2014",A30TL5EWN6DFXT,christina,Looks Good,1400630400,0
1,120401325X,0,5,these stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,emily l.,Really great product.,1389657600,0
2,120401325X,0,5,these are awesome and make my phone look so st...,"06 26, 2014",A2TMXE2AFO7ONB,Erica,LOVE LOVE LOVE,1403740800,0
3,120401325X,4,4,item arrived in great time and was in perfect ...,"10 21, 2013",AWJ0WZQYMYFQ4,JM,Cute!,1382313600,0
4,120401325X,2,5,"awesome! stays on, and looks great. can be use...","02 3, 2013",ATX7CZYFXI1KW,patrice m rogoza,leopard home button sticker for iphone 4s,1359849600,1
5,120401325X,1,3,these make using the home button easy. my daug...,"10 12, 2013",APX47D16JOP7H,RLH,Cute,1381536000,1
6,120401325X,0,5,came just as described.. it doesn't come unstu...,"08 22, 2013",A1JVVYYO7G56DS,Tyler Evans,best thing ever..,1377129600,0
7,3998899561,1,1,it worked for the first week then it only char...,"11 21, 2013",A6FGO4TBZ3QFZ,Abdullah Albyati,not a good Idea,1384992000,1
8,3998899561,2,5,"good case, solid build. protects phone all aro...","09 25, 2013",A2JWEDW5FSVB0F,Adam,Solid Case,1380067200,1
9,3998899561,1,5,this is a fantastic case. very stylish and pro...,"04 3, 2014",A8AJS1DW7L3JJ,Agata Majchrzak,Perfect Case,1396483200,0


## Filtering for negative reviews

In [51]:
# Keep only the reviews whose `overall` rating is 2 or lower.
# `.copy()` makes the result an independent frame, so the columns added to
# `negative_reviews` later in the notebook are never writes to a slice of
# `reviews` (the situation pandas' SettingWithCopyWarning is about).
negative_reviews = reviews.loc[reviews['overall'] <= 2].copy()
negative_reviews

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,unhelpful
7,3998899561,1,1,it worked for the first week then it only char...,"11 21, 2013",A6FGO4TBZ3QFZ,Abdullah Albyati,not a good Idea,1384992000,1
19,6073894996,0,1,it worked great for the first couple of weeks ...,"05 29, 2013",A2INSXDTE08WSJ,Barbie,Horrible,1369785600,0
26,6073894996,0,2,i am disappointed that the 1a didn't work with...,"06 23, 2014",A2YODGM5RUZ7I1,DCGeek1,This doesn't work with my iPad.,1403481600,0
28,6073894996,0,2,after a week only one side works,"07 3, 2014",A2U5NF3IH4YVKH,ebony winslow,Two Stars,1404345600,0
32,6073894996,0,1,only works one side at a time. when you connec...,"04 29, 2014",ASRM2FSLDPXVX,jose,"don't waste your money, pay more and buy one a...",1398729600,0
35,6073894996,0,2,didn't last very long. worked great when it wo...,"08 23, 2013",A30A04FS5YAES3,Levi Bowen,Works for a while.,1377216000,0
40,6073894996,0,2,i bought this so that i could use and charge m...,"05 5, 2012",A9VL5ER8D0K2Z,Owner,not high power,1336176000,0
44,6073894996,1,2,i could only give this usb car charger 2 stars...,"06 25, 2012",A5A4374R8I7NB,PWB Esq,"Works Fine, But It Died On ME",1340582400,0
46,6073894996,0,1,i bought 2 of this and tried to test first ......,"06 22, 2014",A25TQLMIAPI1T6,Ronaldo,Be careful,1403395200,0
63,7887421268,0,2,i loved the case when i first received it but ...,"08 6, 2012",A1PMGOX24BWDAZ,0902virgo,Loved it at first,1344211200,0


## Feature extraction

### Building n-gram models

This approach was abandoned: with these settings, `gensim`'s `Phrases` model detects very few phrases, so the output is still mostly unigrams.

In [95]:
# Raw (already lower-cased) text of every negative review.
all_reviews = negative_reviews['reviewText'].values
# Sentence-split each review and pool all sentences into one flat list.
all_sentences = [
    sentence
    for review_text in all_reviews
    for sentence in sent_tokenize(review_text)
]
# Tokenise each sentence into words.
all_words = list(map(word_tokenize, all_sentences))
# Discard tokens shorter than three characters.
all_valid_words = [
    [token for token in tokens if len(token) > 2]
    for tokens in all_words
]
# Fit the phrase detector on the filtered sentences, then wrap it in the
# Phraser object that is applied to sentences in the next cell.
bigrams = Phrases(all_valid_words, min_count=3, threshold=20)
bigram_mdl = Phraser(bigrams)

In [96]:
# Apply the phrase model to the first 100 filtered sentences and print each
# result, to eyeball which neighbouring tokens were merged (a detected phrase
# shows up as a single token joined by '_', e.g. 'waste_money').
for avw in all_valid_words[:100]:
    print(bigram_mdl[avw])

['worked', 'for', 'the', 'first', 'week', 'then', 'only', 'charge', 'phone']
['waste_money']
['worked', 'great', 'for', 'the', 'first', 'couple_weeks', 'then', 'just', 'stopped', 'completely..', 'basically', 'small', 'waste_money']
['disappointed', 'that', 'the', 'did', "n't", 'work', 'with', 'ipad']
['that', 'what', 'get', 'for', 'buying', 'cheap', 'adapter']
['after', 'week', 'only', 'one', 'side', 'works']
['only', 'works', 'one', 'side', 'time']
['when', 'you', 'connect', 'two', 'cables', 'one', 'side', 'stop_working', 'and', 'also', 'overheated', 'burning', 'the', 'fuses']
['purchased', 'two', 'them', 'and', 'the', 'same', 'problem']
['cheap', 'and', 'bad', 'quality']
['did', "n't", 'last', 'very', 'long']
['worked', 'great', 'when', 'worked', 'but', 'cheap', 'piece', 'plastic', 'crap', 'should', "n't", 'have', 'expected', 'last']
['bought', 'this', 'that', 'could', 'use', 'and', 'charge', 'tab', 'the', 'same', 'time']
['tab', 'does', 'not', 'recognize', 'the', 'high', 'power', 'p

### N-gram generation

Keep "no" and "not" by removing them from the stop-word set, so that negation (and with it the intent of the review) survives stop-word filtering.

In [108]:
# NLTK's English stop words, minus the negations "no" and "not", which are
# kept as ordinary tokens.
STOP_WORDS = set(stopwords.words('english')).difference({'no', 'not'})

def ngram(sent, n):
    """
    Builds the word n-grams of a single sentence.

    The sentence is tokenised with `word_tokenize`. Tokens of one or two
    characters and tokens found in `STOP_WORDS` are dropped before the
    n-grams are formed, so an n-gram may join words that were not adjacent
    in the original text.

    Returns a list of strings, each made of `n` consecutive surviving tokens
    joined by single spaces; the list is empty when fewer than `n` tokens
    survive the filter.
    """
    kept = []
    for token in word_tokenize(sent):
        # Skip very short tokens and stop words.
        if len(token) <= 2 or token in STOP_WORDS:
            continue
        kept.append(token)
    # Views of the token list shifted by 0..n-1; zipping them yields every
    # window of n consecutive tokens and stops at the shortest view.
    shifted = (kept[offset:] for offset in range(n))
    return [' '.join(window) for window in zip(*shifted)]

In [None]:
# Sentence-split every review first; n-grams are built per sentence, so no
# n-gram straddles a sentence boundary.
negative_reviews.loc[:, 'reviewSents'] = negative_reviews['reviewText'].apply(sent_tokenize)

# One column per n-gram order, each holding the flat list of that review's n-grams.
for ngram_column, ngram_size in (('unigrams', 1), ('bigrams', 2), ('trigrams', 3)):
    negative_reviews[ngram_column] = negative_reviews['reviewSents'].apply(
        lambda sents, size=ngram_size: flatten([ngram(sent, size) for sent in sents])
    )

# Adding the list-valued columns concatenates the lists row by row:
# unigrams first, then bigrams, then trigrams.
negative_reviews['ngrams'] = (
    negative_reviews['unigrams']
    + negative_reviews['bigrams']
    + negative_reviews['trigrams']
)
negative_reviews