In [1]:
# imports
import json
import nltk
import spacy
import re
from nltk import word_tokenize
from nltk import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [2]:
# constants should we want that

# taken from wikipedia + stackexchange answer: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "I'd": "I had / I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I shall have / I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

In [3]:
# a bit of set up
lemmatization_model = spacy.load('en_core_web_sm')

In [10]:
# make list of entries
def makeListEntries(filename):
    data = [json.loads(line) for line in open(filename, 'r')]
    
    for entry in data:
        entry['review_body'] = entry['review_body'].lower()
        
        # taking out contractions
        for key in contractions:
            entry['review_body'] = re.sub(key, contractions[key], entry['review_body'])
        
        entry['tokenized'] = []
        
        # removing unnecessary punctuation
        tokens = lemmatization_model(entry['review_body'])
        entry['tokenized'] = [token.lemma_ for token in tokens if token.lemma_ not in {',', '.'}]
        print(entry['tokenized'])
    return data

In [5]:
# vectorize

# makes the list of words a string, adds that to a list
def makeListText(dataSet):
    resList = []
    for entry in dataSet:
        resList.append(" ".join(entry['tokenized']))
    return resList

# deal with target (the stars) as well
def makeListStars(dataSet):
    resList = []
    for entry in dataSet:
        resList.append(entry['stars'])
    return resList

In [6]:
### vader thresholds for scaling (constants)
"""
vader brackets for scaling!

1: [q0, q1)
2: [q1, q2)
3: [q3, q4)
4: [q4, q5)
5: [q5, q6]
"""
q0 = -1
q1 = -0.6
q2 = -0.2
q3 = 0.2
q4 = 0.6
q5 = 1


In [7]:
def doAll(trainFileName, testFileName):
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)
    """*************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)

    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)
    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words = 'english')

    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)
    
    # could do TfidfVectorizer
    tv = TfidfVectorizer(stop_words = 'english')

    trainTVMatr = cv.fit_transform(listTrainText)
    testTVMatr = cv.transform(listTestText)
    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class = 'multinomial', max_iter=1000)
    LR_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)

    # get accuracy score
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)

    # this is the bit with the tfidf vectorizer
    LR_TV_model = LogisticRegression(multi_class = 'multinomial', max_iter=1000)
    LR_TV_model.fit(trainTVMatr, listTrainStars)

    # get it to predict
    LR_TV_prediction = LR_TV_model.predict(testTVMatr)

    # get accuracy score
    LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))
    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)

    # get accuracy score
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)

    # this is the bit with the tfidf vectorizer
    NB_TV_model = MultinomialNB()
    NB_TV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_TV_prediction = NB_TV_model.predict(testTVMatr)

    # get accuracy score
    NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
    print("Naive Bayes, CountVectorizer: " + str(NB_CV_score))
    print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score))
    """*************************************"""
    sid = SentimentIntensityAnalyzer()
    listOfRes = []

    data2 = [json.loads(line) for line in open(testFileName, 'r')]

    for entry in data2:
        listOfRes.append(sid.polarity_scores(entry['review_body'])['compound']) 

    numCorrect = 0

    scaledRes = []
    for i in range(len(listOfRes)):
        num = listOfRes[i]
        score = -1
        if num >= q0 and num < q1:
            score = 1
        elif num >= q1 and num < q2:
            score = 2
        elif num >= q2 and num < q3:
            score = 3
        elif num >= q3 and num < q4:
            score = 4
        elif num >= q4 and num <= q5:
            score = 5

        # add score back in
        scaledRes.append(score)
        if score == int(data2[i]['stars']):
            numCorrect += 1

    size = len(listOfRes)
    propCorrect = numCorrect/size

    print("Baseline proportion correct: " + str(propCorrect))


In [11]:
# run 'em all
doAll("dataset/smol_train.json", "dataset/smol_test.json")

['arrive', 'broken', 'manufacturer', 'defect', 'two', 'of', 'the', 'leg', 'of', 'the', 'base', 'be', 'not', 'completely', 'form', 'so', 'there', 'be', 'no', 'way', 'to', 'insert', 'the', 'caster', 'I', 'unpackage', 'the', 'entire', 'chair', 'and', 'hardware', 'before', 'notice', 'this', 'so', 'I', "'ll", 'spend', 'twice', 'the', 'amount', 'of', 'time', 'box', 'up', 'the', 'whole', 'useless', 'thing', 'and', 'send', 'it', 'back', 'with', 'a', '1', '-', 'star', 'review', 'of', 'part', 'of', 'a', 'chair', 'I', 'never', 'get', 'to', 'sit', 'in', 'I', 'will', 'go', 'so', 'far', 'as', 'to', 'include', 'a', 'picture', 'of', 'what', 'their', 'injection', 'molding', 'and', 'quality', 'assurance', 'process', 'miss', 'though', 'I', 'will', 'be', 'hesitant', 'to', 'buy', 'again', 'it', 'make', 'I', 'wonder', 'if', 'there', 'be', 'not', '/', 'be', 'not', 'miss', 'structure', 'and', 'support', 'that', 'do', 'not', 'impede', 'the', 'assembly', 'process']
['the', 'cabinet', 'dot', 'be', 'all', 'detach

['only', 'work', 'for', 'about', 'a', 'month', 'and', 'now', 'dead', '?']
['absolutely', 'can', 'not', 'get', 'lotion', 'out', 'of', 'bottle', '...', 'despite', 'store', 'it', 'upside', 'down', '!', 'I', 'will', 'have', 'to', 'throw', 'out.f']
['it', 'typically', 'work', 'for', 'a', 'few', 'turn', 'and', 'then', 'nothing', 'come', 'out', 'after', 'that', 'I', 'would', 'shake', 'it', 'to', 'try', 'to', 'get', 'peppercorn', 'to', 'feed', 'but', 'that', 'never', 'really', 'help', 'much', 'only', 'empty', 'it', 'clear', 'the', 'neck', 'and', 'reload', 'it', 'would', 'get', 'it', 'work', 'again', 'for', 'a', 'few', 'more', 'turn', 'seem', 'like', 'the', 'neck', 'be', 'too', 'narrow', 'and', 'gets', 'clog', 'inside', 'I', 'finally', 'toss', 'it', 'out', 'because', 'even', 'the', 'disposable', 'mccormick', 'pepper', 'grinder', 'work', 'consistently', 'without', 'all', 'the', 'fuss']
['this', 'be', 'a', 'birthday', 'present', 'when', 'she', 'open', 'the', 'box', 'the', 'glass', 'be', 'break', 

['it', 'come', 'with', 'a', 'break', 'clip', 'to', 'hold', 'the', 'leg', '....', 'I', 'have', 'to', 'use', 'a', 'knife', 'to', 'shave', 'the', 'plastic', 'on', 'the', 'leg', 'to', 'get', 'it', 'to', 'slide', 'to', 'lick', 'into', 'place', '....', 'not', 'to', 'happy']
['I', 'buy', '2', 'of', 'these', 'and', 'one', 'of', 'my', 'golden', 'retriever', 'chew', 'a', 'big', 'chunk', 'off', 'I', 'be', 'contact', 'after', 'leave', 'an', 'instagram', 'review', 'and', 'they', 'tell', 'I', 'there', 'be', 'fake', 'company', 'out', 'there', 'make', 'the', 'product', 'with', 'silicone', 'instead', 'of', 'rubber', 'my', 'dog', 'be', 'not', 'a', 'big', 'chewer', 'and', 'easily', 'chew', 'a', 'piece', 'off', '...', 'thankfully', 'spit', 'it', 'out', 'before', 'he', 'swallow', 'any', '...', 'so', 'I', 'be', 'out', '26.00', 'because', 'they', 'both', 'go', 'in', 'the', 'garbage', '....', 'dog', 'owner', 'please', 'be', 'careful']
['the', 'first', 'time', 'be', 'very', 'good', 'very', 'handy', 'but', 'mor

['the', 'product', 'work', 'for', 'maybe', 'five', 'minute', 'then', 'it', 'stop', 'and', 'my', 'iphone', 'say', 'be', 'not', 'compatible', 'with', 'iphone']
['phone', 'do', 'not', 'work', 'when', 'plug', 'in', 'to', 'an', 'active', 'land', 'line', '-', 'no', 'dial', 'tone', 'seller', 'ask', 'for', 'picture']
['I', 'buy', 'these', 'bottle', 'with', 'the', 'intention', 'of', 'mix', 'essential', 'oil', 'in', 'they', 'the', 'spray', 'hardly', 'work', 'and', 'they', 'leak', 'frequently', 'would', 'not', 'recommend', 'for', 'use', 'with', 'essential', 'oil']
['do', 'not', 'work', 'the', 'heating', 'element', 'be', 'too', 'small', 'we', 'have', 'well', 'luck', 'with', 'a', 'faux', '-', 'fur', 'heated', 'blanket']
['love', 'how', 'this', 'phone', 'case', 'look', 'and', 'feel', 'but', 'it', '’', 'crap', '!', 'do', 'n’t', 'buy', 'it', '!', 'it', 'fall', 'apart', 'all', 'the', 'time', 'after', 'time', 'it', 'stretch', 'and', 'make', 'it', 'loose', 'on', 'the', 'phone']
['leak', 'around', 'the', 

['it', 'be', 'not', 'leather', 'it', 'have', '/', 'it', 'be', 'just', 'plastic', 'and', 'have', 'a', 'weird', 'smell', 'too', '!', 'I', 'be', 'easily', 'able', 'to', 'scratch', 'off', 'the', '"', 'leather', '"', 'off', 'of', 'it', 'with', 'my', 'fingernail', '!']
['three', 'of', 'they', 'do', 'not', 'work']
['the', 'fine', 'sand', 'stick', 'to', 'my', 'cat', "'s", 'paw', 'and', 'she', 'will', 'trace', 'it', 'all', 'over', 'the', 'floor', 'that', 'cause', 'dust', 'and', 'when', 'it', 'get', 'wet', 'even', 'bad', '!', 'it', 'start', 'smell', 'funky', 'as', 'soon', 'as', 'I', 'replace', 'with', 'the', 'new', 'sand', '(', 'it', 'be', 'more', 'a', 'sand', 'smell', 'not', 'my', 'cat', "'s", ')', 'I', 'still', 'get', 'the', 'regular', 'double', 'duty', 'one']
['we', 'do', 'n’t', 'realize', 'this', 'product', 'be', 'for', 'an', 'aquarium', 'probably', 'more', 'our', 'mistake']
['way', 'too', 'tight', 'around', 'the', 'hip', 'and', 'too', 'long', 'around', 'your', 'lady', 'bit']
['after', 'less

['the', 'bluetooth', 'reception', 'be', 'rather', 'good', 'but', 'there', 'be', 'a', 'very', 'loud', 'and', 'constant', 'static', 'when', 'pair', 'that', 'be', 'not', 'there', 'with', 'a', 'wire', 'connection', 'I', 'try', 'clean', 'the', 'aux', 'output', 'and', 'then', 'reset', 'the', 'device', 'which', 'make', 'it', 'unusable', 'altogether', 'since', 'it', 'be', 'stick', 'blink', 'red', 'and', 'blue', 'light', 'I', 'initially', 'get', 'this', 'because', 'I', 'be', 'impressed', 'with', 'the', 'different', 'capability', 'of', 'the', 'device', 'but', 'after', 'use', 'it', 'I', 'be', 'unlikely', 'to', 'purchase', 'any', 'product', 'from', 'the', 'manufacturer', 'unless', 'I', 'be', 'in', 'the', 'market', 'for', 'something', 'that', 'flash', 'red', 'and', 'blue', 'light', 'on', 'a', 'five', 'hour', 'battery']
['wrinkle', 'all', 'over', 'would', 'have', 'be', 'good', 'if', 'firm']
['really', 'do', 'not', 'work', 'to', 'deter', 'not', 'enthusiastic', 'wood', 'pecker']
['the', 'size', 'chart

['reason', 'why', 'I', '’m', 'give', '3', 'star', 'be', 'because', 'the', 'hat', 'be', 'a', 'bit', 'uncomfortable', 'since', 'the', 'bow', 'be', 'a', 'bit', 'bulky', 'in', 'the', 'inner', 'part', '...', 'but', 'they', 'look', 'cute', 'overall', 'and', 'keep', 'she', 'warm', 'also', 'the', 'sewing', 'seem', 'like', 'it', '’', 'a', 'bit', 'defective', 'but', 'we', 'will', 'see', 'how', 'long', 'they', 'hold']
['only', 'one', 'of', 'the', 'lock', 'work']
['look', 'for', 'love', 'may', 'lead', 'to', 'healing']
['for', 'some', 'reason', 'a', 'signature', 'be', 'require', 'to', 'have', 'a', 'poster', 'deliver', '...', 'mind', '-', 'boggling', 'anyway', 'I', 'be', 'sure', 'it', 'have', '/', 'it', 'be', 'really', 'nice', 'and', 'exactly', 'what', 'I', 'want', 'maybe', 'I', "'ll", 'actually', 'see', 'it', 'early', 'next', 'week', '?']
['while', 'the', 'order', 'be', 'deliver', 'on', 'time', 'the', 'packaging', 'be', 'open', 'and', 'miss', 'all', 'eraser', 'the', 'only', 'reason', 'I', 'do', 'no

['I', 'buy', 'the', 'black', 'one', 'the', 'star', 'be', 'a', 'bit', 'more', 'fade', 'that', 'the', 'one', 'on', 'the', 'picture', 'they', 'be', 'ok']
['it', 'come', 'out', 'in', 'a', 'few', 'wash']
['I', 'can', 'see', 'why', 'the', 'price', 'be', 'inexpensive', 'I', 'receive', 'yesterday', 'expiration', 'date', 'be', 'for', 'sept', 'of', 'this', 'year', 'no', 'way', 'will', 'we', 'use', 'all', 'of', 'these', 'in', 'that', 'short', 'amount', 'of', 'time', 'very', 'deceptive', '!', '!', '!']
['too', 'bulky', 'and', 'there', 'be', 'a', 'gap', 'behind', 'my', 'infant', 'ear', 'I', '’m', 'sure', 'they', 'would', 'have', 'work', 'okay', 'but', 'I', 'want', 'they', 'to', 'work', 'great', '!', '!']
['not', 'as', 'long', 'as', 'show', 'on', 'picture']
['we', 'use', 'dog', 'poop', 'bag', 'for', 'our', 'kid', 'poop', 'diaper', 'help', 'control', 'a', 'smell', 'in', 'the', 'room', 'these', 'bag', 'work', 'well', 'however', 'these', 'bag', 'do', 'not', 'have', 'any', 'amount', 'of', 'lemon', 'scen

['the', 'ruler', 'and', 'radius', 'gauge', 'be', 'inaccurate', 'good', 'enough', 'for', 'a', 'beginner', 'look', 'to', 'learn', 'basic', 'guitar', 'maintenance']
['this', 'be', 'to', 'hard', 'to', 'hit', 'low', 'slow', 'game', 'be', 'no', 'fun', 'in', 'a', 'video', 'game']
['love', 'the', 'material', 'of', 'this', 'suit', 'the', 'bottom', 'fit', 'perfect', 'unfortunately', 'the', 'top', 'have', 'a', 'side', 'boob', 'issue', 'feel', 'like', 'if', 'I', 'move', 'just', 'right', 'I', 'would', 'pop', 'right', 'out', 'on', 'the', 'side', 'without', 'a', 'strap']
['very', 'cool', 'color', 'but', 'I', 'wish', 'it', 'be', 'package', 'with', 'more', 'care', 'the', 'bottle', 'be', 'bust', 'so', 'it', 'be', 'cover', 'in', 'green', 'dye', 'stain', 'my', 'hand']
['these', 'feel', 'sturdy', 'but', 'they', 'be', 'pretty', 'shallow', 'you', 'would', 'n’t', 'be', 'able', 'to', 'drain', 'a', 'lot', 'of', 'noodle', 'or', 'whatever', 'at', 'a', 'time']
['I', '’m', 'too', 'fat', '!', '!', 'but', 'this', 'dr

['holy', 'crap', 'this', 'will', 'go', 'such', 'a', 'long', 'way', 'be', 'aware', ':', 'this', 'black', 'be', 'a', 'blue', '-', 'base', '(', 'maybe', 'even', 'slightly', 'green', ')', 'and', 'do', 'not', 'end', 'up', 'match', 'the', 'true', 'black', 'leather', 'that', 'I', 'have']
['presumably', 'it', 'take', 'a', 'couple', 'of', 'week', 'for', 'the', 'skin', 'to', 'peel', 'off', 'after', 'you', 'use', 'this', 'exfoliating', 'mask', 'so', 'I', 'do', 'not', 'know', 'if', 'it', 'actually', 'work', 'it', 'do', 'however', 'smell', 'good', 'and', 'make', 'your', 'foot', 'soft']
['our', 'son', 'love', 'this', 'it', '’', 'much', 'small', 'than', 'I', 'expect', 'and', 'it', 'look', 'pretty', 'fragile', 'but', 'so', 'far', 'so', 'good']
['a', 'nice', 'pen', 'very', 'solid', 'combine', 'it', 'with', 'a', 'zebra', '401', 'and', 'fisher', 'space', 'pen', 'to', 'make', 'an', 'awesome', 'pen']
['be', 'look', 'to', 'make', 'my', 'shower', 'look', 'more', 'organized', 'perfect', 'size', 'and', 'color'

['the', 'light', 'be', 'very', 'helpful', 'for', 'dark', 'colored', 'yarns.very', 'comfortable', 'to', 'use']
['need', 'it', 'so', 'worth', 'it']
['this', 'be', 'a', 'great', 'discus', 'for', 'the', 'price']
['the', 'packaging', 'be', 'so', 'cute', 'and', 'they', 'send', 'a', 'free', 'gift', 'the', 'cover', 'be', 'very', 'soft', 'and', 'I', 'recommend', 'it']
['too', 'snug', 'need', 'to', 'get', 'an', '11', 'not', '10']
['great', 'for', 'closeup', 'I', 'put', 'my', 'makeup', 'on', 'well']
['this', 'be', 'a', 'gift', 'for', 'my', 'mom', 'she', 'love', 'they', 'and', 'would', 'recommend', 'these', 'house', 'shoe']
['my', 'favorite', 'vitamin', 'good', 'product']
['good', 'case', 'so', 'far', 'that', 'be', 'very', 'slim', 'to', 'the', 'phone', 'I', 'like', 'that', 'there', 'be', 'not', 'a', 'very', 'bulky', 'case', 'around', 'the', 'phone', 'to', 'hide', 'the', 'beauty', 'of', 'the', 'iphone', 'but', 'you', 'can', 'tell', 'there', 'be', 'protection', 'I', 'have', 'not', 'drop', 'the', 'ph

['it', 'be', 'fantastic', '!', 'I', 'buy', 'similar', 'light', 'from', 'other', 'website', 'and', 'they', 'have', 'poor', 'quality', 'although', 'this', 'one', 'be', 'more', 'expensive', 'than', 'other', 'I', 'think', 'it', 'be', 'worthy', 'it', 'be', 'really', 'a', 'good', 'decoration', 'for', 'dorm', '!', 'I', 'will', 'probably', 'buy', 'one', 'more', 'to', 'give', 'my', 'mother', 'as', 'a', 'gift', '!']
['price', 'very', 'well', 'I', 'trust', 'now', 'product']
['very', 'good', 'algae', 'remover', 'for', 'the', 'price']
['absolutely', 'one', 'of', 'the', 'good', 'hood', 'you', 'can', 'get', '!', '!', '!', 'I', 'be', 'completely', 'blow', 'away', 'by', 'the', 'clarity', 'and', 'field', 'of', 'view', 'provide', 'by', 'this', 'hood', '!', 'I', "'ve", 'try', 'other', 'and', 'this', 'one', 'be', 'unrivaled', 'be', 'use', 'it', 'two', 'month', 'now', 'and', 'that', 'automatic', 'feature', 'be', 'a', 'lifesaver', '!', 'if', 'you', 'have', 'the', 'money', 'definitely', 'get', 'this', 'hood',

['soft', '...', 'which', 'be', 'why', 'I', 'have', 'not', 'throw', 'it', 'away', '..', 'but', 'it', 'do', 'not', 'seem', 'durable', 'and', 'the', 'waist', 'line', 'be', 'uncomfortable', 'I', 'suggest', 'wear', 'these', 'around', 'the', 'house', ':)', 'I', 'have', 'not', 'trust', 'to', 'wear', 'they', 'outside']
['I', 'wish', 'there', 'be', '2.0', 'version', 'with', 'a', 'large', 'canister', 'this', 'only', 'hold', 'about', '12', 'oz', 'coffee', 'so', 'it', '’', 'it', 'really', 'enough', 'to', 'make', 'a', 'large', 'one', 'it', '’', 'only', 'good', 'for', 'one', 'use', 'at', 'a', 'time', 'before', 'it', 'need', 'refreshing', 'I', '’ve', 'try', 'to', 'fill', 'it', '2', 'time', 'in', 'a', 'row', 'and', 'the', 'second', 'pour', 'be', 'extremely', 'watery', 'because', 'the', 'inside', 'have', 'melt', 'and', 'pour', 'out', 'the', 'air', 'vent', 'on', 'the', 'lid']
['last', 'a', 'month', 'and', 'magnetic', 'clasp', 'wear', 'out', 'it', 'now', 'come', 'apart', 'and', 'fall', 'off', 'it', '’', 

['whirpool', 'washer', 'be', 'much', 'well', 'now', 'these', 'thing', 'be', 'a', 'little', 'tricky', 'to', 'install', '-', 'you', 'need', 'two', 'people']
['dog', 'love', 'it', 'and', 'I', 'be', 'confident', 'that', 'she', 'will', 'not', 'run', 'around', 'in', 'the', 'car']
['it', '’', 'a', 'great', 'book', 'talk', 'about', 'thing', 'we', 'take', 'for', 'grant', 'when', 'we', '’re', 'buy', 'a', 'house', 'or', 'get', 'a', 'new', 'job', 'it', 'give', 'great', 'advice', 'I', 'have', 'recommend', 'this', 'book', 'to', 'a', 'handful', 'of', 'people', 'it', '’', 'easy', 'to', 'read', 'short', 'and', 'they', 'go', 'straight', 'to', 'the', 'point', 'love', 'it', '!']
['so', 'far', 'so', 'good', 'they', 'fit', 'my', 'daughter', 'just', 'right', 'and', 'look', 'very', 'nice']
['very', 'handy', 'to', 'have', 'around']
['these', 'be', 'perfect', 'paper', 'coffee', 'cup', '!', 'I', 'love', 'the', 'sleek', 'white', 'design', '!', 'it', 'be', 'also', 'the', 'perfect', 'size', 'for', 'great', 'coffee'

In [9]:
# equalizing - 75-25 split

listSubFiles = [
    ["dataset/prodAnalysis/train_apparel.json", "dataset/prodAnalysis/test_apparel.json"],
    ["dataset/prodAnalysis/train_automotive.json", "dataset/prodAnalysis/test_automotive.json"],
    ["dataset/prodAnalysis/train_baby_product.json", "dataset/prodAnalysis/test_baby_product.json"],
    ["dataset/prodAnalysis/train_beauty.json", "dataset/prodAnalysis/test_beauty.json"],
    ["dataset/prodAnalysis/train_book.json", "dataset/prodAnalysis/test_book.json"],
    ["dataset/prodAnalysis/train_camera.json", "dataset/prodAnalysis/test_camera.json"],
    ["dataset/prodAnalysis/train_digital_ebook_purchase.json", "dataset/prodAnalysis/test_digital_ebook_purchase.json"],
    ["dataset/prodAnalysis/train_digital_video_download.json", "dataset/prodAnalysis/test_digital_video_download.json"],
    ["dataset/prodAnalysis/train_drugstore.json", "dataset/prodAnalysis/test_drugstore.json"],
    ["dataset/prodAnalysis/train_electronics.json", "dataset/prodAnalysis/test_electronics.json"],
    ["dataset/prodAnalysis/train_furniture.json", "dataset/prodAnalysis/test_furniture.json"],
    ["dataset/prodAnalysis/train_grocery.json", "dataset/prodAnalysis/test_grocery.json"],
    ["dataset/prodAnalysis/train_home.json", "dataset/prodAnalysis/test_home.json"],
    ["dataset/prodAnalysis/train_home_improvement.json", "dataset/prodAnalysis/test_home_improvement.json"],
    ["dataset/prodAnalysis/train_industrial_supplies.json", "dataset/prodAnalysis/test_industrial_supplies.json"],
    ["dataset/prodAnalysis/train_jewelry.json", "dataset/prodAnalysis/test_jewelry.json"],
    ["dataset/prodAnalysis/train_kitchen.json", "dataset/prodAnalysis/test_kitchen.json"],
    ["dataset/prodAnalysis/train_lawn_and_garden.json", "dataset/prodAnalysis/test_lawn_and_garden.json"],
    ["dataset/prodAnalysis/train_luggage.json", "dataset/prodAnalysis/test_luggage.json"],
    ["dataset/prodAnalysis/train_musical_instruments.json", "dataset/prodAnalysis/test_musical_instruments.json"],
    ["dataset/prodAnalysis/train_office_product.json", "dataset/prodAnalysis/test_office_product.json"],
    ["dataset/prodAnalysis/train_other.json", "dataset/prodAnalysis/test_other.json"],
    ["dataset/prodAnalysis/train_pc.json", "dataset/prodAnalysis/test_pc.json"],
    ["dataset/prodAnalysis/train_personal_care_appliances.json", "dataset/prodAnalysis/test_personal_care_appliances.json"],
    ["dataset/prodAnalysis/train_pet_products.json", "dataset/prodAnalysis/test_pet_products.json"],
    ["dataset/prodAnalysis/train_shoes.json", "dataset/prodAnalysis/test_shoes.json"],
    ["dataset/prodAnalysis/train_sports.json", "dataset/prodAnalysis/test_sports.json"],
    ["dataset/prodAnalysis/train_toy.json", "dataset/prodAnalysis/test_toy.json"],
    ["dataset/prodAnalysis/train_video_games.json", "dataset/prodAnalysis/test_video_games.json"],
    ["dataset/prodAnalysis/train_watch.json", "dataset/prodAnalysis/test_watch.json"],
    ["dataset/prodAnalysis/train_wireless.json", "dataset/prodAnalysis/test_wireless.json"]
]

"""
doAll(listSubFiles[0][0], listSubFiles[0][1])
doAll(listSubFiles[0][0], listSubFiles[0][1])
doAll(listSubFiles[1][0], listSubFiles[1][1])
doAll(listSubFiles[2][0], listSubFiles[2][1])
doAll(listSubFiles[3][0], listSubFiles[3][1])
doAll(listSubFiles[4][0], listSubFiles[4][1])
doAll(listSubFiles[5][0], listSubFiles[5][1])
doAll(listSubFiles[6][0], listSubFiles[6][1])
doAll(listSubFiles[7][0], listSubFiles[7][1])

doAll(listSubFiles[8][0], listSubFiles[8][1])
doAll(listSubFiles[9][0], listSubFiles[9][1])
doAll(listSubFiles[10][0], listSubFiles[10][1])
doAll(listSubFiles[11][0], listSubFiles[11][1])
doAll(listSubFiles[12][0], listSubFiles[12][1])
doAll(listSubFiles[13][0], listSubFiles[13][1])
doAll(listSubFiles[14][0], listSubFiles[14][1])
doAll(listSubFiles[15][0], listSubFiles[15][1])
doAll(listSubFiles[16][0], listSubFiles[16][1])

doAll(listSubFiles[17][0], listSubFiles[17][1])
doAll(listSubFiles[18][0], listSubFiles[18][1])
doAll(listSubFiles[19][0], listSubFiles[19][1])
doAll(listSubFiles[20][0], listSubFiles[20][1])
doAll(listSubFiles[21][0], listSubFiles[21][1])
doAll(listSubFiles[22][0], listSubFiles[22][1])
doAll(listSubFiles[23][0], listSubFiles[23][1])
doAll(listSubFiles[24][0], listSubFiles[24][1])
doAll(listSubFiles[25][0], listSubFiles[25][1])
doAll(listSubFiles[26][0], listSubFiles[26][1])

doAll(listSubFiles[27][0], listSubFiles[27][1])
doAll(listSubFiles[28][0], listSubFiles[28][1])
doAll(listSubFiles[29][0], listSubFiles[29][1])
doAll(listSubFiles[30][0], listSubFiles[30][1])
"""


Multiclass, logistic regression, CountVectorizer: 0.4340659340659341
Multiclass, logistic regression, TfidfVectorizer: 0.4340659340659341
Naive Bayes, CountVectorizer: 0.43526039178213094
Naive Bayes, TfidfVectorizer: 0.43526039178213094
Baseline proportion correct: 0.29479216435738176


'\nfor fn in listSubFiles:\n    print(fn)\n    doAll(fn[0], fn[1])\n    print()\n'