Yay! Let's get started!

### Design decisions?
- Punctuation looks like it's important, so I think we should keep it? like `!!!!` could actually be indicative of something, as might `???`. Periods, maybe not so much, so I think I'm gonna remove those, commas too
- It might be better to split up contractions, so I thought that could be n i c e
- iffy on stemming and lemmatization. Vader doesn't like it, but we can try it?
    - lemmatization: spacy
- or, try two versions, one with and the other without, see which one performs better
- right, so googles pretrained word2vec exists? to use or not to use, and how?


In [1]:
# All our imports
import json
import nltk
import spacy
import re
from nltk import word_tokenize
from nltk import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# constants should we want that

contractions = {
    "isn't": "is not",
    "aren't": "are not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "you've": "you have",
    "you're": "you are",
    "y'all": "you all",
    "won't": "will not",
    "wouldn't": "would not",
    "I've": "I have",
    "I'm": "I am",
    "I'll": "I will",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "couldn't": "could not",
    "can't": "cannot",
    "it's": "it is",
}

In [3]:
# a bit of set up
lemmatization_model = spacy.load('en_core_web_sm')

# Data Prep

### Tokenizing and Normalizing

In [4]:
# making a class with review_id, review_body, stars, product id and category
class parsedEntry:
    def __init__(self, review_id, text, stars, product_id, product_category):
        self.review_id = review_id
        self.text = text
        self.stars = stars
        self.product_id = product_id
        self.product_category = product_category

    def printAll(self):
        print(self.review_id)
        print("\t TEXT: " + str(self.text))
        print("\t STARS: " + str(self.stars))
        print()

In [5]:
# making a list of parsedEntries
def makeListEntries(fileName):
    
    listEntries = []

    for line in open(fileName, 'r'):
        entry = json.loads(line)

        # GETTING THE PERTINENT BITS #
        review_id = entry['review_id']
        text = entry['review_body']
        stars = entry['stars']
        product_id = entry['product_id']
        product_category = entry['product_category']

        # DEAL WITH THE TEXT PORTION
        text = text.lower()
        listOfChars = []

        # don't see the point in commas
        for i, char in enumerate(text):
            if char != ',':
                listOfChars.append(char)

        nonPunct = "".join(listOfChars)

        # breaking up some contractions: globally replace key with value pair
        nonCont = nonPunct
        for key in contractions:
            nonCont = re.sub(key, contractions[key], nonCont)

        # getting rid of duplicate whitespace
        nonCont = re.sub(" +", " ", nonCont) 
        sentences = sent_tokenize(nonCont)

        tokenized_sentences = []
        for sentence in sentences:
            # makes a list type dealio
            tokens = lemmatization_model(sentence)

           # list just for this sentence
            sentenceList = []

            for token in tokens:
                sentenceList.append(token.lemma_)

            # no periods here
            if (sentenceList[-1] == "."):
                sentenceList.pop()

            # add sentenceList to total sentences
            tokenized_sentences.append(sentenceList)

        newEntry = parsedEntry(review_id, tokenized_sentences, stars, product_id, product_category)

        listEntries.append(newEntry)
    return listEntries
    

In [6]:
trainSet = makeListEntries("dataset/smol_train.json")
testSet = makeListEntries("dataset/smol_test.json")

In [7]:
for i in range(10):
    trainSet[i].printAll()
    testSet[i].printAll()

en_0964290
	 TEXT: [['arrive', 'broken'], ['manufacturer', 'defect'], ['two', 'of', 'the', 'leg', 'of', 'the', 'base', 'be', 'not', 'completely', 'form', 'so', 'there', 'be', 'no', 'way', 'to', 'insert', 'the', 'caster'], ['I', 'unpackage', 'the', 'entire', 'chair', 'and', 'hardware', 'before', 'notice', 'this'], ['so', 'I', "'ll", 'spend', 'twice', 'the', 'amount', 'of', 'time', 'box', 'up', 'the', 'whole', 'useless', 'thing', 'and', 'send', 'it', 'back', 'with', 'a', '1', '-', 'star', 'review', 'of', 'part', 'of', 'a', 'chair', 'I', 'never', 'get', 'to', 'sit', 'in'], ['I', 'will', 'go', 'so', 'far', 'as', 'to', 'include', 'a', 'picture', 'of', 'what', 'their', 'injection', 'molding', 'and', 'quality', 'assurance', 'process', 'miss', 'though'], ['I', 'will', 'be', 'hesitant', 'to', 'buy', 'again'], ['it', 'make', 'I', 'wonder', 'if', 'there', 'be', 'not', 'miss', 'structure', 'and', 'support', 'that', 'do', 'not', 'impede', 'the', 'assembly', 'process']]
	 STARS: 1

en_0199937
	 TEXT

### Vectorizing

In [8]:
# smush all the sentences together
def makeListText(dataSet):
    resList = []
    for entry in dataSet:
        tmp = ""
        for sentence in entry.text:
            for word in sentence:
                tmp += word
                tmp += " "
        resList.append(tmp)
    return resList

# deal with target (the stars) as well
def makeListStars(dataSet):
    resList = []
    for entry in dataSet:
        resList.append(entry.stars)
    return resList



# data
listTrainText = makeListText(trainSet)
listTestText = makeListText(testSet)

# target
listTrainStars = makeListStars(trainSet)
listTestStars = makeListStars(testSet)

#### CountVectorizer

In [9]:
# could do CountVectorizer
cv = CountVectorizer(stop_words = 'english')

trainCVMatr = cv.fit_transform(listTrainText)
testCVMatr = cv.transform(listTrainText)

#print(trainCVMatr)
#print(testCVMatr)

#### TdidfVectorizer

In [12]:
# could do TfidfVectorizer
tv = TfidfVectorizer()

trainTVMatr = cv.fit_transform(listTrainText)
testTVMatr = cv.transform(listTrainText)

#print(trainTVMatr)
#print(testTVMatr)