In [2]:
import json
import math
import string

import ijson
import pandas

In [9]:
class Dataset:
    '''
    Naive-Bayes style star-rating predictor built from unigram and bigram bag-of-words counts.

    Reviews are read one JSON line at a time from ReviewsLocation. Each record is expected to carry
    "stars", "text" and (for the Rank* methods) "review_id" fields. Star ratings are stored under the
    string keys '0'..'5'.
    '''

    # Star labels in ascending order; a label's position is its index in every probability list.
    _StarKeys = ('0', '1', '2', '3', '4', '5')

    def __init__(self):
        self.ReviewsLocation = "dataset/review.json"
        self.UnigramBagOfWords = {}
        self.BigramBagOfWords = {}
        # Stop word list from https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
        self.StopWords = {
            'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once',
            'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
            'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's',
            'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
            'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her',
            'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both',
            'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them',
            'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that',
            'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he',
            'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those',
            'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
            'doing', 'it', 'how', 'further', 'was', 'here', 'than',
        }
        self.Symbols = string.punctuation
        self.StarCount = self._EmptyStarCounts()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _EmptyStarCounts():
        '''
        Builds a fresh per-star counter with every rating set to zero
        Output:
            Dictionary keyed '5'..'0', all values 0
        '''
        return {'5': 0, '4': 0, '3': 0, '2': 0, '1': 0, '0': 0}

    def _Tokenize(self, review):
        '''
        Shared preprocessing: strips symbols, lower-cases, splits on whitespace and drops stop words
        Input:
            review: review in string format
        Output:
            List of cleaned words
        '''
        cleaned = self.RemoveSymbols(review).lower()
        # split() without an argument splits on any whitespace run, so newlines/tabs separate words
        # and consecutive spaces never produce empty tokens.
        return self.RemoveStopWords(cleaned.split())

    @staticmethod
    def _ReadField(row, name):
        '''
        Extracts and normalizes a single field from one review row
        Input:
            row: pandas Series holding one review record
            name: 'stars', 'text' or any other column name
        Output:
            Normalized string value
        '''
        value = row[name]
        if name == 'stars':
            # int() first so that both 5 and 5.0 map onto the key '5'
            return str(int(value))
        if name == 'text':
            return value.strip()
        return str(value)

    def _IterReviews(self, fields, count=None):
        '''
        Streams reviews from ReviewsLocation one record at a time
        Input:
            fields: Tuple of column names to extract from every record
            count: Maximum number of reviews to yield, None for no limit
        Output:
            Yields one tuple of normalized field values per review. Stops (with a printed notice)
            at the first record whose requested fields cannot be read.
        '''
        # Chunk size set to 1, reads in one line at a time
        reader = pandas.read_json(self.ReviewsLocation, lines=True, chunksize=1)
        try:
            for index, chunk in enumerate(reader):
                if count is not None and index >= count:
                    break
                try:
                    row = chunk.iloc[0]
                    record = tuple(self._ReadField(row, name) for name in fields)
                except (KeyError, IndexError, AttributeError, TypeError, ValueError) as error:
                    print('Stopping at review %d, could not read %s: %r' % (index, fields, error))
                    break
                yield record
        finally:
            reader.close()

    def _StarCountTotal(self):
        '''
        Returns the total number of counted reviews
        Output:
            StarCount['total'] when present and non-zero, otherwise the sum of the per-star counts
        '''
        total = self.StarCount.get('total')
        if not total:
            total = sum(self.StarCount.get(key, 0) for key in self._StarKeys)
        return total

    def _AddRelativeLogLikelihoods(self, logProbabilities, counts):
        '''
        Adds log(count / total) for every star to logProbabilities, in place. Stars with a zero count
        contribute log(1 / total).
        Input:
            logProbabilities: List of six running log scores, indexed by star
            counts: Dictionary keyed '0'..'5' with the occurrence counts of one word or word pair
        '''
        total = sum(counts[key] for key in self._StarKeys)
        if not total:
            # Nothing recorded for this entry: it would scale every star equally, so skip it.
            return
        for star, key in enumerate(self._StarKeys):
            logProbabilities[star] += math.log(float(counts[key] or 1) / float(total))

    def _PredictFromLogs(self, logProbabilities, weightedMean):
        '''
        Turns accumulated log scores into the final prediction
        Input:
            logProbabilities: List of six log scores, indexed by star
            weightedMean: If true returns the weighted mean, else the index of the best star
        Output:
            Star prediction
        '''
        peak = max(logProbabilities)
        if not weightedMean:
            return logProbabilities.index(peak)
        # Rescale so the best star has probability 1; ratios are unchanged and exp cannot overflow.
        return self.WeightedMean([math.exp(value - peak) for value in logProbabilities])

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------
    def RemoveStopWords(self, reviewWordList):
        '''
        Removes all stop words from a given review word list
        Input:
            reviewWordList: List of preprocessed words in review
        Output:
            cleanReviewList: List of preprocessed (whitespace-stripped) words without stopwords
        '''
        stripped = (word.strip() for word in reviewWordList)
        return [word for word in stripped if word not in self.StopWords]

    def RemoveSymbols(self, reviewString):
        '''
        Removes all symbols from a review string
        Input:
            reviewString: review in string format
        Output:
            reviewString: review without any symbols defined in self.Symbols (string.punctuation by default)
        '''
        for symbol in self.Symbols:
            reviewString = reviewString.replace(symbol, '')
        return reviewString

    def WeightedMean(self, probabilities):
        '''
        Calculates the weighted, normalized mean for a set of probabilities for a review rating
        Input:
            probabilities: List of six probabilities, indexed by star (0..5)
        Output:
            weightedMean: The expected star value. When the probabilities cannot be normalized
                          (they sum to zero, or fewer than six are given) the largest probability
                          is returned instead.
        '''
        rankings = (0, 1, 2, 3, 4, 5)
        total = sum(probabilities)
        if not total or len(probabilities) < len(rankings):
            return max(probabilities)
        return sum((float(value) / total) * rank for rank, value in zip(rankings, probabilities))

    # ------------------------------------------------------------------
    # Model building
    # ------------------------------------------------------------------
    def AddUnigramBagOfWords(self, word, stars):
        '''
        Adds a given word and star rating to the unigram bag of words model
        Input:
            word: Word to enter into dictionary
            stars: Star rating (string key '0'..'5') for the review corresponding to the word
        '''
        if word not in self.UnigramBagOfWords:
            self.UnigramBagOfWords[word] = self._EmptyStarCounts()
        self.UnigramBagOfWords[word][stars] += 1

    def AddBigramBagOfWords(self, word_one, word_two, stars):
        '''
        Adds a given set of two words and the corresponding star rating to the review to the bigram bag of words model
        Input:
            word_one: First word of set found in the review
            word_two: Second word of set found in the review
            stars: Star rating (string key '0'..'5') associated with the review where the words were gathered
        '''
        followers = self.BigramBagOfWords.setdefault(word_one, {})
        if word_two not in followers:
            followers[word_two] = self._EmptyStarCounts()
        followers[word_two][stars] += 1

    def GetUnigramReviewBagOfWords(self, count=None, save=False):
        '''
        Iterates through count number of reviews, parses the stars and text from dataset, preprocesses data,
        and adds in the word and star rating into the unigram
        Input:
            count: Number of reviews to add into the unigram model, None reads every review
            save: Initialized as false, saves the unigram into unigrambagofwords[count].json
                  (unigrambagofwords.json when count is None)
        '''
        for reviewStars, reviewText in self._IterReviews(('stars', 'text'), count):
            for word in self._Tokenize(reviewText):
                self.AddUnigramBagOfWords(word, reviewStars)

        # Save bagofwords dict
        if save:
            suffix = '' if count is None else str(count)
            with open('unigrambagofwords' + suffix + '.json', 'w') as fp:
                json.dump(self.UnigramBagOfWords, fp)

    def GetBigramReviewBagOfWords(self, save=False, count=None):
        '''
        Iterates through the reviews, parses the stars and text from dataset, preprocesses data,
        and adds in the word pairs and star rating into the bigram model
        Input:
            save: Initialized as false, saves the bigram into bigrambagofwords.json
            count: Number of reviews to read, None (default) reads every review
        '''
        for reviewStars, reviewText in self._IterReviews(('stars', 'text'), count):
            cleanText = self._Tokenize(reviewText)
            for word_one, word_two in zip(cleanText, cleanText[1:]):
                self.AddBigramBagOfWords(word_one, word_two, reviewStars)

        if save:
            with open('bigrambagofwords.json', 'w') as fp:
                json.dump(self.BigramBagOfWords, fp)

    def LoadBigram(self):
        '''
        Loads in a previously built and saved bigram model from bigrambagofwords.json
        '''
        with open('bigrambagofwords.json') as jsonfile:
            self.BigramBagOfWords = json.load(jsonfile)

    def LoadUnigram(self):
        '''
        Loads in a previously built and saved unigram model from unigrambagofwords.json
        '''
        with open('unigrambagofwords.json') as jsonfile:
            self.UnigramBagOfWords = json.load(jsonfile)

    def LoadUnigramCount(self, count):
        '''
        Loads in the unigram model from unigrambagofwords[count].json, where count is the number of reviews
        the data was generated from
        Input:
            count: Size of unigram model to load. Used as input into load file
        '''
        with open('unigrambagofwords' + str(count) + '.json') as jsonfile:
            self.UnigramBagOfWords = json.load(jsonfile)

    # ------------------------------------------------------------------
    # Scoring with the in-memory models
    # ------------------------------------------------------------------
    def UnigramReviewScore(self, review, weightedMean=False):
        '''
        Calculates the star rating of a review using the unigram model
        Input:
            review: Review to give star rating to
            weightedMean: Initialized as false, normalizes the star rating to give an average as opposed to
                            just a star rating
        Output:
            Star prediction
        '''
        starTotal = self._StarCountTotal()
        # Scores are accumulated as logs: the raw product underflows to 0.0 on long reviews.
        logProbabilities = [0.0] * len(self._StarKeys)
        for word in self._Tokenize(review):
            counts = self.UnigramBagOfWords.get(word)
            if counts is None:
                continue
            wordTotal = sum(counts[key] for key in self._StarKeys)
            # Without any star counts (CountStarRankings not run) normalize by the word's own total.
            fallback = starTotal or wordTotal
            if not fallback:
                continue
            for star, key in enumerate(self._StarKeys):
                occurrences = counts[key]
                if occurrences:
                    # Probability of the word showing up in a star review compared to the number of
                    # reviews with that star; the overall total is used when that number is unknown.
                    denominator = self.StarCount.get(key, 0) or fallback
                    likelihood = float(occurrences) / float(denominator)
                else:
                    # Word never seen with this star: 1 / total amount of review ratings
                    likelihood = 1.0 / float(fallback)
                logProbabilities[star] += math.log(likelihood)

        # Applies the weighted mean if boolean is true, else returns the highest probability for the star rating
        return self._PredictFromLogs(logProbabilities, weightedMean)

    def BigramReviewScore(self, review, weightedMean=False):
        '''
        Calculates the star rating of a review using the bigram model
        Input:
            review: Review to give star rating to
            weightedMean: Initialized as false, if true, calculates the weighted mean as opposed to a star rating
        Output:
            Star prediction
        '''
        cleanReview = self._Tokenize(review)
        logProbabilities = [0.0] * len(self._StarKeys)
        for word_one, word_two in zip(cleanReview, cleanReview[1:]):
            # Skips the pair if the first and second word combination does not show up in the bigram model
            counts = self.BigramBagOfWords.get(word_one, {}).get(word_two)
            if counts is None:
                continue
            # Probability of each star over the total amount of reviews for the word combination
            self._AddRelativeLogLikelihoods(logProbabilities, counts)

        return self._PredictFromLogs(logProbabilities, weightedMean)

    # ------------------------------------------------------------------
    # Scoring by streaming the saved models with ijson
    # ------------------------------------------------------------------
    def iJsonParser(self, word):
        '''
        Using iJson, iterates through the unigram model to find a given word and returns the star ratings for that word
        Input:
            word: Word to look for
        Output:
            returnDict: Dictionary containing the ratings for each star of the given word
                        (all zero when the word is not in unigrambagofwords.json)
        '''
        returnDict = {word: self._EmptyStarCounts()}
        # ijson reports nested values under the dotted prefix "<word>.<star>"
        wanted = dict((word + '.' + key, key) for key in self._StarKeys)
        remaining = len(wanted)

        with open('unigrambagofwords.json', 'rb') as jsonfile:
            for prefix, event, value in ijson.parse(jsonfile):
                starKey = wanted.get(prefix)
                if starKey is None:
                    continue
                returnDict[word][starKey] = value
                remaining -= 1
                if remaining == 0:
                    break
        return returnDict

    def iJsonParserTwo(self, word_one, word_two):
        '''
        Using iJson, iterates through the bigram model to find a given word pair and returns the star ratings for that pair
        Input:
            word_one: First word in bigram
            word_two: Second word in bigram
        Output:
            returnDict: Dictionary containing the ratings for each star of the given word pair
                        (all zero when the pair is not in bigrambagofwords.json)
        '''
        returnDict = {word_one: {word_two: self._EmptyStarCounts()}}
        # ijson reports nested values under the dotted prefix "<word_one>.<word_two>.<star>"
        wanted = dict((word_one + '.' + word_two + '.' + key, key) for key in self._StarKeys)
        remaining = len(wanted)

        with open('bigrambagofwords.json', 'rb') as jsonfile:
            for prefix, event, value in ijson.parse(jsonfile):
                starKey = wanted.get(prefix)
                if starKey is None:
                    continue
                returnDict[word_one][word_two][starKey] = value
                remaining -= 1
                if remaining == 0:
                    break
        return returnDict

    def UnigramReviewScoreiJson(self, review, weightedMean=False):
        '''
        Calculates the review score using the unigram model and the iterative iJson parser
        Input:
            review: review to rate
            weightedMean: Initialized to False, if true, calculates the weighted mean as opposed to a star rating
        Output:
            Prediction
        '''
        logProbabilities = [0.0] * len(self._StarKeys)
        for word in self._Tokenize(review):
            unigram = self.iJsonParser(word)
            # Words missing from the model come back with all-zero counts and are skipped by the helper
            self._AddRelativeLogLikelihoods(logProbabilities, unigram[word])

        return self._PredictFromLogs(logProbabilities, weightedMean)

    def BigramReviewScoreiJson(self, review, weightedMean=False):
        '''
        Calculates the review score using the bigram model and the iterative iJson parser
        Input:
            review: review to rate
            weightedMean: Initialized to False, if true, calculates the weighted mean as opposed to a star rating
        Output:
            Prediction
        '''
        cleanReview = self._Tokenize(review)
        logProbabilities = [0.0] * len(self._StarKeys)
        for word_one, word_two in zip(cleanReview, cleanReview[1:]):
            bigram = self.iJsonParserTwo(word_one, word_two)
            # Pairs missing from the model come back with all-zero counts and are skipped by the helper
            self._AddRelativeLogLikelihoods(logProbabilities, bigram[word_one][word_two])

        return self._PredictFromLogs(logProbabilities, weightedMean)

    # ------------------------------------------------------------------
    # Batch prediction
    # ------------------------------------------------------------------
    def RankUnigramReviews(self, save=False, count=None):
        '''
        Ranks a set of reviews (weighted mean) using the ijson unigram model
        Input:
            save: Initialized as False, saves unigram predictions to unigramPredictions.json
            count: Number of reviews to rank, None (default) ranks every review
        '''
        predictions = {}
        for reviewID, reviewText in self._IterReviews(('review_id', 'text'), count):
            predictions[reviewID] = self.UnigramReviewScoreiJson(reviewText, weightedMean=True)

        # Save predictions dict
        if save:
            with open('unigramPredictions.json', 'w') as fp:
                json.dump(predictions, fp)

    def RankUnigramReviewsInRam(self, save=False, count=50000):
        '''
        Ranks a set of reviews (weighted mean) using the unigram model already loaded into RAM
        Input:
            save: Initialized as False, saves unigram predictions to unigramPredictions.json
            count: Number of reviews to rank, defaults to 50000
        '''
        predictions = {}
        for reviewID, reviewText in self._IterReviews(('review_id', 'text'), count):
            predictions[reviewID] = self.UnigramReviewScore(reviewText, weightedMean=True)

        # Save predictions dict
        if save:
            with open('unigramPredictions.json', 'w') as fp:
                json.dump(predictions, fp)

    def RankBigramReviews(self, save=False, weightedMean=False, count=15000):
        '''
        Ranks a set of reviews using the ijson bigram model
        Input:
            save: Initialized as False, will save the predictions to bigramPredictions.json if True
            weightedMean: Initialized as False, will calculate the weighted mean prediction if true
            count: Number of reviews to rank, defaults to 15000
        '''
        predictions = {}
        for reviewID, reviewText in self._IterReviews(('review_id', 'text'), count):
            predictions[reviewID] = self.BigramReviewScoreiJson(reviewText, weightedMean=weightedMean)

        # Save predictions dict
        if save:
            with open('bigramPredictions.json', 'w') as fp:
                json.dump(predictions, fp)

    def CountStarRankings(self, count):
        '''
        Calculates the amount of reviews of each star up to a count number of reviews. Stores the result
        in StarCount (including a 'total' entry) and saves it to starCount[count].json.
        Input:
            count: Number of reviews to count ratings of
        '''
        # Start from zero so that repeated calls do not pile up on earlier counts
        self.StarCount = self._EmptyStarCounts()
        for (reviewStars,) in self._IterReviews(('stars',), count):
            self.StarCount[reviewStars] += 1

        # Total over the star keys only, never over a previous 'total'
        self.StarCount['total'] = sum(self.StarCount[key] for key in self._StarKeys)

        # Save starCount dict
        with open('starCount' + str(count) + '.json', 'w') as fp:
            json.dump(self.StarCount, fp)

In [4]:
# Create the helper with its defaults (reviews are read from "dataset/review.json").
dataset = Dataset()
# Earlier model-building / loading steps, left disabled:
#dataset.LoadUnigram()
#dataset.GetUnigramReviewBagOfWords(save=True)
#dataset.GetBigramReviewBagOfWords(save=True)

In [40]:
# Tally the star ratings of the first 50000 reviews into dataset.StarCount;
# the tally is also written to starCount50000.json.
dataset.CountStarRankings(50000)
# Alternative steps, left disabled:
#dataset.GetUnigramReviewBagOfWords(50000, save=True)
#dataset.RankBigramReviews(save=True, weightedMean = True)
#dataset.RankUnigramReviews(save=True)
#dataset.LoadUnigram()
#dataset.LoadBigram()

50000


In [41]:
# Load the unigram model stored in unigrambagofwords50000.json ...
dataset.LoadUnigramCount(50000)
# ... and score reviews with that in-memory model; save=True writes unigramPredictions.json.
dataset.RankUnigramReviewsInRam(save=True)

In [10]:
#dataset.RankBigramReviews(save=True, weightedMean = True)