In [31]:
import math
import os

In [42]:
def read_file(file_path):
    lines = [line.strip() for line in open(file_path)]
    no_of_reviews = len(lines)
    words = []
    
    for l in lines:
        words.extend(l.split())
            
    return words, no_of_reviews

# Setting up bag of words

In [65]:
poswords, num_pos_reviews = read_file('hw2data/pos.train')
negwords, num_neg_reviews = read_file('hw2data/neg.train')

In [71]:
allwords = set(poswords+negwords)

In [44]:
total_reviews = num_pos_reviews + num_neg_reviews

In [72]:
pos_dict = {}
neg_dict = {}

for pos_word in poswords:
    if pos_word in pos_dict:
        pos_dict[pos_word]+=1
    else:
        pos_dict[pos_word]=0

for neg_word in negwords:
    if neg_word in neg_dict:
        neg_dict[neg_word]+=1
    else:
        neg_dict[neg_word]=0




In [73]:
for word in allwords:
    if word not in pos_dict:
        pos_dict[word]=0.5
    if word not in neg_dict:
        neg_dict[word]=0.5

In [74]:
for pos_word in pos_dict:
    pos_dict[pos_word]+=0.5

for neg_word in neg_dict:
    neg_dict[neg_word]+=0.5

In [75]:
len(pos_dict)

48622

In [76]:
len(neg_dict)

48622

In [99]:
log_prior = {}

log_prior['pos'] = math.log2(num_pos_reviews/total_reviews)
log_prior['neg'] = math.log2(num_neg_reviews/total_reviews)

# Naive Bayes implementation

In [103]:
def naive_bayes(review_words):
    p_rev_pos = likelihood(review_words,"pos")
    p_rev_neg = likelihood(review_words,"neg")
    
    total_pos_prob = p_rev_pos + log_prior['pos']
    total_neg_prob = p_rev_neg + log_prior['neg']
    
    class_prediction = 1 if total_pos_prob>total_neg_prob else 0
    
    return class_prediction

In [96]:
def likelihood(word_list, rev_class):
    log_likelihood = 0
    if rev_class == "pos":
        for word in word_list:
            log_likelihood += math.log2(pos_dict[word]/sum(pos_dict.values()))

    if rev_class == "neg":
        for word in word_list:
            log_likelihood += math.log2(neg_dict[word]/sum(neg_dict.values()))
    
    return log_likelihood

In [101]:
log_prior

{'pos': -0.9699680499708202, 'neg': -1.030670426391585}

# Evaluating our algorithm

In [105]:
pos_test_dir = '/Users/karanpraharaj/Desktop/Repos/CSCI6300-ML+Linguistics/Homeworks/HW2/hw2data/test/pos'
preds_for_pos_revs = {'pos':0, 'neg': 0}
for file in os.listdir(pos_test_dir):
    review = os.path.join(pos_test_dir, file)
    lines = [line.strip() for line in open(review)]
    words = []
    review_words =[]
    
    for l in lines:
        words.extend(l.split())
    for word in words:
        if word in allwords:
            review_words.append(word)
    
    preds_for_pos_revs['pos' if naive_bayes(review_words)==1 else 'neg']+=1
    

In [111]:
preds_for_pos_revs

{'pos': 80, 'neg': 20}

In [108]:
neg_test_dir = '/Users/karanpraharaj/Desktop/Repos/CSCI6300-ML+Linguistics/Homeworks/HW2/hw2data/test/neg'
preds_for_neg_revs = {'pos':0, 'neg': 0}
for file in os.listdir(neg_test_dir):
    review = os.path.join(neg_test_dir, file)
    lines = [line.strip() for line in open(review)]
    words = []
    review_words =[]
    
    for l in lines:
        words.extend(l.split())
    for word in words:
        if word in allwords:
            review_words.append(word)
    
    preds_for_neg_revs['pos' if naive_bayes(review_words)==1 else 'neg']+=1
    

In [110]:
preds_for_neg_revs

{'pos': 15, 'neg': 85}

In [119]:
total_accuracy = (preds_for_neg_revs['neg'] + preds_for_pos_revs['pos'])/(sum(preds_for_neg_revs.values()) + sum(preds_for_pos_revs.values()))*100

# Final accuracy on test reviews

In [120]:
total_accuracy

82.5

# Accuracy on rottentomatoes reviews (extra credit)
"fresh" reviews treated as positive. "rotten" reviews treated as negative.

In [121]:
print('it\'s')

it's


In [124]:
rotten_reviews = ['It’s all mildly amusing, mildly exciting and entirely routine, just as you might expect from a director such as the reliably faceless Ruben Fleischer (Venom), who manages to go from one hit to the next without ever showing any particular talent.',
                 'Uncharted is bland and disappointing without being outright disastrous.',
                 'Uncharted drops toward the wrong end of the chart every time Holland and Wahlberg engage in juvenile bickering, which alas is all too often.',
                 'Uncharted reeks of cynicism and presents an inept understanding of its source material.',
                 'The term aggressively mediocre applies to many video game adaptations, but we also do not need further exhibits for people to convince us that the National Treasure movies are good because others are lamer.',
                 'Now that another great franchise has failed to live up to the hype, theres even more evidence that what makes a movie succeed and what makes a video game succeed may be mutually exclusiveno matter how much DNA they seem to share.',
                 'Thus, the critical process is simplified because the film sucks both as an adaptation and on its own merits (or lack thereof).',
                 'Uncharted drops toward the wrong end of the chart every time Holland and Wahlberg engage in juvenile bickering, which alas is all too often.',
                 'So unfailingly mediocre that I suspect even fans of the game will come away disappointed.',
                 'Rather than find a way to surprise the audience, this is more of a film going through the motions and hoping the strength of the leads is enough to help it get by.',
                 'Makes me wonder why someone didnt find the gold before, since it\'s rooted in available history, and the two white men break into ancient buildings in broad daylight with no blowback. Wahlberg says "kid" more than a goat shepherd in the herd\'s nursery.',
                 'Uncharted is not so much unenjoyable as it is curiously empty.',
                 'If less is more, “Uncharted” must be a masterpiece. It’s bloodless, heartless, joyless, sexless and, with one exception, charmless.',
                 'An ungainly excrescence.',
                 'While Wahlberg and Holland have star power and on-screen chemistry, this is the kind of film that should have us on the edge of our seats instead of looking at the clock. Unfortunately Uncharted is uninteresting, untenable and unbelievable.',
                 'The opening scene is so poor with the CGI and shaky cam that it takes me out of this film, and I could not get back into it afterwards.',
                 'Thus, the critical process is simplified because the film sucks both as an adaptation and on its own merits (or lack thereof).',
                 'Uncharted drops toward the wrong end of the chart every time Holland and Wahlberg engage in juvenile bickering, which alas is all too often.',
                 'Wahlberg and Holland have zero chemistry. Plot full of nothing.',
                 'I don\'t hate this Uncharted movie, but if you go into this movie as an Uncharted fan, I feel like you\'re going to find it lacking.',
            ]

fresh_reviews = ['One of those movies that isnt particularly special in the macro but does all the little things right',
                 'Uncharted had a surprising amount of really cool, completely over-the-top action sequences that I found fitting and fully entertaining.',
                 'Moviegoers can forgive a lot if the film has some nice pacing, decent chemistry and a few laughs along the way, and Uncharted ticks all those boxes.',
                 'It\'s far from perfect, but still a good time at the movies with Tom Holland.',
                 'There is enough respect for the video games here to know they care about the fans. For general audiences who just wanted another Indiana Jones-esque treasure romp, youre in for a rollicking thrill ride with a cast of charmers led by Holland and Wahlberg',
                 '"Uncharted" isn\'t a terrible movie it\'s enjoyable enough with a few sprawling action sequences but the wannabe "National Treasure" is a generic action movie that never captures the full excitement of playing through the franchise.',
                 'This film was good clean fun with a little bit of history and sightseeing added to the mix. Holland and Wahlberg have good chemistry--not exactly buddies, but close to mentor and student. The crosses are both gold and double. Buckle up in 4DX if you can.',
                 'This an entertaining experience from start to finish with excellent chemistry between the cast, a number of surprises, and a delightful connection to its inspiration...',
                 'Yes, trust is its own treasure -- and their gold-digging partnership promises a sequel.',
                 'I\'s mostly in good (if not particularly clever) fun, entertaining enough, but with the nutritional value of a bag of Twizzlers.',
                 'Thanks to some nice chemistry between its stars and smart casting of its two more intriguing female roles, it is much better than the script from Rafe Lee Junkins, Art Marcum and Matt Holloway they are handed.',
                 'Uncharted is actually quite entertaining, despite essentially being Indiana Jones Lite. It\'s not attempting to break any new ground in the adventure genre, it\'s simply striving to be an enjoyable popcorn flick and it succeeds.',
                 'A generic, although fun, action movie. If they made another Uncharted movie, I would see it in a heartbeat.',
                 'After over a decade being in development hell, Uncharted movie amounts to a serviceable crowd pleaser. The movie doesnt really gets its mojo until the third act, despite having a ton of action scenes moving at breakneck speed.',
                 'Fleischer consistently displays strong imagination when it comes to the action set pieces, and once we get through the set-up, these come fast and furious.',
                 'Uncharted proved to be a great time at the movies, a film I had fun with every step of the way and one I cant wait to watch again.',
                 'Though theres the occasional fan-service moment, most of Uncharted succeeds as being its own product, something thats truly surprising when it comes to the realms of the video game feature.',
                 'Holland is the real draw here and he gives it all he\'s got, providing a charming, relentlessly likable performance that\'s easily the best thing about this film.',
                 'This movie worked - mainly due to how charismatic and talented Tom Holland is in the role of Nathan Drake. It\'s a fun watch with great action sequences. Worth checking out whether you\'re a fan of the game or not.',
                 'Apart from some missteps, Uncharted is a fun movie centered on a charismatic lead with enjoyable set pieces and engaging relationships.',
            ]

In [127]:
preds_for_rotten_revs = {'pos':0, 'neg': 0}

for review in rotten_reviews:
    words = review.split()
    review_words =[]
    
    for word in words:
        if word in allwords:
            review_words.append(word)
    
    preds_for_rotten_revs['pos' if naive_bayes(review_words)==1 else 'neg']+=1
    

In [128]:
preds_for_rotten_revs

{'pos': 5, 'neg': 15}

In [129]:
preds_for_fresh_revs = {'pos':0, 'neg': 0}

for review in fresh_reviews:
    words = review.split()
    review_words =[]
    
    for word in words:
        if word in allwords:
            review_words.append(word)
    
    preds_for_fresh_revs['pos' if naive_bayes(review_words)==1 else 'neg']+=1
    

In [130]:
preds_for_fresh_revs

{'pos': 14, 'neg': 6}

In [131]:
total_accuracy = (preds_for_rotten_revs['neg'] + preds_for_fresh_revs['pos'])/(sum(preds_for_rotten_revs.values()) + sum(preds_for_fresh_revs.values()))*100

In [132]:
total_accuracy

72.5

### Thus, the total accuracy on reviews from rottentomatoes.com isn't higher than the test accuracy on our test sets, but it is still a very respectable 72.5%.