In [None]:
'''
ML-based Approaches

    Workflow:
        1. Split corpus (training/test datasets)
        2. Define the vocabulary
        3. Extract features
        4. Train classifier
        5. Classify test data

    Classifiers:
        Naive Bayes
            -Supervised classifier
            -A Priori Probabilities
                -Can be derived by reason alone, before anything specific about data is known
                -Ex: There are more positive movie reviews than negative movie reviews in a dataset
            -Conditional probabilities
                -Ex 1: The word 'worst' appears in two negative movie reviews and 0 times in positive movie reviews
                -Ex 2: The word 'up' appears in two positive movie reviews and 0 times in negative movie reviews
            -How it works:
                -Uses Bayes Theorem to calculate if sample is more likely to be positive or negative
                -Classify sample as whatever it is more likely to be
        SVM
            -Requires a lot of tuning compared to naive bayes
            -Performance might not justify using SVM for this application
            -How it works:
                -Creates a line or plane to separate data
                -Each side of the line or plane represents a class

    Feature selection
        - Key to making an ML model work effectively
        -Feature examples:
            -Word tuples: good for naive bayes
            -Term frequency: good for SVM
            -Inverse document frequency: 
                -unusual words more important
                -commonly used with SVM
'''


In [1]:
# Load the movie review corpus: one file per sentiment class
positiveReviewsFileName = 'data/rt-polarity.pos'
negativeReviewsFileName = 'data/rt-polarity.neg'

# Each file is read into a list of raw lines (line endings are kept);
# later cells treat every line as one review.
with open(positiveReviewsFileName) as f:
    positiveReviews = list(f)

with open(negativeReviewsFileName) as f:
    negativeReviews = list(f)

In [2]:
# Split data into test and training sets
# Number of reviews per class that are held out for testing
testTrainingSplitIndex = 2500

# The first `testTrainingSplitIndex` reviews of each class form the test set
testPositiveReviews = positiveReviews[:testTrainingSplitIndex]
testNegativeReviews = negativeReviews[:testTrainingSplitIndex]

# Everything from the split index onward is training data. The slice starts
# at testTrainingSplitIndex itself (not +1): slice ends are exclusive, so
# starting one later would silently drop the review at the split index from
# both sets.
trainPositiveReviews = positiveReviews[testTrainingSplitIndex:]
trainNegativeReviews = negativeReviews[testTrainingSplitIndex:]

In [3]:
# Define the vocabulary

# Flatten the training reviews of each class into one list of
# whitespace-separated tokens
positiveWordList = [token for reviewLine in trainPositiveReviews for token in reviewLine.split()]
negativeWordList = [token for reviewLine in trainNegativeReviews for token in reviewLine.split()]

# Every token seen in training, positive-class tokens first
allWordList = positiveWordList + negativeWordList

# The vocabulary is the collection of distinct tokens (order is arbitrary)
vocabulary = list(set(allWordList))

In [14]:
# Setup the training data

# Tag every tokenised training review with its class label
posTaggedTrainingReviews = [{'review': text.split(), 'label': 'positive'} for text in trainPositiveReviews]
negTaggedTrainingReviews = [{'review': text.split(), 'label': 'negative'} for text in trainNegativeReviews]

# Positive examples first, followed by the negative ones
fullTaggedTrainingData = posTaggedTrainingReviews + negTaggedTrainingReviews

# Reshape into (list of words, label) tuples for the feature-extraction step
trainingData = [(tagged['review'], tagged['label']) for tagged in fullTaggedTrainingData]

In [18]:
def extract_features(review):
    '''
    Converts a tokenised review into a boolean feature vector.

    There is one feature per word of the module-level `vocabulary`; its
    value is True when that word occurs in the review and False otherwise.

    param review: iterable of words making up the review
    return: dict mapping each vocabulary word to True/False
    '''
    # Build the set once so each membership test below is a hash lookup
    words_present = set(review)
    return {term: term in words_present for term in vocabulary}

In [22]:
# Train a classifier using extracted features and nltk
import nltk

# Converts training data into tuples of form: (feature vectors, label).
# extract_features is handed to nltk together with trainingData, the list of
# (list of words, label) tuples built in the training-data cell.
# NOTE(review): apply_features is expected to build the feature dicts lazily
# rather than materialising them all up front -- confirm against the nltk docs.
trainingFeatures = nltk.classify.apply_features(extract_features, trainingData)

# Train a Naive Bayes Classifier using the updated form of training data.
# The resulting trainedNBClassifier is what naiveBayesSentimentCalculator
# calls .classify() on.
trainedNBClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

In [23]:
def naiveBayesSentimentCalculator(review):
    '''
    Classifies a single raw review string with the trained Naive Bayes model.

    The review is split on whitespace, turned into a feature vector by
    extract_features, and passed to trainedNBClassifier.

    param review: review text (one string) to be classified
    return: the class chosen by the classifier
    '''
    featureVector = extract_features(review.split())
    return trainedNBClassifier.classify(featureVector)

def getTestReviewSentiments(naiveBayesSentimentCalculator):
    '''
    Classifies every review in the held-out test sets and converts the
    predicted labels to numbers (positive -> 1, negative -> 0).

    Reads the module-level test sets testPositiveReviews and
    testNegativeReviews.

    param naiveBayesSentimentCalculator: callable that takes one raw review
        string and returns either 'positive' or 'negative'
    return: dict with keys 'results-on-positive' and 'results-on-negative',
        each holding the list of 0/1 predictions for that test set
    raises KeyError: if the calculator returns a label other than
        'positive' or 'negative'
    '''
    # The test sets are defined with a lower-case leading 't'
    # (testPositiveReviews / testNegativeReviews); the capitalised names
    # used previously were never defined and raised NameError.
    testPosResults = [naiveBayesSentimentCalculator(review) for review in testPositiveReviews]
    testNegResults = [naiveBayesSentimentCalculator(review) for review in testNegativeReviews]

    labelToNum = {'positive': 1, 'negative': 0}

    # Map the string labels produced above to their numeric codes
    numericPosResults = [labelToNum[label] for label in testPosResults]
    numericNegResults = [labelToNum[label] for label in testNegResults]

    return {'results-on-positive': numericPosResults, 'results-on-negative': numericNegResults}