In [17]:
import nltk
import os
from string import punctuation
from nltk.corpus import stopwords
import math

In [18]:
import urllib.request

In [19]:
# Source archive of the sentence-polarity dataset and the local name it is saved under.
URL_PATH = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
DOWNLOADED_FILENAME = 'cornel_sent.tar.gz'
# Seed passed to the train/test split so the partition is repeatable.
random_state = 101

# Load the English stopword list. On an environment where the NLTK stopwords
# corpus has not been installed, the lookup raises LookupError; fetch the
# corpus once and retry instead of failing the whole notebook run.
try:
    stopword = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopword = stopwords.words('english')


In [20]:
def try_download(url_path, filename=None):
    """Download ``url_path`` to a local file unless that file already exists.

    This is a best-effort helper: a failed download is reported on stdout
    rather than raised, so the archive can still be placed next to the
    notebook by hand.

    Args:
        url_path: URL of the file to fetch.
        filename: Destination path. Defaults to the module-level
            ``DOWNLOADED_FILENAME``.

    Returns:
        None.
    """
    if filename is None:
        filename = DOWNLOADED_FILENAME

    # Nothing to do when a previous run (or the user) already provided the file.
    if os.path.exists(filename):
        return

    try:
        urllib.request.urlretrieve(url_path, filename)
    except Exception as e:
        # An interrupted transfer can leave a truncated file behind; remove it
        # so the existence check above does not skip the download next time.
        try:
            os.remove(filename)
        except OSError:
            pass  # nothing was written, or it cannot be removed
        print(f'Can not download the specified file!, URL = {url_path}')
        print('try to put file in the current directory of script manually!')
        # Only HTTPError carries a status code; for every other failure
        # (e.g. URLError, OSError) fall back to the exception itself.
        print('Error code: ', getattr(e, 'code', e))
    else:
        print('file downloaded correctly from specified URL:', url_path)
    
    

In [21]:
# Fetch the dataset archive unless it is already present locally.
try_download(URL_PATH)

In [22]:
import tarfile

In [23]:
def read_sentiment_file(filename=None):
    """Read the positive and negative review lines from the dataset archive.

    The two members are located by file name (``rt-polarity.pos`` and
    ``rt-polarity.neg``) rather than by their position in the archive, so the
    result does not depend on the order in which the tarball lists its entries.

    Args:
        filename: Path to the gzip-compressed tar archive. Defaults to the
            module-level ``DOWNLOADED_FILENAME``.

    Returns:
        A ``(pos_lines, neg_lines)`` tuple; each element is a list of raw
        ``bytes`` lines as returned by ``readlines()``.

    Raises:
        ValueError: If either expected member is missing from the archive.
    """
    if filename is None:
        filename = DOWNLOADED_FILENAME

    contents = {'rt-polarity.pos': None, 'rt-polarity.neg': None}
    with tarfile.open(filename, 'r:gz') as archive:
        for member in archive.getmembers():
            # Tar member names always use '/' as the separator.
            base_name = member.name.rsplit('/', 1)[-1]
            if base_name in contents and member.isfile():
                # Read while the archive is still open.
                contents[base_name] = archive.extractfile(member).readlines()

    missing = [name for name, lines in contents.items() if lines is None]
    if missing:
        raise ValueError(
            f'archive {filename!r} does not contain: {", ".join(missing)}')

    return contents['rt-polarity.pos'], contents['rt-polarity.neg']
        
        

In [24]:
# Load the raw review lines for both classes from the downloaded archive.
pos_reviews, neg_reviews = read_sentiment_file()

In [25]:
def preprocess(sentencelist):
    """Turn raw review lines into cleaned, space-joined token strings.

    Every item is decoded from bytes as latin-1, lower-cased, stripped of the
    characters in ``string.punctuation``, tokenised with NLTK and filtered
    against the module-level ``stopword`` list.

    Args:
        sentencelist: Iterable of ``bytes`` lines.

    Returns:
        A list of cleaned sentences, one per input line, in input order.
    """
    # Translation table that deletes every punctuation character.
    drop_punctuation = str.maketrans('', '', punctuation)

    def clean(raw_line):
        # The input is a bytes literal, so decode before any text processing.
        text = str(raw_line, 'latin-1').lower().translate(drop_punctuation)
        kept = filter(lambda token: token not in stopword,
                      nltk.word_tokenize(text))
        return ' '.join(kept)

    return [clean(raw_line) for raw_line in sentencelist]
        

In [26]:
def extract_vocab(sentencelist):
    """Collect the distinct whitespace-separated tokens across all sentences.

    Args:
        sentencelist: Iterable of strings.

    Returns:
        A list of unique tokens; the order is that of set iteration and
        therefore arbitrary.
    """
    return list({token
                 for sentence in sentencelist
                 for token in sentence.split()})

In [27]:
# Clean positives followed by negatives; the label list below relies on this order.
all_reviews = preprocess(pos_reviews + neg_reviews)
# The vocabulary is built from every review, i.e. before any train/test split.
vocab = extract_vocab(all_reviews)
# 1 = positive, 0 = negative, aligned by position with all_reviews.
all_labels = [1]*len(pos_reviews) + [0]*len(neg_reviews)

In [28]:
def extract_features(aSentence, vocabulary=None):
    """Map a sentence to log-scaled term-frequency features over a vocabulary.

    For every word ``v`` in the vocabulary the feature value is
    ``log2(count of v in the sentence + 1)``, so absent words get ``0.0``.

    Args:
        aSentence: Whitespace-separated sentence string.
        vocabulary: Iterable of words to emit features for. Defaults to the
            module-level ``vocab``.

    Returns:
        A dict mapping each vocabulary word to its feature value.
    """
    if vocabulary is None:
        vocabulary = vocab

    # Tally the sentence once instead of calling list.count() for every
    # vocabulary word, which rescans the sentence each time.
    counts = {}
    for word in aSentence.split():
        counts[word] = counts.get(word, 0) + 1

    return {v: math.log(counts.get(v, 0) + 1, 2) for v in vocabulary}

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Hold out 10% of the reviews for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(all_reviews, all_labels, test_size=0.1, 
                                                    random_state=random_state)

In [31]:
# Pair each training sentence with its label and let NLTK apply extract_features to the sentences.
features = nltk.classify.apply_features(feature_func=extract_features,toks=list(zip(X_train, y_train)))

In [32]:
# Training takes about 15 minutes.
classifier = nltk.NaiveBayesClassifier.train(features)

In [33]:
def predict(testsentences):
    """Classify each sentence with the trained module-level ``classifier``.

    Args:
        testsentences: Sequence of preprocessed sentence strings.

    Returns:
        A list of predicted labels, one per sentence, in input order.
    """
    return [classifier.classify(extract_features(text))
            for text in testsentences]

In [34]:
from sklearn.metrics import classification_report

In [35]:
# Score the classifier's predictions on the held-out test sentences.
print(classification_report(y_true=y_test, y_pred=predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77       525
           1       0.78      0.76      0.77       542

   micro avg       0.77      0.77      0.77      1067
   macro avg       0.77      0.77      0.77      1067
weighted avg       0.77      0.77      0.77      1067

