In [146]:
reset -fs

In [147]:
import os
from urllib.request import urlretrieve

In [148]:
url = "http://www.cs.cornell.edu/People/pabo/movie-review-data/"
filename = "review_polarity.tar.gz"

In [149]:
if not os.path.exists(filename):
    filename, _ = urlretrieve(url+filename, filename)

In [150]:
import tarfile

In [151]:
path = "./txt_sentoken/"

In [152]:
if not os.path.exists(path):
    with tarfile.open(filename, "r:gz") as tar:
        tar.extractall()

------

In [153]:
from sklearn.datasets import load_files

In [154]:
# Load data
sentiment = load_files(path, 
                       random_state=42)
sentiment.target_names

['neg', 'pos']

In [155]:
from sklearn.model_selection import train_test_split

In [156]:
# Create train/test split with labels
train_data, test_data, train_target, test_target = train_test_split(sentiment.data,
                                                                    sentiment.target,
                                                                    random_state=42)

In [157]:
from sklearn.feature_extraction.text import CountVectorizer

In [158]:
# Transform train data from a list of strings into a matrix of frequency counts
vectorizer_count = CountVectorizer()
vectorized_count_train_data = vectorizer_count.fit_transform(train_data)

In [159]:
from sklearn.naive_bayes import MultinomialNB

In [160]:
# Create an instance of the Naive Bayes class 
clf = MultinomialNB()
# Call fit method
clf.fit(vectorized_count_train_data, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

----

In [161]:
from sklearn.metrics import precision_score

In [162]:
precision = precision_score(y_true=test_target,
              y_pred=clf.predict(vectorizer_count.transform(test_data)))

print("The precision on the test data is {:.2%}".format(precision))

The precision on the test data is 85.65%


In [163]:
from sklearn.metrics import recall_score

In [164]:
recall = recall_score(y_true=test_target,
                      y_pred=clf.predict(vectorizer_count.transform(test_data)))

print("The precision on the test data is {:.2%}".format(recall))

The precision on the test data is 78.38%


In [165]:
from sklearn.metrics import f1_score

In [166]:
f1 = f1_score(y_true=test_target,
              y_pred=clf.predict(vectorizer_count.transform(test_data)))
print("The F1 on the test data is {:.2%}".format(f1))

The F1 on the test data is 81.85%


------

Textblob

In [167]:
reset -fs

In [168]:
path = "./txt_sentoken/"

Fit standard senitment model

In [169]:
import glob

from textblob import TextBlob

In [238]:
correct_neg_count = 0
for filename in glob.glob(path+"neg/"+"*.txt"):
    with open(filename) as f:
        if TextBlob(f.read()).sentiment.polarity < 0:
            correct_neg_count += 1
            
total_neg = len(glob.glob(path+"neg/"+"*.txt"))
acc_neg = correct_neg_count / total_neg
print(acc_neg)

0.229


In [239]:
correct_pos_count = 0
for filename in glob.glob(path+"pos/"+"*.txt"):
    with open(filename) as f:
        if TextBlob(f.read()).sentiment.polarity > 0:
            correct_pos_count += 1
            
total_pos = len(glob.glob(path+"pos/"+"*.txt"))
acc_pos = correct_pos_count / total_pos
print(acc_pos)

0.971


In [241]:
acc_overall = (correct_neg_count+correct_pos_count)/(total_neg+total_pos)
print(acc_overall)

0.6


----

Train TextBlob classifer

In [172]:
from textblob.classifiers import NaiveBayesClassifier

In [174]:
from sklearn.datasets import load_files

In [227]:
# Load data
sentiment = load_files(path, 
                       encoding='utf-8',
                       random_state=42)

['neg', 'pos']

In [228]:
from sklearn.model_selection import train_test_split

In [229]:
sentiment.target[0]

1

In [230]:
# Create train/test split with labels
train_data, test_data, train_target, test_target = train_test_split(sentiment.data,
                                                                    sentiment.target,
                                                                    random_state=42)

In [234]:
train_set = [(train_data[0], 'pos'),
            (train_data[1], 'neg')]

In [245]:
cl = NaiveBayesClassifier(zip(train_data, train_target))

In [246]:
cl.show_informative_features()

Most Informative Features


In [237]:
cl.accuracy(zip(test_data, test_target))

0.51800000000000002

-----
Negation Features
----

In [248]:
from functools import lru_cache as memoize

In [250]:
@memoize(maxsize=128)
def negate_sequence(text):
    from nltk.tokenize import word_tokenize
    negation = False
    delims = "?.,!:;"
    result = []
    words = word_tokenize(text)
    for word in words:
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)

        if word in ("not", "n't", "no"):
            negation = not negation
        
        if any(c in word for c in delims):
            negation = False

    return result

In [253]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [256]:
train_data, test_data, train_target, test_target = train_test_split(sentiment.data,
                                                                    sentiment.target, 
                                                                    random_state=42)
vectorizer_neg = CountVectorizer(tokenizer=negate_sequence,
                                binary=True)

mnb_neg = MultinomialNB()
mnb_neg.fit(vectorizer_neg.fit_transform(train_data), train_target)
print(mnb_neg.score(vectorizer_neg.transform(test_data), test_target))

0.818


<br>
<br> 
<br>

----