# Sentiment Analysis

## Example 1

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('data/training.txt', sep='\t', names=['liked', 'txt'])
df.head()

Unnamed: 0,liked,txt
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [4]:
# TFIDF Vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [5]:
# In thise case our dependent variable will be liked as 0 (didn't like the movie) or 1 (liked the movie)
y = df.liked

In [6]:
# Convert df.txt from text to features
X = vectorizer.fit_transform(df.txt)

In [7]:
#6918 observations x 2011 unique words
print(y.shape)
print(X.shape)

(6918,)
(6918, 2011)


In [8]:
# Train Test Split as usual
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# We will train a naive_bayed classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

In [10]:
# We can test out model's accuracy like this:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.9979292333245913

In [11]:
movie_reviews_array = np.array(['Jupiter Ascending was a disapointing and terrible movie'])
movie_reviews_vector = vectorizer.transform(movie_reviews_array)

print(clf.predict(movie_reviews_vector))

[0]


## Example 2

In [12]:
# Load and prepare the dataset
import nltk
from nltk.corpus import movie_reviews
import random

documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)


In [14]:
# Define the feature extractor

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [15]:
# Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [16]:
# Test the classifier
print(nltk.classify.accuracy(classifier, test_set))

0.82


In [17]:
# Show the most important features as interpreted by Naive Bayes
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
         contains(mulan) = True              pos : neg    =      9.1 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.6 : 1.0
          contains(jedi) = True              pos : neg    =      7.4 : 1.0
