# DSC 550 - Data Mining
# Week 3 Exercise: Sentiment Analysis and Preprocessing Text

## Part 1: Using the TextBlob Sentiment Analyzer

In [1]:
#import pandas and numpy
import pandas as pd
import numpy as np
#import textblob to perform sentiment analysis
from textblob import TextBlob
#import accuracy calculator from sklearn
from sklearn.metrics import accuracy_score
#import nltk library
import nltk
#get the vader sentiment analyzer from the nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [2]:
#1) import the data
#the file is tab-separated, so it is read with sep='\t'
#NOTE(review): the preview below shows stray backslashes around a quoted title
#(row 1), which suggests inner quotes in the file are backslash-escaped and are
#not being unescaped by the default parser settings -- consider escapechar='\\'
#and confirm against the raw file before changing
df_reviews = pd.read_csv('labeledTrainData.tsv', sep='\t')
#get the dimensions of the data as (rows, columns)
print(df_reviews.shape)
#view the first rows of the data
df_reviews.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
#2) Get the TextBlob sentiment for each review 
#define a function to get the polarity score from textblob
def get_polarity(text):
    """
    Return the TextBlob polarity score for a piece of text.
    Args: text (the string to score)
    Output: the sentiment.polarity value TextBlob reports for the text
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity
#score every review with the TextBlob polarity function
df_reviews['tb_polarity'] = df_reviews['review'].apply(get_polarity)
#label a review positive (1) if its polarity is greater than or equal to 0
#and negative (0) if its polarity is less than 0
df_reviews['tb_sentiment'] = df_reviews['tb_polarity'].ge(0).astype(int)
#view the data with the two new columns
df_reviews.head()

Unnamed: 0,id,sentiment,review,tb_polarity,tb_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0


In [4]:
#3) score the TextBlob labels against the true sentiment labels
tb_acc = accuracy_score(
    y_true=df_reviews['sentiment'],
    y_pred=df_reviews['tb_sentiment'],
)
print('TextBlob Accuracy:', tb_acc)

TextBlob Accuracy: 0.68524


In [5]:
#4) repeat steps (2) and (3) for the Vader sentiment analyzer
#initialize a vader sentiment analyzer (reused for every review in the next cell)
#Note: presumably the NLTK 'vader_lexicon' resource must be downloaded the first time -- TODO confirm
vader_analyzer = SentimentIntensityAnalyzer()

In [6]:
#4) continued
#score every review with vader; each entry is the dict returned by polarity_scores
df_reviews['vad_polarity'] = df_reviews['review'].apply(vader_analyzer.polarity_scores)
#label a review positive (1) if its compound score is greater than or equal to 0; negative (0) otherwise
df_reviews['vad_sentiment'] = df_reviews['vad_polarity'].apply(
    lambda scores: int(scores['compound'] >= 0)
)
#view the data
df_reviews.head()

Unnamed: 0,id,sentiment,review,tb_polarity,tb_sentiment,vad_polarity,vad_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.13, 'neu': 0.744, 'pos': 0.126, 'com...",0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.047, 'neu': 0.739, 'pos': 0.214, 'co...",1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.142, 'neu': 0.8, 'pos': 0.058, 'comp...",0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.065, 'neu': 0.879, 'pos': 0.056, 'co...",0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.119, 'neu': 0.741, 'pos': 0.14, 'com...",1


In [7]:
#4) continued
#score the vader labels against the true sentiment labels
vad_acc = accuracy_score(
    y_true=df_reviews['sentiment'],
    y_pred=df_reviews['vad_sentiment'],
)
print('Vader Accuracy:', vad_acc)

Vader Accuracy: 0.69216


In [8]:
#compare the model results side by side, with accuracy shown as a percentage string
model_data = {
    'Model': ['TextBlob', 'Vader'],
    'Accuracy': [str(round(100*acc, 2)) + '%' for acc in (tb_acc, vad_acc)],
}
#one row per model, indexed by the model name
df_mod_data = pd.DataFrame(model_data).set_index('Model', drop=True)
df_mod_data

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
TextBlob,68.52%
Vader,69.22%


## Part 2: Prepping Text for a Custom Model

In [9]:
#import regular expressions library
import re

#import nltk
import nltk
#import stopwords from NLTK
from nltk.corpus import stopwords
#import word tokenizer from NLTK
from nltk.tokenize import word_tokenize

#import sklearn
import sklearn
#import word count vectorizer from sklearn
from sklearn.feature_extraction.text import CountVectorizer
#import tf-idf vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
#start the preprocessing frame from a separate copy of just the label and the review text
df_reviews_preprocessed = df_reviews[['sentiment', 'review']].copy()
df_reviews_preprocessed.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [11]:
#1) and 2) Convert text to lowercase and remove punctuation/special characters
#define a function to clean the text
def clean_text(text):
    """
    Lower-case the text and blank out everything except ASCII letters.
    Args: text (the string to clean)
    Output: the lower-cased text in which every digit, every underscore and
            every run of non-word characters is replaced by one space, and any
            other character outside a-z, A-Z, 0-9 is replaced by a space.
            Consecutive spaces are NOT collapsed.
    """
    lowered = text.lower()
    #each digit, each underscore and each run of non-word characters becomes one space
    without_symbols = re.sub(r'\d|\W+|_', ' ', lowered)
    #any character still left that is not an ASCII letter or digit becomes a space
    return re.sub(r'[^a-zA-Z0-9]', ' ', without_symbols)

In [12]:
#3) Remove stop words (and tokenize)
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """
    Load NLTK's English stop words once and return them as a frozenset.
    The result is cached so the corpus is not re-read for every review, and a
    set is used so each membership test does not scan the whole word list.
    """
    from nltk.corpus import stopwords
    #Note: You may need to run this next command to download stopwords for the first time
    #nltk.download('stopwords')
    return frozenset(stopwords.words('english'))


def tokenize_and_remove_stop_words(txt):
    """
    Tokenize a piece of text and drop the English stop words.
    Args: txt (the string to tokenize)
    Output: list of the tokens produced by NLTK's word_tokenize, in their
            original order, excluding every token found in NLTK's English
            stop word list
    """
    #imported here so the function stays usable on its own
    from nltk.tokenize import word_tokenize
    #Note: You may have to run this next command to download the NLTK 'punkt' library for the first time
    #nltk.download('punkt')
    stop_words = _english_stop_words()

    return [word for word in word_tokenize(txt) if word not in stop_words]

In [13]:
#4) Apply NLTK's PorterStemmer
#define a function to stem the words
def stem_text(word_list):
    """
    Stem every word in a list with NLTK's PorterStemmer.
    Args: word_list (iterable of word strings)
    Output: list with the stem of each word, in the same order
    """
    from nltk.stem.porter import PorterStemmer
    stem = PorterStemmer().stem

    return list(map(stem, word_list))

In [14]:
#run the preprocessing steps in order, keeping each intermediate result in its own column
#step 1: lower-case the raw review and blank out punctuation/special characters
df_reviews_preprocessed['review_clean'] = (
    df_reviews_preprocessed['review'].apply(clean_text)
)
#step 2: tokenize the cleaned text and remove stop words
df_reviews_preprocessed['review_tokenized'] = (
    df_reviews_preprocessed['review_clean'].apply(tokenize_and_remove_stop_words)
)
#step 3: stem every remaining token with the PorterStemmer function
df_reviews_preprocessed['review_stemmed'] = (
    df_reviews_preprocessed['review_tokenized'].apply(stem_text)
)
#step 4: join the stemmed tokens back into one space-separated string (untokenize)
df_reviews_preprocessed['review_final'] = (
    df_reviews_preprocessed['review_stemmed'].apply(' '.join)
)
#view the pre-processed text
print(df_reviews_preprocessed.shape)
df_reviews_preprocessed.head()

(25000, 6)


Unnamed: 0,sentiment,review,review_clean,review_tokenized,review_stemmed,review_final
0,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,...","[stuff, go, moment, mj, start, listen, music, ...",stuff go moment mj start listen music watch od...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hine...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain...",classic war world timothi hine entertain film ...
2,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc...",film start manag nichola bell give welcom inve...
3,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope...",must assum prais film greatest film opera ever...
4,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, exploi...",superbl trashi wondrous unpretenti exploit hoo...


In [15]:
#5) create the bag of words feature matrix
#fit a word count vectorizer on the fully pre-processed review text and transform it in one step
count = CountVectorizer()
bag_of_words = count.fit_transform(df_reviews_preprocessed['review_final'])
#check the shape of the output
bag_of_words.shape

(25000, 49638)

In [16]:
#6) create the tf-idf feature matrix
#fit a tf-idf vectorizer on the fully pre-processed review text and transform it in one step
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_reviews_preprocessed['review_final'])
#check the shape of the output
tfidf_matrix.shape

(25000, 49638)