In [1]:
# 3.2 Exercise
# Sentiment Analysis and Preprocessing Text
## Justin Wisniewski

In [2]:
import pandas as pd
 
# Load the tab-separated, labeled movie reviews into a DataFrame and preview them.
# NOTE(review): the preview shows stray backslashes around quoted titles, so the file
# may use backslash-escaped quotes -- confirm whether quoting/escapechar options are needed.
reviews_path = 'labeledTrainData.tsv'
df = pd.read_table(reviews_path)
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
from textblob import TextBlob
import matplotlib.pyplot as plot

In [4]:
# How many positive and negative reviews are there?
# (1 = positive, 0 = negative)
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

1    12500
0    12500
Name: sentiment, dtype: int64


In [5]:
# Score every review with TextBlob and store the result in a new 'polarity' column
def detect_polarity(review):
    """Return the TextBlob sentiment polarity computed for ``review``."""
    blob = TextBlob(review)
    return blob.sentiment.polarity


df['polarity'] = df['review'].apply(detect_polarity)
df.head()

Unnamed: 0,id,sentiment,review,polarity
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941
3,3630_4,0,It must be assumed that those who praised this...,0.134753
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842


In [6]:
limit = 0
column_name = 'polarity'
# Count the reviews whose polarity is greater than or equal to the limit.
# A polarity of exactly 0 is included here, so the printed label must say
# "greater than or equal to" to match the comparison actually performed.
count = (df[column_name] >= limit).sum()
print('Count of values greater than or equal to 0 in Column polarity : ', count)

Count of values greater than 0 in Column polarity :  19017


In [7]:
# Tally the reviews whose polarity is strictly below the limit
column_name = 'polarity'
limit = 0
is_below_limit = df[column_name] < limit
count = is_below_limit.sum()
print('Count of values less than 0 in Column polarity : ', count)

Count of values less than 0 in Column polarity :  5983


In [8]:
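# This model appears to be better than random guessing.
# Based on the sentiment labels, the reviews are split 50/50 (12,500 positive, 12,500 negative).
# Using the polarity score (>= 0 counted as positive), the predicted split is about 76/24 (19,017 vs. 5,983).
# Caveat: the predicted split alone does not measure accuracy; each prediction would have to be
# compared with the review's sentiment label to confirm that the model beats random guessing.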

In [9]:
# Part 2
# Prepping text for a custom model

In [10]:
# Convert all text to lowercase letters.
# The vectorized .str accessor is the same interface the punctuation-removal step uses,
# and it passes missing values through instead of raising as apply(str.lower) would.
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,id,sentiment,review,polarity
0,5814_8,1,with all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi...",0.256349
2,7759_3,0,the film starts with a manager (nicholas bell)...,-0.053941
3,3630_4,0,it must be assumed that those who praised this...,0.134753
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842


In [11]:
# Remove punctuation and special characters from the text.
# regex=True is passed explicitly: relying on the default triggers a FutureWarning on
# older pandas, and on pandas >= 2.0 the pattern is treated as a literal string, which
# would silently leave the reviews unchanged.
# The raw string keeps \w and \s from being read as (invalid) Python escape sequences.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

  df['review'] = df['review'].str.replace('[^\w\s]', '')


Unnamed: 0,id,sentiment,review,polarity
0,5814_8,1,with all this stuff going down at the moment w...,0.001277
1,2381_9,1,the classic war of the worlds by timothy hines...,0.256349
2,7759_3,0,the film starts with a manager nicholas bell g...,-0.053941
3,3630_4,0,it must be assumed that those who praised this...,0.134753
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842


In [12]:
import nltk
from nltk.corpus import stopwords

In [13]:
# Remove stop words.
# `stop_words` stays a list (as NLTK returns it); membership is tested against a set
# so each lookup is constant-time instead of scanning the whole list for every word.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
df['review'] = df['review'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)
df.head()

Unnamed: 0,id,sentiment,review,polarity
0,5814_8,1,stuff going moment mj ive started listening mu...,0.001277
1,2381_9,1,classic war worlds timothy hines entertaining ...,0.256349
2,7759_3,0,film starts manager nicholas bell giving welco...,-0.053941
3,3630_4,0,must assumed praised film greatest filmed oper...,0.134753
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,-0.024842


In [14]:
# Tokenization
from nltk.tokenize import RegexpTokenizer

# Raw string: '\w' in a normal string literal is an invalid escape sequence, which
# newer Python versions warn about; the pattern itself is unchanged.
regexp = RegexpTokenizer(r'\w+')

# Split every cleaned review into a list of word tokens
df['review_token'] = df['review'].apply(regexp.tokenize)
df.head()

Unnamed: 0,id,sentiment,review,polarity,review_token
0,5814_8,1,stuff going moment mj ive started listening mu...,0.001277,"[stuff, going, moment, mj, ive, started, liste..."
1,2381_9,1,classic war worlds timothy hines entertaining ...,0.256349,"[classic, war, worlds, timothy, hines, enterta..."
2,7759_3,0,film starts manager nicholas bell giving welco...,-0.053941,"[film, starts, manager, nicholas, bell, giving..."
3,3630_4,0,must assumed praised film greatest filmed oper...,0.134753,"[must, assumed, praised, film, greatest, filme..."
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,-0.024842,"[superbly, trashy, wondrously, unpretentious, ..."


In [15]:
# Keep only tokens longer than two characters and rebuild each review as one string
def _join_long_tokens(tokens):
    """Join the tokens longer than two characters into a space-separated string."""
    return ' '.join(token for token in tokens if len(token) > 2)


df['review_string'] = df['review_token'].apply(_join_long_tokens)
df.head()

Unnamed: 0,id,sentiment,review,polarity,review_token,review_string
0,5814_8,1,stuff going moment mj ive started listening mu...,0.001277,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...
1,2381_9,1,classic war worlds timothy hines entertaining ...,0.256349,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...
2,7759_3,0,film starts manager nicholas bell giving welco...,-0.053941,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...
3,3630_4,0,must assumed praised film greatest filmed oper...,0.134753,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,-0.024842,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...


In [17]:
# Apply NLTKs PorterStemmer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
porter_stemmer = PorterStemmer()

# Tokenize column-wise: Series.apply hands each review string straight to the tokenizer,
# whereas DataFrame.apply(axis=1) builds a full row object for every review first.
# word_tokenize is the name imported at the top of this cell.
df['tokenized_review'] = df['review_string'].apply(word_tokenize)

# Reduce every token to its Porter stem
df['stem'] = df['tokenized_review'].apply(
    lambda tokens: [porter_stemmer.stem(token) for token in tokens]
)
df.head()

Unnamed: 0,id,sentiment,review,polarity,review_token,review_string,tokenized_review,stem
0,5814_8,1,stuff going moment mj ive started listening mu...,0.001277,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...,"[stuff, going, moment, ive, started, listening...","[stuff, go, moment, ive, start, listen, music,..."
1,2381_9,1,classic war worlds timothy hines entertaining ...,0.256349,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain..."
2,7759_3,0,film starts manager nicholas bell giving welco...,-0.053941,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc..."
3,3630_4,0,must assumed praised film greatest filmed oper...,0.134753,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope..."
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,-0.024842,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex..."


In [19]:
# Collapse each list of stems back into a single space-separated string
df['stem_str'] = [' '.join(stems) for stems in df['stem']]
df.head()

Unnamed: 0,id,sentiment,review,polarity,review_token,review_string,tokenized_review,stem,stem_str
0,5814_8,1,stuff going moment mj ive started listening mu...,0.001277,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...,"[stuff, going, moment, ive, started, listening...","[stuff, go, moment, ive, start, listen, music,...",stuff go moment ive start listen music watch o...
1,2381_9,1,classic war worlds timothy hines entertaining ...,0.256349,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain...",classic war world timothi hine entertain film ...
2,7759_3,0,film starts manager nicholas bell giving welco...,-0.053941,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc...",film start manag nichola bell give welcom inve...
3,3630_4,0,must assumed praised film greatest filmed oper...,0.134753,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope...",must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,-0.024842,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex...",superbl trashi wondrous unpretenti 80 exploit ...


In [21]:
#Converting each entry to word count vector
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the stemmed reviews and build the word-count matrix.
# NOTE(review): `count` held an integer tally in the polarity cells; it is rebound
# to the vectorizer object here.
count = CountVectorizer()
stemmed_reviews = df['stem_str']
bag_of_words = count.fit_transform(stemmed_reviews)

In [22]:
# Display dimensions of the bag-of-words matrix as a (rows, columns) tuple;
# presumably rows are reviews and columns are distinct terms -- confirm against CountVectorizer docs
bag_of_words.shape

(25000, 92068)

In [23]:
# Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Fit a tf-idf vectorizer on the stemmed review strings and keep the resulting matrix
tfidf = TfidfVectorizer()
tfidf_input = df['stem_str']
feature_matrix = tfidf.fit_transform(tfidf_input)

In [26]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
feature_matrix

<25000x92068 sparse matrix of type '<class 'numpy.float64'>'
	with 2393969 stored elements in Compressed Sparse Row format>