In [1]:
# 5.2 Exercise
# Build your own sentiment analysis model
## Justin Wisniewski

In [2]:
import pandas as pd
 
# Load the labelled movie reviews (tab-separated) and preview the first rows
REVIEWS_PATH = 'labeledTrainData.tsv'
df = pd.read_table(REVIEWS_PATH)
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# Convert all text to lowercase so that e.g. "Film" and "film" become one token.
# The vectorised .str accessor gives the same result as apply(str.lower) for
# strings, but passes missing values (NaN) through instead of raising TypeError.
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi..."
2,7759_3,0,the film starts with a manager (nicholas bell)...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


In [4]:
# Remove punctuation and special characters from the text.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which the pattern below would be
# searched for as a literal string and nothing would be removed.
# The raw string keeps \w and \s from being read as (invalid) Python escapes.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

  df['review'] = df['review'].str.replace('[^\w\s]', '')


Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,the classic war of the worlds by timothy hines...
2,7759_3,0,the film starts with a manager nicholas bell g...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


In [5]:
import nltk
from nltk.corpus import stopwords

In [6]:
# Remove stop words.
# `stop_words` stays a list (as before); the set copy gives O(1) membership
# tests instead of scanning the whole list for every word of every review.
# NOTE(review): punctuation was already stripped from the reviews above, so any
# stop word that contains an apostrophe can no longer match its de-punctuated
# form in the text — confirm whether that is intended.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
df['review'] = df['review'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,film starts manager nicholas bell giving welco...
3,3630_4,0,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...


In [7]:
# Tokenization
from nltk.tokenize import RegexpTokenizer

# Raw string: '\w' inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning from Python 3.12 on).
regexp = RegexpTokenizer(r'\w+')

# Split each cleaned review into a list of word tokens.
df['review_token'] = df['review'].apply(regexp.tokenize)
df.head()

Unnamed: 0,id,sentiment,review,review_token
0,5814_8,1,stuff going moment mj ive started listening mu...,"[stuff, going, moment, mj, ive, started, liste..."
1,2381_9,1,classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta..."
2,7759_3,0,film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving..."
3,3630_4,0,must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme..."
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ..."


In [8]:
# Drop tokens of one or two characters and re-join the remaining ones into a
# single space-separated string per review
df['review_string'] = df['review_token'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if len(tok) > 2)
)
df.head()

Unnamed: 0,id,sentiment,review,review_token,review_string
0,5814_8,1,stuff going moment mj ive started listening mu...,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...
1,2381_9,1,classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...
2,7759_3,0,film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...
3,3630_4,0,must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...


In [9]:
# Apply NLTKs PorterStemmer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
porter_stemmer = PorterStemmer()

# Tokenise the filtered review text. Applying over the single column avoids
# df.apply(..., axis=1), which materialises a Series for every row just to read
# one field; the resulting token lists are the same.
df['tokenized_review'] = df['review_string'].apply(word_tokenize)

# Reduce every token to its Porter stem (e.g. "going" -> "go").
df['stem'] = df['tokenized_review'].apply(
    lambda tokens: [porter_stemmer.stem(token) for token in tokens]
)
df.head()

Unnamed: 0,id,sentiment,review,review_token,review_string,tokenized_review,stem
0,5814_8,1,stuff going moment mj ive started listening mu...,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...,"[stuff, going, moment, ive, started, listening...","[stuff, go, moment, ive, start, listen, music,..."
1,2381_9,1,classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain..."
2,7759_3,0,film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc..."
3,3630_4,0,must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope..."
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex..."


In [10]:
# Collapse each list of stems back into one space-separated string
df['stem_str'] = df['stem'].apply(' '.join)
df.head()

Unnamed: 0,id,sentiment,review,review_token,review_string,tokenized_review,stem,stem_str
0,5814_8,1,stuff going moment mj ive started listening mu...,"[stuff, going, moment, mj, ive, started, liste...",stuff going moment ive started listening music...,"[stuff, going, moment, ive, started, listening...","[stuff, go, moment, ive, start, listen, music,...",stuff go moment ive start listen music watch o...
1,2381_9,1,classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain...",classic war world timothi hine entertain film ...
2,7759_3,0,film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc...",film start manag nichola bell give welcom inve...
3,3630_4,0,must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...,"[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope...",must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious 80s e...,"[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex...",superbl trashi wondrous unpretenti 80 exploit ...


In [11]:
# Convert each stemmed review into a word-count vector (bag of words).
# Note: the vectorizer is fitted on the whole DataFrame here, before the
# train/test split that follows.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag_of_words = count.fit_transform(df['stem_str'])

In [12]:
# Display dimensions of bag-of-words matrix: (number of reviews, vocabulary size)
bag_of_words.shape

(25000, 92068)

In [13]:
# Hold out 20% of the reviews for testing: draw a reproducible 80% random
# sample for training, then keep every row whose index label was not drawn
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(index=training_data.index)

In [14]:
# Report how many reviews ended up in each split
print(f"No. of training examples: {len(training_data)}")
print(f"No. of testing examples: {len(testing_data)}")

No. of training examples: 20000
No. of testing examples: 5000


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Fit and apply the tf-idf vectorization to the training set
# (fit_transform learns from the training reviews only; the fitted `tfidf`
# object is reused for the test set in a later cell)
tfidf = TfidfVectorizer()
feature_matrix=tfidf.fit_transform(training_data['stem_str'])

In [18]:
# Apply but DO NOT FIT the tf-idf vectorization to the test set.
# Why: the test set stands in for unseen data, so it must be encoded with the
# vocabulary and IDF weights learned from the training set. Re-fitting here
# would leak information from the test reviews into the features and would
# produce a feature space that differs from the one the model is trained on,
# so the test score would no longer measure generalization.
test_matrix=tfidf.transform(testing_data['stem_str'])

In [42]:
# Build the model inputs explicitly from the train/test frames so this cell
# runs on a fresh kernel and can be re-run: it no longer reads X / X_test
# before they are assigned, nor overwrites its own input.
# NOTE(review): assumes X was meant to be the stemmed training text and y the
# `sentiment` column — confirm against the cells that originally defined them.

# For training: fit the vectorizer on the training reviews only
X = tfidf.fit_transform(training_data['stem_str'])
y = training_data['sentiment']

# For test: reuse the fitted vocabulary and IDF weights, never re-fit
X_test = tfidf.transform(testing_data['stem_str'])
y_test = testing_data['sentiment']

In [43]:
from sklearn.model_selection import train_test_split
# Carve a 10% validation set out of the training features (seeded so the
# split is reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=50
)

In [44]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with scikit-learn's default settings;
# it is only instantiated here, not fitted.
lr = LogisticRegression()