# Machine Learning: Sentiment Analysis
#### Joshua Greenert
#### DSC550-T301 Data Mining
#### 9/15/2022

## Part 1

In [1]:
# Load the libraries
import numpy as np
import pandas as pd
from textblob import TextBlob

bagOfWords = pd.read_csv('labeledTrainData.tsv', sep='\t')
bagOfWords.head(5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [2]:
# How many of each positive and negative reviews are there?
bagOfWords.groupby(by = 'sentiment').count()

Unnamed: 0_level_0,id,review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12500,12500
1,12500,12500


In [3]:
# Use TextBlob to classify each movie review as positive or negative. 
# Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.
textBlobDF = pd.DataFrame(data = bagOfWords['review'].apply(lambda review: TextBlob(review).sentiment.polarity))
textBlobDF.head(5)

Unnamed: 0,review
0,0.001277
1,0.256349
2,-0.053941
3,0.134753
4,-0.024842


In [4]:
# Check the accuracy of this model. Is this model better than random guessing?
textBlobDFPositive = textBlobDF[textBlobDF['review'] >= 0].count()
textBlobDFNegative = textBlobDF[textBlobDF['review'] <= 0].count()
print(f"The positive score count is {textBlobDFPositive} and the negative score count is {textBlobDFNegative}")
print("According to the scores predicted, the model appears to be severely incorrect based on sentiment value.")

The positive score count is review    19017
dtype: int64 and the negative score count is review    6000
dtype: int64
According to the scores predicted, the model appears to be severely incorrect based on sentiment value.


In [5]:
# For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).
# Using the tensorflow analyzer.
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Create a sentiment analyzer.
sia = SentimentIntensityAnalyzer()

# Get the text from the objects.
reviews = bagOfWords['review']

# Create a list of values from the polarity scores.
polarity_result = []

for i in range(len(reviews)):
    polarity_result.append(sia.polarity_scores(reviews[i]))

for i in range(5):
    print(polarity_result[i])

{'neg': 0.13, 'neu': 0.744, 'pos': 0.126, 'compound': -0.8278}
{'neg': 0.047, 'neu': 0.739, 'pos': 0.214, 'compound': 0.9819}
{'neg': 0.142, 'neu': 0.8, 'pos': 0.058, 'compound': -0.9883}
{'neg': 0.066, 'neu': 0.878, 'pos': 0.056, 'compound': -0.2189}
{'neg': 0.119, 'neu': 0.741, 'pos': 0.14, 'compound': 0.796}


## Part 2

In [6]:
# Convert all text to lowercase letters.
allText = bagOfWords['review']

allTextLower = [string.lower() for string in allText]

# show the results.
allTextLower

["with all this stuff going down at the moment with mj i've started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mj's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />the actual feature film bit when it finally st

In [7]:
# Remove punctuation and special characters from the text.
import re

for i in range(len(allTextLower)):
    allTextLower[i] = re.sub('\W+',' ', allTextLower[i] )

# Show the results
allTextLower

['with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay br br visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him br br the actual feature film bit when it finally starts is only on for

In [8]:
# Remove stop words.
from nltk.corpus import stopwords

# Load stop words.
stop_words = stopwords.words('english')

for i in range(len(allTextLower)):
    stringWords = str(allTextLower[i])
    splitWords = stringWords.split(" ")
    wordsWithStops = [word for word in splitWords if word not in stop_words]
    allTextLower[i] = wordsWithStops
    
# Print the results.
allTextLower

[['stuff',
  'going',
  'moment',
  'mj',
  'started',
  'listening',
  'music',
  'watching',
  'odd',
  'documentary',
  'watched',
  'wiz',
  'watched',
  'moonwalker',
  'maybe',
  'want',
  'get',
  'certain',
  'insight',
  'guy',
  'thought',
  'really',
  'cool',
  'eighties',
  'maybe',
  'make',
  'mind',
  'whether',
  'guilty',
  'innocent',
  'moonwalker',
  'part',
  'biography',
  'part',
  'feature',
  'film',
  'remember',
  'going',
  'see',
  'cinema',
  'originally',
  'released',
  'subtle',
  'messages',
  'mj',
  'feeling',
  'towards',
  'press',
  'also',
  'obvious',
  'message',
  'drugs',
  'bad',
  'kay',
  'br',
  'br',
  'visually',
  'impressive',
  'course',
  'michael',
  'jackson',
  'unless',
  'remotely',
  'like',
  'mj',
  'anyway',
  'going',
  'hate',
  'find',
  'boring',
  'may',
  'call',
  'mj',
  'egotist',
  'consenting',
  'making',
  'movie',
  'mj',
  'fans',
  'would',
  'say',
  'made',
  'fans',
  'true',
  'really',
  'nice',
  'br'

In [9]:
# Apply NLTK’s PorterStemmer.
from nltk.stem.porter import PorterStemmer

# Create a porter.
porter = PorterStemmer()

for i in range(len(allTextLower)):
    allTextLower[i] = [porter.stem(word) for word in allTextLower[i]]
    
# Print the results.
allTextLower

[['stuff',
  'go',
  'moment',
  'mj',
  'start',
  'listen',
  'music',
  'watch',
  'odd',
  'documentari',
  'watch',
  'wiz',
  'watch',
  'moonwalk',
  'mayb',
  'want',
  'get',
  'certain',
  'insight',
  'guy',
  'thought',
  'realli',
  'cool',
  'eighti',
  'mayb',
  'make',
  'mind',
  'whether',
  'guilti',
  'innoc',
  'moonwalk',
  'part',
  'biographi',
  'part',
  'featur',
  'film',
  'rememb',
  'go',
  'see',
  'cinema',
  'origin',
  'releas',
  'subtl',
  'messag',
  'mj',
  'feel',
  'toward',
  'press',
  'also',
  'obviou',
  'messag',
  'drug',
  'bad',
  'kay',
  'br',
  'br',
  'visual',
  'impress',
  'cours',
  'michael',
  'jackson',
  'unless',
  'remot',
  'like',
  'mj',
  'anyway',
  'go',
  'hate',
  'find',
  'bore',
  'may',
  'call',
  'mj',
  'egotist',
  'consent',
  'make',
  'movi',
  'mj',
  'fan',
  'would',
  'say',
  'made',
  'fan',
  'true',
  'realli',
  'nice',
  'br',
  'br',
  'actual',
  'featur',
  'film',
  'bit',
  'final',
  'sta

In [14]:
# Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count
# vector for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). 
# Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same
# as the number of rows in your original data frame.
from sklearn.feature_extraction.text import CountVectorizer

# Create the count vectorizer object
count = CountVectorizer()

CVBagOfWords = []

for i in range(len(allTextLower)):
    CVBagOfWords.append(count.fit_transform(allTextLower[i]))

len(CVBagOfWords)

25000

In [None]:
# Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your
# movie reviews (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions
# of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.
uniqueWords = ()

# Loop through each list and place words in the set.
for li in allTextLower:
    for doc in  li:
        if doc in uniqueWords:
            continue
        else:
            uniqueWords.add(doc)

print('test')
uniqueWords