In [1]:
import numpy as np
import scipy.sparse
import pandas as pd
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#from mpl_toolkits import mplot3d 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from string import punctuation
from sklearn.model_selection import train_test_split
import tensorflow as tf
from collections import Counter
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justinszaro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Frame the Problem and Look at the Big Picture 

1. Define the objective in business terms. 

    The business objective is to create a machine learning model that takes a movie goer's review and determines if it is positive or negative. The client can use this to quickly determine the overall reaction to a movie showing. 

2. How will your solution be used? 

    The solution will be used by the client to quickly determine reactions to movie showings. 

3. What are the current solutions/workarounds (if any)? 

    The current solution is to manually poll viewers to determine how they felt about the movie. This is a slow process and the review can be very mixed. 

4. How should you frame this problem (supervised/unsupervised, online/offline, ...)?

    This problem is a supervised offline sentiment analysis problem. 

5. How should performance be measured? Is the performance measure aligned with the business objective? 

    Our metric for this problem will be accuracy. The most important function of this model is to accurately measure the sentiment of the review, so it is important that our model has high accuracy.

6. What would be the minimum performance needed to reach the business objective? 

    The minimum performance required is 80% accuracy. 

7. What are comparable problems? Can you reuse experience or tools? 

    A comparable problem was using by a youtuber named Micheal Reeves. He used sentiment analysis to determine if reddit posts on r/wallstreetbets were positive or negative. We can reuse the experience but not the tools he used. There are resuable tools in the textbook for this course that we will be using. 

8. Is human expertise available? 

    No human expertise is availible at this time. 

9. How would you solve the problem manually? 

    To solve this problem manually, we would interview the movie goers individually to determine their feelings about the movie. 

10.  List the assumptions you (or others) have made so far. Verify assumptions if possible. 

    - A review can either be positive or negative. There are no "meh" reviews. 

# Get the Data 

1. List the data you need and how much you need 

    We need movie reviews and a label that determines their sentiment (1 being positive, 0 being negative). We need over 1000.

2. Find and document where you can get that data 

    The data can be gathered form the open source website Kaggle.

3. Get access authorizations 

    The webiste and data is open source, so we have full access to the data.

4. Create a workspace (with enough storage space) 

    This notebook.

5. Get the data 

    The data was downloaded from kaggle and inserted into this repository.
    Link: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis?resource=download

6. Convert the data to a format you can easily manipulate (without changing the data itself) 

    Made into a pandas dataframe.

7. Ensure sensitive information is deleted or protected (e.g. anonymized) 

    Not applicable to this situation.

8. Check the size and type of data (time series, geographical, ...) 

    - text: object: 40000 entries
    - label: int: 400000 entries
    
9. Sample a test set, put it aside, and never look at it (no data snooping!) 

In [2]:
data = pd.read_csv("movie.csv")

In [3]:
data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [5]:
train_set, test_set = train_test_split(data, stratify=data['label'], test_size=0.2, random_state=420)

# Explore the Data 

1. Copy the data for exploration, downsampling to a manageable size if necessary. 


2. Study each attribute and its characteristics: Name; Type (categorical, numerical, bounded, text, structured, ...); % of missing values; Noisiness and type of noise (stochastic, outliers, rounding errors, ...); Usefulness for the task; Type of distribution (Gaussian, uniform logarithmic, ...) 

    - text is a object datatype with no missing data.
    - label is an interger value of either 1 or a 0. 1 is positive, 0 is negative. There is no missing data. In the data set, there is almost an equal number of positive and negative labels.


3. For supervised learning tasks, identify the target attribute(s) 

    The target attribute is the label feature. It is the sentiment of the text.

4. Visualize the data 


5. Study the correlations between attributes 


6. Study how you would solve the problem manually 


7. Identify the promising transformations you may want to apply 


8. Identify extra data that would be useful (go back to “Get the Data”) 


9. Document what you have learned 
    - With statification, there is almost an equal number of text with positive/negative sentiment.

In [6]:
train_copy = train_set.copy()

In [7]:
train_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32000 entries, 33632 to 13973
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    32000 non-null  object
 1   label   32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 750.0+ KB


In [8]:
train_copy['label'].value_counts()

0    16015
1    15985
Name: label, dtype: int64

# Bag of Words

In [44]:
corpus = []
all_stop_words = stopwords.words('english')
all_stop_words.remove('not')
for text in train_copy['text']:
    review = text.lower()
    review = re.sub("<.+>", " ", review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = re.sub("z.+z", " ", review)
    review = review.lower()
    review = review.split()
    review = [word for word in review if word not in set(all_stop_words)]
    ps = PorterStemmer()
    review = " ".join(ps.stem(word) for word in review)
    corpus.append(review)
    


In [58]:
corpus

['best bit film alan pull knicker ran cut throat ra',
 'rain shine outsid enter movi hous make happi not come right light go',
 'memor line short live show view episod line introduc fratern intramur',
 'dawn realli surpris saw far less gruesom horror adventur',
 'part one sequenc water rush sunken plane everyth els happen',
 'idea review talk wonder movi creat sens',
 'think tom hank good actor enjoy read book children',
 'chairman colleg coffeehous one job review group film',
 'despit famou cast anim version dicken tale borest seen enough zap',
 'well known fact gene roddenberri first pitch star trek nbc origin pilot episod',
 'movi consist great emot especi outstand soundtrack coincid film tom cruis',
 'first say go subject',
 'zero interest rap ghetto cultur white like classic rock howev not',
 'grant seen speed racer never realli watch also seen show',
 'year use site movi review final regist imdb could give farscap',
 'know type film sell ticket make profit film maker',
 'hope rob

In [59]:
pop_words = {}
for line in corpus:
    words = line.split()
    for word in words:
        if pop_words.get(word, -1) == -1:
            pop_words[word] = 1
        else:
            pop_words[word] += 1
pop_words

{'best': 1089,
 'bit': 237,
 'film': 8268,
 'alan': 26,
 'pull': 58,
 'knicker': 1,
 'ran': 26,
 'cut': 93,
 'throat': 7,
 'ra': 6,
 'rain': 30,
 'shine': 34,
 'outsid': 40,
 'enter': 36,
 'movi': 11801,
 'hous': 226,
 'make': 1164,
 'happi': 76,
 'not': 3260,
 'come': 570,
 'right': 301,
 'light': 126,
 'go': 858,
 'memor': 61,
 'line': 257,
 'short': 391,
 'live': 434,
 'show': 1282,
 'view': 294,
 'episod': 373,
 'introduc': 38,
 'fratern': 2,
 'intramur': 1,
 'dawn': 22,
 'realli': 1618,
 'surpris': 329,
 'saw': 1465,
 'far': 328,
 'less': 130,
 'gruesom': 10,
 'horror': 758,
 'adventur': 114,
 'part': 432,
 'one': 4151,
 'sequenc': 66,
 'water': 91,
 'rush': 33,
 'sunken': 2,
 'plane': 38,
 'everyth': 224,
 'els': 133,
 'happen': 236,
 'idea': 311,
 'review': 782,
 'talk': 156,
 'wonder': 488,
 'creat': 134,
 'sens': 133,
 'think': 1047,
 'tom': 113,
 'hank': 24,
 'good': 1912,
 'actor': 584,
 'enjoy': 686,
 'read': 662,
 'book': 425,
 'children': 132,
 'chairman': 3,
 'colleg': 9

In [60]:
keys_greater_than = []
for key in pop_words.keys():
    if pop_words.get(key) >= 1000:
        keys_greater_than.append((pop_words.get(key), key))
    
for i in sorted(keys_greater_than, reverse=True):
    print(i)

(11801, 'movi')
(8268, 'film')
(4151, 'one')
(3260, 'not')
(2664, 'watch')
(2526, 'like')
(2083, 'first')
(1931, 'time')
(1912, 'good')
(1710, 'seen')
(1688, 'see')
(1618, 'realli')
(1534, 'love')
(1489, 'great')
(1483, 'stori')
(1465, 'saw')
(1428, 'ever')
(1406, 'bad')
(1285, 'year')
(1282, 'show')
(1242, 'made')
(1191, 'say')
(1164, 'make')
(1144, 'well')
(1109, 'get')
(1089, 'best')
(1059, 'would')
(1047, 'think')
(1034, 'peopl')
(1007, 'mani')
(1006, 'much')


In [61]:
CountVec = CountVectorizer()
x = CountVec.fit_transform(corpus).toarray()

In [35]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [36]:
occurances = CountVec.vocabulary_
occurances

{'best': 3535,
 'bit': 3825,
 'film': 13353,
 'alan': 743,
 'pull': 30377,
 'knicker': 20827,
 'ran': 30908,
 'cut': 8777,
 'throat': 38322,
 'ra': 30704,
 'watch': 41737,
 'girl': 15210,
 'find': 13395,
 'rain': 30817,
 'shine': 34322,
 'outsid': 27695,
 'enter': 12026,
 'movi': 25527,
 'hous': 17874,
 'make': 23139,
 'happi': 16569,
 'not': 26796,
 'come': 7483,
 'right': 31998,
 'light': 22036,
 'go': 15372,
 'settl': 33965,
 'bar': 2789,
 'ice': 18220,
 'cream': 8374,
 'move': 25522,
 'pictur': 29019,
 'begin': 3248,
 'flicker': 13641,
 'screen': 33529,
 'feel': 13109,
 'content': 7865,
 'dark': 9044,
 'back': 2509,
 'time': 38465,
 'sit': 34808,
 'around': 1884,
 'campfir': 5573,
 'look': 22436,
 'modern': 25059,
 'version': 41077,
 'flame': 13554,
 'per': 28633,
 'second': 33667,
 'share': 34138,
 'joy': 19965,
 'discov': 10206,
 'unknown': 40258,
 'turn': 39456,
 'twist': 39545,
 'scenario': 33260,
 'rest': 31730,
 'clan': 7001,
 'spectat': 35776,
 'write': 42720,
 'comment': 75

In [37]:
keys_greater_than = []
for key in occurances.keys():
    if occurances.get(key) >= 4000:
        keys_greater_than.append((occurances.get(key), key))


In [38]:
for i in sorted(keys_greater_than, reverse=True):
    print(i)

(43319, 'zwrite')
(43318, 'zwart')
(43317, 'zuucka')
(43316, 'zurich')
(43315, 'zurer')
(43314, 'zuotian')
(43313, 'zuniga')
(43312, 'zulu')
(43311, 'zuleta')
(43310, 'zula')
(43309, 'zukor')
(43308, 'zugsmith')
(43307, 'zuf')
(43306, 'zue')
(43305, 'zudina')
(43304, 'zuckerman')
(43303, 'zucker')
(43302, 'zucco')
(43301, 'zucchini')
(43300, 'zuber')
(43299, 'zubeidaa')
(43298, 'zu')
(43297, 'zsigmond')
(43296, 'zschere')
(43295, 'zsa')
(43294, 'zp')
(43293, 'zorro')
(43292, 'zori')
(43291, 'zorak')
(43290, 'zor')
(43289, 'zoot')
(43288, 'zoom')
(43287, 'zoolog')
(43286, 'zooland')
(43285, 'zooey')
(43284, 'zoo')
(43283, 'zonk')
(43282, 'zone')
(43281, 'zombi')
(43280, 'zoey')
(43279, 'zoe')
(43278, 'zodsworth')
(43277, 'zodiac')
(43276, 'zo')
(43275, 'zlotoff')
(43274, 'zkkeonjpo')
(43273, 'zivagho')
(43272, 'ziva')
(43271, 'zippo')
(43270, 'zippier')
(43269, 'zippi')
(43268, 'zipperfac')
(43267, 'zipper')
(43266, 'zipless')
(43265, 'zip')
(43264, 'zionist')
(43263, 'zinnemann')
(4326

In [39]:
#plt.plot(keys_greater_than, values)
print(keys_greater_than[0:10])


[(41737, 'watch'), (41077, 'version'), (40258, 'unknown'), (42720, 'write'), (41197, 'view'), (40674, 'use'), (41795, 'way'), (40234, 'uniqu'), (42955, 'year'), (41968, 'west')]


In [40]:
print(x.shape)
print(type(x))

(32000, 43320)
<class 'numpy.ndarray'>
