In [1]:
import numpy as np
import scipy.sparse
import pandas as pd
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#from mpl_toolkits import mplot3d 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from string import punctuation
from sklearn.model_selection import train_test_split
import tensorflow as tf
from collections import Counter
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harisnaveed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Frame the Problem and Look at the Big Picture 

1. Define the objective in business terms. 

    The business objective is to create a machine learning model that takes a movie goer's review and determines if it is positive or negative. The client can use this to quickly determine the overall reaction to a movie showing. 

2. How will your solution be used? 

    The solution will be used by the client to quickly determine reactions to movie showings. 

3. What are the current solutions/workarounds (if any)? 

    The current solution is to manually poll viewers to determine how they felt about the movie. This is a slow process and the review can be very mixed. 

4. How should you frame this problem (supervised/unsupervised, online/offline, ...)?

    This problem is a supervised offline sentiment analysis problem. 

5. How should performance be measured? Is the performance measure aligned with the business objective? 

    Our metric for this problem will be accuracy. The most important function of this model is to accurately measure the sentiment of the review, so it is important that our model has high accuracy.

6. What would be the minimum performance needed to reach the business objective? 

    The minimum performance required is 80% accuracy. 

7. What are comparable problems? Can you reuse experience or tools? 

    A comparable problem was using by a youtuber named Micheal Reeves. He used sentiment analysis to determine if reddit posts on r/wallstreetbets were positive or negative. We can reuse the experience but not the tools he used. There are resuable tools in the textbook for this course that we will be using. 

8. Is human expertise available? 

    No human expertise is availible at this time. 

9. How would you solve the problem manually? 

    To solve this problem manually, we would interview the movie goers individually to determine their feelings about the movie. 

10.  List the assumptions you (or others) have made so far. Verify assumptions if possible. 

    - A review can either be positive or negative. There are no "meh" reviews. 

# Get the Data 

1. List the data you need and how much you need 

    We need movie reviews and a label that determines their sentiment (1 being positive, 0 being negative). We need over 1000.

2. Find and document where you can get that data 

    The data can be gathered form the open source website Kaggle.

3. Get access authorizations 

    The webiste and data is open source, so we have full access to the data.

4. Create a workspace (with enough storage space) 

    This notebook.

5. Get the data 

    The data was downloaded from kaggle and inserted into this repository.
    Link: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis?resource=download

6. Convert the data to a format you can easily manipulate (without changing the data itself) 

    Made into a pandas dataframe.

7. Ensure sensitive information is deleted or protected (e.g. anonymized) 

    Not applicable to this situation.

8. Check the size and type of data (time series, geographical, ...) 

    - text: object: 40000 entries
    - label: int: 400000 entries
    
9. Sample a test set, put it aside, and never look at it (no data snooping!) 

In [2]:
data = pd.read_csv("movie.csv")

In [3]:
data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [5]:
train_set, test_set = train_test_split(data, stratify=data['label'], test_size=0.2, random_state=420)

# Explore the Data 

1. Copy the data for exploration, downsampling to a manageable size if necessary. 


2. Study each attribute and its characteristics: Name; Type (categorical, numerical, bounded, text, structured, ...); % of missing values; Noisiness and type of noise (stochastic, outliers, rounding errors, ...); Usefulness for the task; Type of distribution (Gaussian, uniform logarithmic, ...) 

    - text is a object datatype with no missing data.
    - label is an interger value of either 1 or a 0. 1 is positive, 0 is negative. There is no missing data. In the data set, there is almost an equal number of positive and negative labels.


3. For supervised learning tasks, identify the target attribute(s) 

    The target attribute is the label feature. It is the sentiment of the text.

4. Visualize the data 


5. Study the correlations between attributes 


6. Study how you would solve the problem manually 


7. Identify the promising transformations you may want to apply 


8. Identify extra data that would be useful (go back to “Get the Data”) 


9. Document what you have learned 
    - With statification, there is almost an equal number of text with positive/negative sentiment.

In [6]:
train_copy = train_set.copy()

In [7]:
train_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32000 entries, 33632 to 13973
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    32000 non-null  object
 1   label   32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 750.0+ KB


In [8]:
train_copy['label'].value_counts()

0    16015
1    15985
Name: label, dtype: int64

In [9]:
def preprocess(X_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch

# Bag of Words

In [25]:
corpus = []
all_stop_words = stopwords.words('english')
all_stop_words.remove('not')
for text in train_copy['text']:
    review = re.sub("<br\\s*/?>", " ", text)
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = re.sub("z{3}", " ", review)
    review = review.lower()
    review = review.split()
    #ps = PorterStemmer()
    review = [word for word in review if word not in set(all_stop_words)]
    review = " ".join(review)
    corpus.append(review)
    


In [27]:
corpus

['best bit film alan pulled knickers ran cut throat razor bum cheeks around bum hole also brilliant see alan bum going like fiddler elbow later film alan tough hell like got annoyed pushed four eyed wimp onto sofa laughing days cut throat razor bit brilliant idea script writers alan must brought back eastenders peggy alan back time armed razor watch girl finds pulls knickers',
 'rain shine outside enter movie house makes happy not come right lights go settle bar ice cream moving pictures begin flicker screen feel content dark back beginning time sitting around campfire looking modern version flickering flames times per second sharing joy discovering unknown turns twists scenario rest clan spectators not happy not write comments long live romantic comedies',
 'memorable line short lived show viewing episode line introduced fraternity intramural flag football team started using line break huddles offense instead ready break quarter back said football rest squad responded bet fun way brea

In [28]:
CountVec = CountVectorizer(ngram_range=(1,1),  # to use bigrams ngram_range=(2,2)\n",
                              stop_words='english')
x = CountVec.fit_transform(corpus)

In [29]:
x

<32000x82556 sparse matrix of type '<class 'numpy.int64'>'
	with 2723468 stored elements in Compressed Sparse Row format>

In [30]:
occurances = CountVec.vocabulary_
occurances

{'best': 6648,
 'bit': 7142,
 'film': 26151,
 'alan': 1532,
 'pulled': 57588,
 'knickers': 39989,
 'ran': 58627,
 'cut': 16819,
 'throat': 73358,
 'razor': 58957,
 'bum': 9599,
 'cheeks': 12078,
 'hole': 33725,
 'brilliant': 8998,
 'going': 29989,
 'like': 42139,
 'fiddler': 26039,
 'elbow': 22471,
 'later': 41135,
 'tough': 74331,
 'hell': 32830,
 'got': 30286,
 'annoyed': 2681,
 'pushed': 57801,
 'eyed': 24890,
 'wimp': 80643,
 'sofa': 67610,
 'laughing': 41202,
 'days': 17471,
 'idea': 34979,
 'script': 64090,
 'writers': 81391,
 'brought': 9220,
 'eastenders': 21959,
 'peggy': 53727,
 'time': 73619,
 'armed': 3489,
 'watch': 79618,
 'girl': 29551,
 'finds': 26253,
 'pulls': 57597,
 'rain': 58463,
 'shine': 65520,
 'outside': 52109,
 'enter': 23365,
 'movie': 48248,
 'house': 34300,
 'makes': 44005,
 'happy': 32034,
 'come': 13951,
 'right': 61266,
 'lights': 42127,
 'settle': 64896,
 'bar': 5337,
 'ice': 34929,
 'cream': 16048,
 'moving': 48279,
 'pictures': 54688,
 'begin': 6146,


In [31]:
keys_greater_than = []
for key in occurances.keys():
    if int(occurances[key]) >= 9000:
        keys_greater_than.append((occurances[key], key))


In [32]:
for i in sorted(keys_greater_than, reverse=True):
    print(i)

(82555, 'zzzzzzzzzzzzz')
(82554, 'zzzzzzzzzz')
(82553, 'zzzzzzzz')
(82552, 'zz')
(82551, 'zyada')
(82550, 'zy')
(82549, 'zwrite')
(82548, 'zwick')
(82547, 'zwarts')
(82546, 'zwart')
(82545, 'zvorkov')
(82544, 'zvonimir')
(82543, 'zvezda')
(82542, 'zuucka')
(82541, 'zurich')
(82540, 'zurer')
(82539, 'zuotian')
(82538, 'zunz')
(82537, 'zuniga')
(82536, 'zuni')
(82535, 'zungia')
(82534, 'zumhofe')
(82533, 'zulu')
(82532, 'zuleta')
(82531, 'zuleika')
(82530, 'zula')
(82529, 'zukovic')
(82528, 'zukor')
(82527, 'zukhov')
(82526, 'zugurt')
(82525, 'zugsmith')
(82524, 'zuger')
(82523, 'zuf')
(82522, 'zues')
(82521, 'zudina')
(82520, 'zuckers')
(82519, 'zuckerman')
(82518, 'zucker')
(82517, 'zucher')
(82516, 'zuccon')
(82515, 'zucco')
(82514, 'zucchini')
(82513, 'zuber')
(82512, 'zubeidaa')
(82511, 'zu')
(82510, 'zsrs')
(82509, 'zsrr')
(82508, 'zsigmond')
(82507, 'zshornack')
(82506, 'zschering')
(82505, 'zsa')
(82504, 'zp')
(82503, 'zozo')
(82502, 'zowee')
(82501, 'zow')
(82500, 'zounds')
(824

In [None]:
#plt.plot(keys_greater_than, values)
print(keys_greater_than)


['film', 'pull', 'knicker', 'ran', 'cut', 'throat', 'razor', 'hole', 'like', 'fiddler', 'elbow', 'later', 'tough', 'hell', 'got', 'push', 'eye', 'wimp', 'sofa', 'laugh', 'day', 'idea', 'script', 'writer', 'eastend', 'peggi', 'time', 'watch', 'girl', 'rain', 'shine', 'outsid', 'enter', 'movi', 'hous', 'make', 'happi', 'come', 'right', 'light', 'settl', 'ice', 'cream', 'pictur', 'flicker', 'screen', 'feel', 'content', 'dark', 'sit', 'look', 'modern', 'version', 'flame', 'second', 'share', 'joy', 'discov', 'unknown', 'turn', 'twist', 'scenario', 'rest', 'clan', 'spectat', 'write', 'comment', 'long', 'live', 'romant', 'comedi', 'memor', 'line', 'short', 'view', 'episod', 'introduc', 'fratern', 'intramur', 'flag', 'footbal', 'team', 'start', 'use', 'huddl', 'offens', 'instead', 'readi', 'quarter', 'said', 'squad', 'respond', 'fun', 'way', 'oppon', 'scratch', 'head', 'uniqu', 'element', 'season', 'forget', 'colleg', 'year', 'small', 'pretti', 'stinker', 'memori', 'pi', 'kappa', 'phi', 'west'

In [None]:
print(x.shape)
print(type(x))

(32000, 56737)
<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
print(x.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
