In [8]:
import numpy as np
import scipy.sparse
import pandas as pd
#from mpl_toolkits import mplot3d 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
import tensorflow as tf


# Frame the Problem and Look at the Big Picture 

1. Define the objective in business terms. 

    The business objective is to create a machine learning model that takes a movie goer's review and determines if it is positive or negative. The client can use this to quickly determine the overall reaction to a movie showing. 

2. How will your solution be used? 

    The solution will be used by the client to quickly determine reactions to movie showings. 

3. What are the current solutions/workarounds (if any)? 

    The current solution is to manually poll viewers to determine how they felt about the movie. This is a slow process and the review can be very mixed. 

4. How should you frame this problem (supervised/unsupervised, online/offline, ...)?

    This problem is a supervised offline sentiment analysis problem. 

5. How should performance be measured? Is the performance measure aligned with the business objective? 

    Our metric for this problem will be accuracy. The most important function of this model is to accurately measure the sentiment of the review, so it is important that our model has high accuracy.

6. What would be the minimum performance needed to reach the business objective? 

    The minimum performance required is 80% accuracy. 

7. What are comparable problems? Can you reuse experience or tools? 

    A comparable problem was using by a youtuber named Micheal Reeves. He used sentiment analysis to determine if reddit posts on r/wallstreetbets were positive or negative. We can reuse the experience but not the tools he used. There are resuable tools in the textbook for this course that we will be using. 

8. Is human expertise available? 

    No human expertise is availible at this time. 

9. How would you solve the problem manually? 

    To solve this problem manually, we would interview the movie goers individually to determine their feelings about the movie. 

10.  List the assumptions you (or others) have made so far. Verify assumptions if possible. 

    - A review can either be positive or negative. There are no "meh" reviews. 

# Get the Data 

1. List the data you need and how much you need 

    We need movie reviews and a label that determines their sentiment (1 being positive, 0 being negative). We need over 1000.

2. Find and document where you can get that data 

    The data can be gathered form the open source website Kaggle.

3. Get access authorizations 

    The webiste and data is open source, so we have full access to the data.

4. Create a workspace (with enough storage space) 

    This notebook.

5. Get the data 

    The data was downloaded from kaggle and inserted into this repository.
    Link: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis?resource=download

6. Convert the data to a format you can easily manipulate (without changing the data itself) 

    Made into a pandas dataframe.

7. Ensure sensitive information is deleted or protected (e.g. anonymized) 

    Not applicable to this situation.

8. Check the size and type of data (time series, geographical, ...) 

    - text: object: 40000 entries
    - label: int: 400000 entries
    
9. Sample a test set, put it aside, and never look at it (no data snooping!) 

In [9]:
data = pd.read_csv("movie.csv")

In [10]:
data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [12]:
train_set, test_set = train_test_split(data, stratify=data['label'], test_size=0.2, random_state=420)

# Explore the Data 

1. Copy the data for exploration, downsampling to a manageable size if necessary. 


2. Study each attribute and its characteristics: Name; Type (categorical, numerical, bounded, text, structured, ...); % of missing values; Noisiness and type of noise (stochastic, outliers, rounding errors, ...); Usefulness for the task; Type of distribution (Gaussian, uniform logarithmic, ...) 

    - text is a object datatype with no missing data.
    - label is an interger value of either 1 or a 0. 1 is positive, 0 is negative. There is no missing data. In the data set, there is almost an equal number of positive and negative labels.


3. For supervised learning tasks, identify the target attribute(s) 

    The target attribute is the label feature. It is the sentiment of the text.

4. Visualize the data 


5. Study the correlations between attributes 


6. Study how you would solve the problem manually 


7. Identify the promising transformations you may want to apply 


8. Identify extra data that would be useful (go back to “Get the Data”) 


9. Document what you have learned 
    - With statification, there is almost an equal number of text with positive/negative sentiment.

In [13]:
train_copy = train_set.copy()

In [14]:
train_copy['label'].value_counts()

0    16015
1    15985
Name: label, dtype: int64

In [15]:
CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)\n",
                              stop_words='english')
CountVec.fit(train_copy['text'])

CountVectorizer(stop_words='english')

In [28]:
occurances = CountVec.vocabulary_
occurances

{'best': 7922,
 'bit': 8418,
 'film': 27585,
 'alan': 2776,
 'pulled': 59270,
 'knickers': 41552,
 'ran': 60329,
 'cut': 18168,
 'throat': 75122,
 'razor': 60658,
 'bum': 10884,
 'cheeks': 13394,
 'hole': 35236,
 'brilliant': 10279,
 'going': 31450,
 'like': 43711,
 'fiddler': 27473,
 'elbow': 23882,
 'later': 42705,
 'br': 9829,
 'tough': 76100,
 'hell': 34339,
 'got': 31748,
 'annoyed': 3938,
 'pushed': 59483,
 'eyed': 26302,
 'wimp': 82454,
 'sofa': 69347,
 've': 79919,
 'laughing': 42773,
 'days': 18838,
 'idea': 36499,
 'script': 65796,
 'writers': 83208,
 'brought': 10503,
 'eastenders': 23368,
 'peggy': 55389,
 'time': 75386,
 'armed': 4752,
 'watch': 81425,
 'girl': 31013,
 'finds': 27689,
 'pulls': 59279,
 'rain': 60165,
 'shine': 67243,
 'outside': 53753,
 'enter': 24777,
 'movie': 49863,
 'house': 35814,
 'makes': 45607,
 'happy': 33538,
 'come': 15290,
 'right': 62957,
 'lights': 43699,
 'settle': 66609,
 'bar': 6608,
 'ice': 36448,
 'cream': 17393,
 'moving': 49895,
 'pict

In [58]:
keys_greater_than = []
for key in occurances.keys():
    if int(occurances[key]) > 84400:
        keys_greater_than.append(key)

values = []
for key in keys_greater_than:
    values.append(occurances[key])

print(len(keys_greater_than))

45


In [61]:
#plt.plot(keys_greater_than, values)
print(keys_greater_than)
print(occurances['åmål'])


['åmål', 'über', 'æon', 'özdemir', 'úber', 'émigré', 'æsthetic', 'était', 'étoile', 'übermenschlich', 'élan', 'þorleifsson', 'étienne', 'þór', 'époque', 'émigrés', 'être', 'ánd', 'écoffey', 'étc', 'ïn', 'ünfaithful', 'óli', 'ýs', 'żmijewski', 'äänekoski', 'zübert', 'âge', 'ünel', 'ís', 'übermensch', 'überwoman', 'æbler', 'יגאל', 'כרמון', 'évery', 'önsjön', 'zázvorková', 'üvegtigris', 'ànd', 'état', 'üzümcü', 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', 'är']
84410


In [17]:
vector = CountVec.transform(train_copy['text'])

In [18]:
print(vector.shape)
print(type(vector))
print(vector.toarray())

(32000, 84446)
<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
