In [3]:
# Load the watermark extension and print the installed versions of torch,
# lightning and pandas together with the active conda environment, so the
# environment behind the saved outputs is recorded.
%load_ext watermark
%watermark -p torch,lightning,pandas --conda

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
torch    : 2.1.1+cpu
lightning: 2.1.3
pandas   : 1.4.4

conda environment: base



In [4]:
import os.path as op

import numpy as np
import pandas as pd

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe
from local_dataset_utilities import partition_dataset, IMDBDataset

In [5]:
# Presumably fetches the raw dataset used below. The helper lives in
# local_dataset_utilities, so its download target and any caching are not
# visible here — TODO confirm before re-running.
download_dataset()

In [6]:
# Build the full DataFrame through the project helper (the progress bar in the
# saved output counts 50000 items) and preview the first rows.
# NOTE(review): every previewed row carries index label 0, so the index looks
# non-unique — presumably the helper concatenates rows without resetting the
# index; confirm, and prefer positional access (.iloc) until then.
df = load_dataset_into_to_dataframe()

df.head()

100%|██████████| 50000/50000 [08:00<00:00, 104.00it/s]

Class distribution:





Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
0,Actor turned director Bill Paxton follows up h...,1
0,As a recreational golfer with some knowledge o...,1
0,"I saw this film in a sneak preview, and it is ...",1
0,Bill Paxton has taken the true story of the 19...,1


In [7]:
# Inspect the first record by position; the saved output shows its `text`
# and `label` fields.
df.iloc[0]

text     I went and saw this movie last night after bei...
label                                                    1
Name: 0, dtype: object

In [8]:
# Split `df` through the project helper; its return value is discarded here.
# Presumably it writes train.csv / val.csv / test.csv, which the following
# cells read back — TODO confirm in local_dataset_utilities.
partition_dataset(df)

In [9]:
# Read the training split from train.csv in the working directory and show
# its last rows as a sanity check.
# NOTE(review): the `index` column is 0 in every displayed row, so it appears
# to carry no information — presumably a leftover from an index reset before
# the CSV was written; confirm before relying on it.
df_train = pd.read_csv("train.csv")
df_train.tail()

Unnamed: 0,index,text,label
34995,0,Frank Capra's creativity must have been just a...,0
34996,0,Just saw the film tonight in a preview and it'...,0
34997,0,"If you love Japanese monster movies, you'll lo...",1
34998,0,Because it came from HBO and based on the IMDb...,0
34999,0,"WARNING!!! SOME POSSIBLE PLOT SPOILERS, AS IF ...",0


In [11]:
# Count training examples per label value (position k = number of rows with
# label k); the saved output shows 17452 zeros and 17548 ones, i.e. a nearly
# balanced split.
np.bincount(df_train["label"])

array([17452, 17548], dtype=int64)

In [12]:
# Read the validation split from val.csv in the working directory and show
# its last rows as a sanity check.
# NOTE(review): as displayed, the `index` column is 0 for every row shown —
# it appears to carry no information; confirm before relying on it.
df_val = pd.read_csv("val.csv")
df_val.tail()

Unnamed: 0,index,text,label
4995,0,The Matador is a strange film. Its main charac...,1
4996,0,Not bad performances. Whoopi plays the wise/wa...,0
4997,0,I was surprised when I saw this film. I'd hear...,0
4998,0,When great director/actor combinations are tal...,0
4999,0,This show is non Stop hilarity. the first joke...,1


In [13]:
# Count validation examples per label value; the saved output shows 2542
# zeros and 2458 ones.
np.bincount(df_val["label"])

array([2542, 2458], dtype=int64)

In [14]:
# Read the test split from test.csv in the working directory and show its
# last rows as a sanity check.
# NOTE(review): as displayed, the `index` column is 0 for every row shown —
# it appears to carry no information; confirm before relying on it.
df_test = pd.read_csv("test.csv")
df_test.tail()

Unnamed: 0,index,text,label
9995,0,Every generation fully believes it is living i...,0
9996,0,Possibly the most brilliant thing about Che: P...,1
9997,0,I was unsure of this movie before renting and ...,1
9998,0,"Just got out of an advance screening, and wow ...",1
9999,0,I sense out there a mix of confusion and varyi...,1


In [16]:
# Count test examples per label value; the saved output shows 5006 zeros and
# 4994 ones.
np.bincount(df_test["label"])

array([5006, 4994], dtype=int64)

## 2) Bag-of-Words Model

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words featurizer: lowercases the text, drops scikit-learn's built-in
# English stop words, and caps the vocabulary at 10,000 terms (the most
# frequent ones across the corpus it is fitted on).
cv = CountVectorizer(lowercase=True, max_features=10_000, stop_words="english")

In [20]:
# Learn the vocabulary from the training texts only, so validation and test
# text does not influence which terms are kept.
cv.fit(df_train["text"])

CountVectorizer(max_features=10000, stop_words='english')

In [21]:
# Show the learned term -> feature-column-index mapping.
# NOTE(review): this displays the whole dictionary; consider previewing only
# a handful of items to keep the saved output short.
cv.vocabulary_

{'started': 8515,
 'watching': 9725,
 'series': 7957,
 'cable': 1320,
 'idea': 4488,
 'hate': 4191,
 'character': 1544,
 'hold': 4339,
 'beautifully': 892,
 'developed': 2574,
 'understand': 9375,
 'react': 7196,
 'frustration': 3737,
 'fear': 3439,
 'greed': 4020,
 'temptation': 8974,
 'way': 9736,
 'viewer': 9574,
 'experiencing': 3280,
 'christopher': 1656,
 'learning': 5199,
 'br': 1151,
 'abuse': 188,
 'physically': 6608,
 'emotionally': 3046,
 'just': 4963,
 'read': 7199,
 'newspaper': 6088,
 'women': 9880,
 'tolerate': 9134,
 'behavior': 915,
 'dream': 2831,
 'house': 4418,
 'endless': 3074,
 'supply': 8779,
 'expensive': 3276,
 'things': 9036,
 'sure': 8791,
 'loving': 5426,
 'faithful': 3371,
 'husband': 4465,
 'maybe': 5640,
 'watch': 9719,
 'doesn': 2754,
 'matter': 5630,
 'times': 9104,
 'episode': 3140,
 'missed': 5813,
 'episodes': 3141,
 'sequence': 7950,
 'season': 7869,
 'late': 5151,
 'night': 6101,
 'commercials': 1874,
 'language': 5133,
 'reruns': 7427,
 'movie': 5

In [22]:
# Encode every split with the vocabulary fitted on the training texts;
# each result is a sparse document-term count matrix.
X_train, X_val, X_test = (
    cv.transform(split["text"]) for split in (df_train, df_val, df_test)
)

In [24]:
# Rows are training documents, columns are vocabulary terms
# (35000 x 10000 in the saved output).
X_train.shape

(35000, 10000)

In [25]:
# First training document as a 1 x 10000 sparse row; the saved output
# reports 74 stored (non-zero) entries.
X_train[0]

<1x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 74 stored elements in Compressed Sparse Row format>

In [26]:
# Densify the first document's sparse row into a 1-D count vector.
# `toarray()` returns a plain ndarray directly, avoiding the `np.matrix`
# intermediate (and the extra copy) that `todense()` + `np.array(...)` needed.
feat_vec = X_train[0].toarray()[0]
feat_vec

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
# Confirm the dense vector is 1-D with one entry per vocabulary term.
feat_vec.shape

(10000,)

In [28]:
# Histogram of the per-term counts: position k holds how many vocabulary
# terms occur exactly k times in this document. In the saved output the
# non-zero buckets (67 + 5 + 1 + 1) add up to the 74 stored entries of the
# sparse row.
np.bincount(feat_vec)

array([9926,   67,    5,    0,    1,    0,    1], dtype=int64)

In [29]:
# Display the raw training texts for comparison with the encoded features
# (pandas elides the middle of a long Series).
df_train["text"]

0        When we started watching this series on cable,...
1        Steve Biko was a black activist who tried to r...
2        My short comment for this flick is go pick it ...
3        As a serious horror fan, I get that certain ma...
4        Robert Cummings, Laraine Day and Jean Muir sta...
                               ...                        
34995    Frank Capra's creativity must have been just a...
34996    Just saw the film tonight in a preview and it'...
34997    If you love Japanese monster movies, you'll lo...
34998    Because it came from HBO and based on the IMDb...
Name: text, Length: 35000, dtype: object