In [72]:
import pandas as pd
import warnings
import re
import string
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

In [73]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head(5)
#you need to modify this part of code if you are not using kaggle notebook

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [74]:
df["review"] = df['review'].str.lower()

In [75]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [76]:
df['review'] = df['review'].apply(remove_html_tags)


In [77]:
df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [78]:
exclude = string.punctuation

In [79]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [80]:
df['review'] = df['review'].apply(remove_punc)

In [81]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [82]:
total_words = sum(len(r.split()) for r in df['review'])
total_words


11312801

In [83]:
vocab = set()
for r in df['review']:
    vocab.update(r.split())
print(len(vocab))

222751


**One_hot_encodind**

In [84]:
df_labels_ohe = pd.get_dummies(df['sentiment'], dtype=int) 

print(df_labels_ohe.head())


   negative  positive
0         0         1
1         0         1
2         0         1
3         1         0
4         0         1


In [85]:
ohe = OneHotEncoder(sparse=False)  # returns a numpy array
labels = df['sentiment'].values.reshape(-1,1)
labels_encoded = ohe.fit_transform(labels)


In [86]:
print(labels_encoded[:5])
print(ohe.categories_)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]
[array(['negative', 'positive'], dtype=object)]


**Bag Of Word**

In [87]:
cv = CountVectorizer(stop_words='english')

In [88]:
bow = cv.fit_transform(df["review"])
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4454260 stored elements and shape (50000, 221125)>

In [89]:
vocab = list(cv.vocabulary_.items())
vocab[0:10]

[('reviewers', 163031),
 ('mentioned', 123352),
 ('watching', 211869),
 ('just', 105575),
 ('oz', 143069),
 ('episode', 64473),
 ('youll', 219813),
 ('hooked', 93574),
 ('right', 163803),
 ('exactly', 66424)]

In [90]:
print(bow[0].toarray())
print(bow[2].toarray())
#.
#.
#.
print(bow[49000].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


**Bi_gram**

In [91]:
cv = CountVectorizer(ngram_range=(2,2),stop_words='english')
#bi_gram (1,2) uni and bi_gram

In [92]:
bi_gram = cv.fit_transform(df['review'])
bi_gram

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5182219 stored elements and shape (50000, 3175777)>

In [93]:
vocab = list(cv.vocabulary_.items())
vocab[0:10]

[('reviewers mentioned', 2334171),
 ('mentioned watching', 1782849),
 ('watching just', 3052042),
 ('just oz', 1505628),
 ('oz episode', 2008482),
 ('episode youll', 896726),
 ('youll hooked', 3164515),
 ('hooked right', 1335012),
 ('right exactly', 2345509),
 ('exactly happened', 920784)]

In [94]:
print(bi_gram[0].toarray())
print(bi_gram[2].toarray())
#.
#.
#.
print(bi_gram[47010].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


**Tri_gram**

In [95]:
cv = CountVectorizer(ngram_range=(3,3),stop_words='english')
cv

In [96]:
tri_gram = cv.fit_transform(df['review'])
tri_gram

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5201694 stored elements and shape (50000, 4930628)>

In [97]:
vocab = list(cv.vocabulary_.items())
vocab[0:10]

[('reviewers mentioned watching', 3631892),
 ('mentioned watching just', 2770327),
 ('watching just oz', 4725996),
 ('just oz episode', 2300631),
 ('oz episode youll', 3145728),
 ('episode youll hooked', 1319790),
 ('youll hooked right', 4909017),
 ('hooked right exactly', 2038936),
 ('right exactly happened', 3648039),
 ('exactly happened methe', 1353369)]

In [98]:
print(tri_gram[0].toarray())
print(tri_gram[29].toarray())
#.
#.
#.
print(tri_gram[46510].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


**TF-IDF**

In [99]:
tfidf = TfidfVectorizer(stop_words='english')

In [100]:
tfidf.fit_transform(df['review']).toarray()
tfidf

In [101]:
vocab = list(tfidf.vocabulary_.items())
vocab[0:10]

[('reviewers', 163031),
 ('mentioned', 123352),
 ('watching', 211869),
 ('just', 105575),
 ('oz', 143069),
 ('episode', 64473),
 ('youll', 219813),
 ('hooked', 93574),
 ('right', 163803),
 ('exactly', 66424)]

In [102]:
print(len(tfidf.vocabulary_))

221125


In [103]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[ 9.87388814 10.721186   11.1266511  ... 11.1266511  11.1266511
 11.1266511 ]
['00' '000' '0000000000001' ... 'þór' 'יגאל' 'כרמון']
