In [1]:
import numpy as np 
import pandas as pd
import re 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import *
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Load the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv")

In [3]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


#### checking whether the dataset is balanced

In [4]:
# Count the non-null entries of the remaining columns per sentiment label,
# which shows how many rows each class contributes.
df.groupby("sentiment").count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


#### encoding sentiment values

In [5]:
# Encode the label numerically: 1 where sentiment is "positive", 0 otherwise.
# The comparison has to be made against the column values themselves; testing
# a bare non-empty string literal is always truthy and would label every row 1.
df["sentiment_encoding"] = (df["sentiment"] == "positive").astype(int)
df.head(3)

Unnamed: 0,review,sentiment,sentiment_encoding
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1


#### splitting dataset

In [6]:
# Hold out 20% of the rows for testing: draw a reproducible 80% random sample
# for training and keep every row that was not sampled as the test set.
# NOTE(review): the Series below are taken from `df` as it exists when this
# cell runs; later reassignments of df["review"] are not reflected in them.
train_dataset = df.sample(frac=0.8, random_state=0)
test_dataset = df.drop(train_dataset.index)

train_reviews, train_sentis = train_dataset.review, train_dataset.sentiment
test_reviews, test_sentis = test_dataset.review, test_dataset.sentiment

# Report the (reviews, labels) shapes for the training split, then the test split.
for reviews, sentis in ((train_reviews, train_sentis), (test_reviews, test_sentis)):
    print(reviews.shape, sentis.shape)

(40000,) (40000,)
(10000,) (10000,)


#### text preprocessing

In [7]:
# Shared tokenizer instance; remove_stopwords() uses it to split text into tokens.
tokenizer = ToktokTokenizer()

# remove html strips
def strip_html(text):
    """Return ``text`` with HTML markup removed.

    The text nodes are joined with a single space rather than concatenated
    directly, so that tags acting as separators (e.g. ``<br /><br />``) do not
    fuse the words on either side of them into one token.

    Args:
        text: Raw string that may contain HTML tags.

    Returns:
        The textual content of ``text`` with tags dropped.
    """
    soup = BeautifulSoup(text, "html.parser")
    # separator=" " trades a possible extra space inside inline-formatted
    # words for never gluing two separate words together.
    return soup.get_text(separator=" ")

# keep only alphabetic words
def alpha_text(text):
    """Replace every run of non-letter, non-whitespace characters with a space.

    Digits, underscores, punctuation and other symbols are removed; letters
    (including non-ASCII ones) and the original whitespace are kept. A space is
    substituted instead of an empty string so that words separated only by
    punctuation (``"great...but"``) stay separate.

    Args:
        text: Input string.

    Returns:
        The string with only letters and whitespace remaining.
    """
    # [^\w\s] matches punctuation/symbols; [\d_] adds the digits and the
    # underscore, which \w would otherwise let through.
    return re.sub(r"(?:[^\w\s]|[\d_])+", " ", text)

def clean_words(text):
    """Run ``text`` through strip_html() and then alpha_text().

    Args:
        text: Raw review string.

    Returns:
        Whatever alpha_text() returns for the HTML-stripped text.
    """
    without_markup = strip_html(text)
    return alpha_text(without_markup)

# Overwrite the review column with the result of clean_words() for each row.
df["review"] = df["review"].apply(clean_words)



#### removing stopwords and stemming

In [8]:
# NLTK's English stopword list; remove_stopwords() filters tokens against it.
stop_list = stopwords.words("english")

def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from ``text`` and return the remaining tokens joined by spaces.

    Args:
        text: String to filter; it is split with the module-level ``tokenizer``.
        is_lower_case: When True, tokens are compared against ``stop_list``
            exactly as they are. When False (default), each token is
            lower-cased for the comparison only; the surviving tokens keep
            their original casing.

    Returns:
        The surviving tokens joined with a single space.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan over the whole stopword list.
    stop_set = set(stop_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [token for token in tokens if token not in stop_set]
    else:
        kept = [token for token in tokens if token.lower() not in stop_set]
    return ' '.join(kept)

# Overwrite the review column with the stopword-filtered text
# (is_lower_case keeps its default of False).
df["review"] = df["review"].apply(remove_stopwords)

In [9]:
def stemmer(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Args:
        text: String whose words should be stemmed.

    Returns:
        The stemmed words joined with a single space. Returning the string is
        required: without it, applying this function over a column replaces
        every value with None.
    """
    # Named `porter` so the local does not shadow this function's own name.
    porter = PorterStemmer()
    return ' '.join(porter.stem(word) for word in text.split())

# Overwrite the review column with whatever stemmer() returns for each row.
df["review"] = df["review"].apply(stemmer)