### Import Required Libraries

In [None]:
import re
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

### Load Data

In [None]:
# Load the IMDB movie-review CSV from the Kaggle input directory into a DataFrame.
data_frame = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

#### Preview Data

In [None]:
# Show the first few rows as a quick sanity check of the loaded columns.
data_frame.head()

#### Encode Sentiment Variable

In [None]:
# Encode the target as integers: 1 for "positive", 0 for every other value.
is_positive = data_frame["sentiment"] == "positive"
data_frame["sentiment"] = is_positive.astype("int64")

#### Distribution of Target Variable

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
sns.countplot(data=data_frame, x="sentiment")
plt.show()

#### Let's look at a sample review

In [None]:
# Display all columns of the row at positional index 1 as a raw array.
data_frame.iloc[1, :].values

In [None]:
stop_words = set(stopwords.words("english"))

# Compiled once so the per-review call does not rebuild the patterns.
_HTML_TAG_RE = re.compile(r"<.*?>")
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def clean_text(text):
    """Return a lower-cased, stop-word-free version of a raw review string.

    Steps, in order:
      1. Collapse every run of whitespace into a single space.
      2. Replace HTML tags with a space, so words on either side of a tag
         (e.g. ``end.<br /><br />Next``) are not fused into one token.
      3. Delete ASCII punctuation characters.
      4. Tokenize with ``word_tokenize``, lower-case each token, and drop
         tokens found in ``stop_words``.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalized = " ".join(text.split())

    without_tags = _HTML_TAG_RE.sub(" ", normalized)
    without_punctuation = _PUNCTUATION_RE.sub("", without_tags)

    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so capitalised stop words such as "The" would otherwise slip through.
    tokens = (token.lower() for token in word_tokenize(without_punctuation))
    return " ".join(token for token in tokens if token not in stop_words)

In [None]:
# Clean every review in place; the raw text in the "review" column is overwritten.
data_frame["review"] = data_frame["review"].apply(clean_text)

#### After cleaning text

In [None]:
# Display the row at positional index 1 again, now with the cleaned review text.
data_frame.iloc[1, :].values

In [None]:
def model_evalution(y_test, y_hat, model_name, y_score=None):
    """Print and plot evaluation metrics for one classifier.

    Prints the accuracy, ROC AUC and classification report, then shows the
    confusion matrix as an annotated heatmap.

    Args:
        y_test: True labels of the evaluation samples.
        y_hat: Predicted labels for the same samples.
        model_name: Display name used in the printed output and plot title.
        y_score: Optional continuous scores for the positive class (for
            example ``model.predict_proba(x)[:, 1]``) used for the ROC AUC.
            When omitted, the ROC AUC is computed from ``y_hat`` exactly as
            before; with hard labels that value describes a single operating
            point rather than the ranking quality over all thresholds.

    Returns:
        A tuple ``(accuracy, roc_auc)``.
    """
    accuracy = accuracy_score(y_test, y_hat)
    # Prefer real scores for the ROC AUC when the caller supplies them.
    roc_auc = roc_auc_score(y_test, y_hat if y_score is None else y_score)

    print(f"Model Name: {model_name}")
    print("Model Accuracy: ", accuracy)
    print("ROC AUC Score: ", roc_auc)

    print(classification_report(y_test, y_hat))

    # Rows are the actual classes and columns the predicted classes.
    matrix = confusion_matrix(y_test, y_hat)
    sns.heatmap(matrix, annot=True, fmt=".0f", cmap="YlGnBu")
    plt.xlabel("Predicted Values")
    plt.ylabel("Actual Values")
    plt.title(f"{model_name} Validation Matrix\n\n")
    plt.show()

    return accuracy, roc_auc

In [None]:
# Features are the review strings; targets are the values of the sentiment column.
X = data_frame["review"].values
Y = data_frame["sentiment"].values
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# token_pattern=None because a custom tokenizer is supplied and the default
# pattern would otherwise be unused.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# Learn the vocabulary and IDF weights from the training split only, so no
# statistics from the held-out reviews leak into the features.
x_train_vec = tfidf.fit_transform(x_train)
x_test_vec = tfidf.transform(x_test)

In [None]:
# Logistic regression with library-default settings, fitted on the TF-IDF training features.
lr_model = LogisticRegression()
lr_model.fit(x_train_vec, y_train)

In [None]:
# Predicted class labels for the held-out reviews.
lr_preds = lr_model.predict(x_test_vec)

In [None]:
# Report the metrics and keep both scores for the comparison charts at the end.
lr_accuracy, lr_roc_score = model_evalution(y_test, lr_preds, "Logistic Regression")

In [None]:
# XGBoost classifier with library-default settings, fitted on the TF-IDF training features.
xgb_model = XGBClassifier()
xgb_model.fit(x_train_vec, y_train)

In [None]:
# Predict labels for the held-out reviews, then report and store the scores.
xgb_preds = xgb_model.predict(x_test_vec)
xgb_accuracy, xgb_roc_score = model_evalution(y_test, xgb_preds, "XGB Classifier")

In [None]:
# LightGBM classifier with library-default settings, fitted on the TF-IDF training features.
lgb_model = LGBMClassifier()
lgb_model.fit(x_train_vec, y_train)

In [None]:
# Predict labels for the held-out reviews, then report and store the scores.
lgb_preds = lgb_model.predict(x_test_vec)
lgb_accuracy, lgb_roc_score = model_evalution(y_test, lgb_preds, "LightGBM Classifier")

In [None]:
# Multinomial naive Bayes with library-default settings, fitted on the TF-IDF training features.
mnb_model = MultinomialNB()
mnb_model.fit(x_train_vec, y_train)

In [None]:
# Predict labels for the held-out reviews, then report and store the scores.
mnb_preds = mnb_model.predict(x_test_vec)
mnb_accuracy, mnb_roc_score = model_evalution(y_test, mnb_preds, "Multinomial Naive Bayes")

In [None]:
# Bar chart comparing the accuracy returned by model_evalution for each classifier.
x = ["Logistic Regression", "XGB Classifier", "LGBM Classifier", "Multinomial NB"]
y = [lr_accuracy, xgb_accuracy, lgb_accuracy, mnb_accuracy]
plt.bar(x, y)
plt.xticks(rotation=15)  # tilt the long model names so they do not overlap
plt.xlabel("Algorithms")
plt.ylabel("Accuracy")
plt.title("Algorithm Accuracy Comparison")
plt.show()

In [None]:
# Bar chart comparing the ROC AUC returned by model_evalution for each classifier.
x = ["Logistic Regression", "XGB Classifier", "LGBM Classifier", "Multinomial NB"]
y = [lr_roc_score, xgb_roc_score, lgb_roc_score, mnb_roc_score]
plt.bar(x, y)
plt.xticks(rotation=15)  # tilt the long model names so they do not overlap
plt.xlabel("Algorithms")
plt.ylabel("ROC AUC Score")
plt.title("Algorithm ROC AUC Score Comparison")
plt.show()