Importing Libraries

In [66]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

Loading Dataset

In [105]:
# Load the IMDB reviews CSV file into a DataFrame.
# NOTE(review): the frame is named df_amazon although the file read here is the IMDB dataset.
df_amazon = pd.read_csv("/content/IMDB Dataset.csv")

In [106]:
# Encode the label as an integer: 1 for 'positive', 0 for anything else.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# compares the already-encoded 0/1 values against 'positive' and silently
# turns every label into 0.
if not pd.api.types.is_numeric_dtype(df_amazon['sentiment']):
    df_amazon['sentiment'] = (df_amazon['sentiment'] == 'positive').astype(int)

# Keep only the first 5000 reviews because training on more rows was exceeding
# max_iter; remove this line to train on the full dataset.
# .copy() detaches the subset from the full frame so later column assignments
# do not write into a slice of the original.
df_amazon = df_amazon.iloc[:5000].copy()

In [107]:
# Sanity check: (rows, columns) of the truncated frame
df_amazon.shape

(5000, 2)

In [108]:
# Peek at the last rows to confirm the sentiment column is now integer-encoded
df_amazon.tail()

Unnamed: 0,review,sentiment
4995,An interesting slasher film with multiple susp...,0
4996,i watched this series when it first came out i...,1
4997,Once again Jet Li brings his charismatic prese...,1
4998,"I rented this movie, after hearing Chris Gore ...",0
4999,This was a big disappointment for me. I think ...,0


Preprocessing data

In [70]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters to filter out; note this is a single string, not a list
punctuations = string.punctuation

# Load the small pretrained English pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible here -- confirm it is still needed.
nlp = spacy.load("en_core_web_sm")
# spaCy's English stop-word collection (the same object imported above as STOP_WORDS)
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Bare English language object; spacy_tokenizer calls it to split text into tokens.
# NOTE(review): English() does not load the trained components of "en_core_web_sm" --
# confirm that lemmas are actually populated for its tokens in the installed spaCy version.
parser = English()

# Tokenizer handed to the scikit-learn vectorizers
def spacy_tokenizer(sentence):
    """Split ``sentence`` into normalized tokens for vectorization.

    Each token is lemmatized, lowercased and stripped of surrounding
    whitespace; stop words, empty tokens and tokens made up solely of
    punctuation characters are dropped.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        list of str: The surviving tokens, in their original order.
    """
    doc = parser(sentence)

    tokens = []
    for word in doc:
        lemma = word.lemma_
        # Use the lowercased surface form when there is no usable lemma:
        # either the "-PRON-" placeholder or an empty string.
        # NOTE(review): an empty lemma is expected when the language object
        # has no lemmatizer component -- confirm for the installed spaCy version.
        if not lemma or lemma == "-PRON-":
            token = word.lower_.strip()
        else:
            token = lemma.lower().strip()

        if token in stop_words:
            continue

        # `punctuations` is one string, so `token in punctuations` would be a
        # substring test: it lets multi-character tokens such as "..." or "--"
        # through. Checking character by character rejects every all-punctuation
        # token; all() over an empty token is True, so empty tokens are
        # rejected as well.
        if all(char in punctuations for char in token):
            continue

        tokens.append(token)

    return tokens

In [71]:
# Pipeline step that normalizes raw text before it reaches the vectorizer
class predictors(TransformerMixin):
    """Transformer that applies ``clean_text`` to every document it is given."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report an empty parameter dictionary."""
        return {}

# Minimal text normalization applied before tokenization
def clean_text(text):
    """Return ``text`` with leading/trailing whitespace removed and in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

In [73]:
# Bag-of-words vectorizer: unigram counts over the tokens produced by spacy_tokenizer
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [74]:
# TF-IDF vectorizer built on the same tokenizer.
# NOTE(review): not used by the pipeline in the cells visible here -- confirm it is needed.
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

Train-test split

In [109]:
from sklearn.model_selection import train_test_split

X = df_amazon['review']  # raw review text (model input)
ylabels = df_amazon['sentiment']  # sentiment labels (model target)

# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and every result derived from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.3, random_state=42
)

Creating pipeline

In [111]:
# Logistic-regression classifier with a raised iteration cap
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=20000)

# Bag-of-words pipeline: clean the text, vectorize it, then classify
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", bow_vector),
        ("classifier", classifier),
    ]
)

# Fit every step on the training split
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7ff04dc8f190>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7ff047b78e60>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
               

In [112]:
# Predict sentiment labels for the held-out reviews.
# NOTE(review): `predicted` is not compared against y_test in the cells visible here.
predicted = pipe.predict(X_test)

Saving and Loading model

In [121]:
import joblib
# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline references `predictors` and `spacy_tokenizer`, which are
# defined in this notebook -- presumably they must be defined again in any session
# that loads the file; confirm before deploying the model elsewhere.
joblib.dump(pipe,"/content/model.pkl")

['/content/model.pkl']

In [122]:
# Reload the saved pipeline from disk.
# joblib.load unpickles the file, which can execute arbitrary code -- only load
# model files that come from a trusted source.
model = joblib.load("/content/model.pkl")

Testing our own data in the model

In [118]:
# Spot-check the reloaded model on a hand-written sentence containing a negation
print(model.predict(["i don't hate the movie we were watching"]))

[1]


In [125]:
# Second spot-check: a hand-written sentence expressing disappointment
print(model.predict(["I am disappointed by the movie"]))

[0]
