<h2>Very strange, but I have not tested how the model will work with unnormalized data.</h2>

<h4>Import modules and load data</h4>

In [21]:
import pandas as pd

In [22]:
data = pd.read_csv("IMDB_dataset.csv")

<h4>Split data and make model pipeline</h4>

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data["review"], data["sentiment"], random_state=7)

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [26]:
model = Pipeline([
  ("vec", TfidfVectorizer()),
  ("classifier", LogisticRegression()) 
])

In [31]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

<h4>Lets watch at the most important features</h4>

In [48]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=["negative", "positive"])

In [49]:
exp = explainer.explain_instance(X_test.iloc[0], model.predict_proba, num_features=10)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [50]:
y_test

29430    negative
27750    positive
47782    positive
10498    negative
24747    positive
           ...   
27724    positive
7648     negative
17607    positive
9337     negative
47433    positive
Name: sentiment, Length: 12500, dtype: object

In [51]:
exp.as_list()

[('bad', -0.06753685618152307),
 ('Nothing', -0.05134465987067746),
 ('only', -0.04318437118727395),
 ('Oh', -0.04059057234648972),
 ('love', 0.03870124351921595),
 ('garbage', -0.031599705686315954),
 ('have', -0.03002197998428627),
 ('it', 0.028327229771188903),
 ('hours', -0.024872527827906537),
 ('least', -0.023811985590418076)]

In [56]:
y_train = y_train.map({"positive": 1, "negative": 0})
y_test = y_test.map({"positive": 1, "negative": 0})

In [52]:
from sklearn import metrics
metrics = {
    "accuracy": metrics.accuracy_score,
    "precision": metrics.precision_score,
    "recall": metrics.recall_score,
    "f1": metrics.f1_score
}

In [57]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

In [58]:
def get_metrics_values(model, X, y, metric):
  y_pred = model.predict(X)
  return metric(y_pred, y)

In [59]:
print("Model metrics")
for metric_name, metric in metrics.items():
  print(f"{metric_name} value is {get_metrics_values(model, X_test, y_test, metric)}")

Model metrics
accuracy value is 0.89416
precision value is 0.9037690457097033
recall value is 0.886284995281535
f1 value is 0.894941634241245


<h4>Now lets normalize data and fit model</h4>

In [60]:
import re
import string
from spacy.lang.en.stop_words import STOP_WORDS as stop_words
from spacy.lang.en import English

In [61]:
parser = English()
punctuations = string.punctuation

In [62]:
def spacy_text_normalizer(text):
    text = re.sub(r"<.*>", "", text) #Remove all tags
    tokens = parser(text) #Get doc from text
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ] #Normalize words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ] #Remove stop words and punctuation
    return " ".join(tokens)

In [63]:
from sklearn.base import TransformerMixin

In [66]:
X_train = X_train.apply(spacy_text_normalizer)
X_test = X_test.apply(spacy_text_normalizer)

In [67]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

In [68]:
print("Model metrics")
for metric_name, metric in metrics.items():
  print(f"{metric_name} value is {get_metrics_values(model, X_test, y_test, metric)}")

Model metrics
accuracy value is 0.8568
precision value is 0.872654370489174
recall value is 0.8452695354979027
f1 value is 0.8587436868686869


In [69]:
exp = explainer.explain_instance(X_test.iloc[0], model.predict_proba, num_features=10)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [70]:
exp.as_list()

[('bad', -0.1601615589537048),
 ('garbage', -0.09699252638014245),
 ('love', 0.07413598207119855),
 ('don', -0.049779669011829085),
 ('life', 0.04437483913105965),
 ('canadian', 0.039792487093273474),
 ('acting', -0.03610956443061801),
 ('prepared', 0.03325497384887356),
 ('bean', 0.03184214977724494),
 ('seriously', -0.031702106078309096)]

<h4>The non-normalized data gave a better result, it may be necessary to reconsider the construction of the model.</h4>

<h4>What if we make tfidf with bigrams</h4>

In [86]:
X_train, X_test, y_train, y_test = train_test_split(data["review"], data["sentiment"], random_state=7)

In [87]:
model = Pipeline([
  ("vec", TfidfVectorizer(ngram_range=(1, 2))),
  ("classifier", LogisticRegression()) 
])

In [90]:
data["sentiment"]

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [94]:
y_train = y_train.map({"positive": 1, "negative": 0})
y_test = y_test.map({"positive": 1, "negative": 0})

In [97]:
y_train.head()

17552    0
20467    0
49715    1
31896    1
11953    1
Name: sentiment, dtype: int64

In [98]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

In [99]:
print("Model metrics")
for metric_name, metric in metrics.items():
  print(f"{metric_name} value is {get_metrics_values(model, X_test, y_test, metric)}")

Model metrics
accuracy value is 0.8948
precision value is 0.9077786688051324
recall value is 0.884375
f1 value is 0.8959240205777602


In [100]:
exp = explainer.explain_instance(X_test.iloc[0], model.predict_proba, num_features=10)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [101]:
exp.as_list()

[('bad', -0.0683469707661816),
 ('Nothing', -0.03869948530360005),
 ('could', -0.03306958329835676),
 ('only', -0.03142686495621501),
 ('have', -0.029132448485572632),
 ('was', -0.0272307158833128),
 ('sit', -0.026830442888631956),
 ('Oh', -0.025459961344797048),
 ('love', 0.025213165598701157),
 ('so', -0.025015381921450672)]

<h4>Nothing has changed</h4>

In [102]:
X_train = X_train.apply(spacy_text_normalizer)
X_test = X_test.apply(spacy_text_normalizer)

In [103]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

In [104]:
print("Model metrics")
for metric_name, metric in metrics.items():
  print(f"{metric_name} value is {get_metrics_values(model, X_test, y_test, metric)}")

Model metrics
accuracy value is 0.85264
precision value is 0.8800320769847634
recall value is 0.8337638656739097
f1 value is 0.8562734082397004


In [105]:
exp = explainer.explain_instance(X_test.iloc[0], model.predict_proba, num_features=10)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [106]:
exp.as_list()

[('bad', -0.1645802619815616),
 ('love', 0.07631155208874402),
 ('garbage', -0.07149844312966842),
 ('don', -0.053789628647735135),
 ('life', 0.0432815885970151),
 ('acting', -0.03674254760344241),
 ('big', -0.03318199725161448),
 ('seriously', -0.023882311109545657),
 ('prepared', 0.020053678086675446),
 ('action', 0.01801978481976002)]

<h4>Although the accuracy of the model is lower, the results for what it is trained on look much more plausible. I think it is necessary to increase the dataset.</h4>