In [12]:
#!pip install fasttext lime gensim

In [13]:
import fasttext
import re
import lime.lime_text
import numpy as np
import webbrowser
from pathlib import Path
from gensim.utils import simple_preprocess, simple_tokenize, deaccent
import glob

In [5]:
# Collect the paths of the IMDB training reviews, one list per sentiment class.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them to
# make the generated training file identical across runs and machines.
positive_train = sorted(glob.glob("../day_7/data/aclImdb/train/pos/*.txt"))
negative_train = sorted(glob.glob("../day_7/data/aclImdb/train/neg/*.txt"))

In [None]:
## Create the fastText-format file used for training

In [6]:
# Write one review per line in fastText's supervised format:
#   __label__<class> <space-separated tokens>
# Positive reviews are written first, then negative ones.
Path("./data").mkdir(parents=True, exist_ok=True)  # make sure the output folder exists

with open("./data/fasttext_train.txt", "w", encoding="utf-8") as fh:
    for label, files in (("positive", positive_train), ("negative", negative_train)):
        for file in files:
            # Use a context manager so each review file is closed right away
            # instead of leaking one open handle per review.
            with open(file, encoding="utf-8") as review_fh:
                text = " ".join(simple_preprocess(review_fh.read()))

            fh.write("__label__" + label + " " + text + "\n")
        

In [7]:
# Sanity check: count the lines in the training file (one line is written per review).
!wc -l ./data/fasttext_train.txt

   25000 ./data/fasttext_train.txt


In [8]:
# Train a supervised fastText classifier on the labelled training file for 10 epochs.
# NOTE(review): `ws` is the context-window size option; confirm it has any effect
# in supervised mode.
model = fasttext.train_supervised('./data/fasttext_train.txt', epoch=10, ws=5)

In [9]:
# Create the output folder for trained models. Unlike a bare `mkdir`, this is
# idempotent (no error when the folder already exists on a re-run) and portable.
Path("./models").mkdir(parents=True, exist_ok=True)

In [10]:
# Save the trained model to disk; it is loaded back from this same path further down.
model.save_model("./models/imbd_reviews.bin")

In [40]:


def strip_formatting(string):
    """Normalize raw review text into the same format used for training.

    The training file is built with ``" ".join(simple_preprocess(text))``, so the
    same transformation is applied here. Merely lower-casing the text is not
    enough: fastText splits on whitespace, so tokens with punctuation attached
    (e.g. ``"am."`` or ``"you're"``) would never match the vocabulary learned
    from the cleaned training data.

    Joining the tokens with single spaces also removes newlines, which
    fastText's ``predict`` does not accept. Accents are left untouched because
    ``simple_preprocess`` does not strip them by default, and the training data
    was produced with those defaults.

    Args:
        string: Raw review text.

    Returns:
        The review as lower-case tokens separated by single spaces.
    """
    return " ".join(simple_preprocess(string))

def tokenize_string(string):
    """Split ``string`` into the list of word tokens that LIME will perturb.

    LIME has to cut the text into words the same way the classifier's input was
    prepared, so this simply delegates to gensim's ``simple_preprocess``.
    """
    tokens = simple_preprocess(string)
    return tokens

# Load the trained fastText classifier from the model file saved earlier in this notebook.
classifier = fasttext.load_model('./models/imbd_reviews.bin')

# Create a LimeTextExplainer. This object knows how to explain a text-based
# prediction by dropping words randomly.
explainer = lime.lime_text.LimeTextExplainer(
    # We need to tell LIME how to split the string into words. We can do this
    # by giving it a function to call to split a string up the same way the
    # training data was tokenized.
    split_expression=tokenize_string,
    # bow=False makes LIME explain individual word positions instead of treating
    # the text as a bag of words, which is the safe choice when the classifier may
    # be sensitive to word order (e.g. word n-grams).
    bow=False,
    # Display names for the classes, indexed by probability column. The prediction
    # wrapper orders the columns by sorting the fastText labels alphabetically, so
    # column 0 is "__label__negative" and column 1 is "__label__positive".
    class_names=["Negative", "Positive"]
)

def fasttext_prediction_in_sklearn_format(classifier, texts):
    """Adapt fastText predictions to the scikit-learn ``predict_proba`` layout.

    LIME expects one row per input text holding a probability for every label,
    with the columns in a fixed order. fastText instead returns the labels of
    each prediction sorted by likelihood, so every row is re-ordered by sorting
    its label names alphabetically.

    Args:
        classifier: Object exposing ``predict(texts, k)`` that returns a pair
            ``(labels, probabilities)`` with one entry per text.
        texts: The texts to classify.

    Returns:
        A numpy array with one row of label-ordered probabilities per text.
    """
    # Request the top 2 labels per text; raise this for multi-label models.
    top_labels, top_probs = classifier.predict(texts, 2)

    # argsort over the label names yields the permutation that puts each row's
    # scores into alphabetical label order.
    rows = [
        scores[np.argsort(np.array(names))]
        for names, scores, _ in zip(top_labels, top_probs, texts)
    ]
    return np.array(rows)




In [41]:
def explain_review(review, output_filename):
    """Explain the classifier's prediction for ``review`` and show it as HTML.

    The review is normalized with ``strip_formatting``, explained with the
    module-level LIME ``explainer`` against the module-level fastText
    ``classifier``, written to ``<current working directory>/data/<output_filename>``
    and then opened in the default web browser.

    Args:
        review: Raw review text to explain.
        output_filename: File name of the HTML report, created inside ``data/``.
    """
    # Pre-process the text of the review so it matches the training format
    preprocessed_review = strip_formatting(review)

    # Make a prediction and explain it!
    exp = explainer.explain_instance(
        # The review to explain
        preprocessed_review,
        # The wrapper function that returns FastText predictions in scikit-learn format
        classifier_fn=lambda x: fasttext_prediction_in_sklearn_format(classifier, x),
        # How many labels to explain. We just want to explain the single most likely label.
        top_labels=1,
        # How many words in our sentence to include in the explanation. You can try different values.
        num_features=20,
    )

    # Save the explanation to an HTML file so it's easy to view.
    # You can also get it to other formats: as_list(), as_map(), etc.
    # See https://lime-ml.readthedocs.io/en/latest/lime.html#lime.explanation.Explanation
    #
    # The path is built with pathlib rather than os.path because `os` is never
    # imported in this notebook. It must be absolute because as_uri() rejects
    # relative paths.
    output_dir = Path.cwd().resolve() / "data"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / output_filename
    exp.save_to_file(output_path)

    # Open the explanation html in our web browser.
    webbrowser.open(output_path.as_uri())

In [49]:
# Example 1: explain the model's prediction for a short, one-sentence review.
review = "If you can keep both eyes open through its whole three-hour length you're a better man than I am."
explain_review(review, "output_1.html")

In [50]:
# Example 2: explain the model's prediction for a longer, multi-sentence review.
review = """I love beautiful movies. If a film is eye-candy with carefully designed decorations, masterful camerawork, lighting, and architectural frames, I can forgive anything else in the movie. The lack or even absence of logic, weak dialogue and characters, cliche storylines–I can live with all that if I like what I see."""
explain_review(review, "output_2.html")
