<a href="https://colab.research.google.com/github/kcalizadeh/phil_nlp/blob/master/dash_app_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [13]:
# this cell installs extra dependencies, mounts drive, and adds the project
# directory to sys.path so that functions.py can be imported by a later cell
from google.colab import drive
import sys

# install relevant libraries not included with colab
!pip install lime
!pip install symspellpy
!pip install jupyter-dash
!pip install dash-bootstrap-components


# mount Google Drive at /gdrive
drive.mount('/gdrive',force_remount=True)

# project folder on the mounted drive
drive_path = '/gdrive/MyDrive/Colab_Projects/Phil_NLP'

# make modules stored in the project folder importable
sys.path.append(drive_path)

Mounted at /gdrive


In [14]:
# reload edited modules automatically before each cell runs
%load_ext autoreload
%autoreload 2
# NOTE(review): star import — names used below without a visible import (np, pd,
# pickle, make_pipeline, lime_text, KeyedVectors, ...) presumably come from
# functions.py; confirm against that module
from functions import *

# seed NumPy's global random generator
np.random.seed(17)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Text Classifier

In order to explain the text classifier's predictions with LIME, we need to build a pipeline that can tokenize and pad raw text for use with our neural network models.

In [15]:
class Padder(BaseEstimator, TransformerMixin):
    """Pad or crop index sequences to a fixed length.

    Thin scikit-learn style wrapper around ``pad_sequences`` so that padding
    can be used as a pipeline step.

    Parameters
    ----------
    maxlen : int, default=500
        Value forwarded to ``pad_sequences`` as ``maxlen``.

    Attributes
    ----------
    max_index : int or None
        Largest index seen by ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self, maxlen=500):
        self.maxlen = maxlen
        self.max_index = None

    def fit(self, X, y=None):
        """Record the largest index present in ``X`` after padding."""
        self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
        return self

    def transform(self, X, y=None):
        """Return ``X`` padded/cropped to ``maxlen``.

        When the padder has been fitted, indices larger than ``max_index``
        are replaced by 0. An unfitted padder only pads.
        """
        X = pad_sequences(X, maxlen=self.maxlen)
        # Comparing against None would raise, so clip only after fit().
        if self.max_index is not None:
            X[X > self.max_index] = 0
        return X

In [None]:
class TextsToSequences(BaseEstimator, TransformerMixin):
    """Pipeline step that converts raw texts into sequences of token indices.

    Parameters
    ----------
    tokenizer : object
        Object exposing ``texts_to_sequences(texts)``; it is used as-is and
        is not fitted by this transformer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fit(self, texts, y=None):
        """No-op; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, texts, y=None):
        """Return one index sequence per input text as a NumPy array."""
        sequences = self.tokenizer.texts_to_sequences(texts)
        # Texts rarely tokenize to equal lengths. Building an array from
        # ragged rows without dtype=object raises ValueError on
        # NumPy >= 1.24, so only let NumPy infer the dtype for rectangular
        # input (which keeps the original 2-D integer result in that case).
        if len({len(seq) for seq in sequences}) <= 1:
            return np.array(sequences)
        return np.array(sequences, dtype=object)

The following cell runs the app in the notebook. At this point it is unformatted, but the callbacks work and it will display a text analysis breakdown.

In [None]:
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
from tensorflow.keras.models import load_model

# Load Data
# NOTE(review): df is not referenced again in this cell
df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/Phil_NLP/phil_nlp.csv')

# restore the Keras model from a saved checkpoint file
model_path = '/gdrive/MyDrive/Colab_Projects/Phil_NLP/checkpoints/NN_weights_epoch:07_0.7678.hdf5'
model = load_model(model_path)

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only unpickle tokenizer files you created yourself
with open('/gdrive/MyDrive/Colab_Projects/Phil_NLP/baseline_tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# set up classification explanation pipeline
# (raw text -> token indices -> padded indices -> model), so that
# pipeline.predict can be handed raw strings by the callback below
padder = Padder(450)
sequencer = TextsToSequences(tokenizer)
pipeline = make_pipeline(sequencer, padder, model)

# set up labels
# school name -> class index; flipped_dict is the inverse (index -> name)
school_label_dict = {'analytic': 0,
 'aristotle': 1,
 'capitalism': 2,
 'communism': 3,
 'continental': 4,
 'empiricism': 5,
 'german_idealism': 6,
 'phenomenology': 7,
 'plato': 8,
 'rationalism': 9}
flipped_dict = {value:key for key, value in school_label_dict.items()}

# search bar object
# the ids below are referenced by the callback's Input/State declarations
search_bar = html.Div(id="search-bar-container", children=
    [
        dbc.Input(id="search-bar", placeholder="enter text to classify", type="text"),
        dbc.Button("SUBMIT", id="search-bar-submit-button", color="primary", className="mr-1", n_clicks=0)
    ])


# the app itself
# layout: title, search bar, and an initially empty div that the callback fills
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("Text Classification"),
    search_bar,
    html.Div(id="search-bar-output", children=[])  
])


# callback for search bar
@app.callback(Output(component_id="search-bar-output", component_property="children"),
              [Input(component_id="search-bar-submit-button", component_property="n_clicks")],
              [State(component_id="search-bar", component_property="value")])
def generate_explainer_html(n_clicks, text):
    """Render a LIME explanation of the classifier's prediction for ``text``.

    Parameters
    ----------
    n_clicks : int or None
        Click count of the submit button (the callback trigger).
    text : str or None
        Current value of the search bar. Dash passes ``None`` for an input
        that has never been typed into.

    Returns
    -------
    html.Iframe
        A hidden placeholder frame when there is nothing to classify,
        otherwise a frame holding the LIME explanation HTML.
    """
    # `text == ''` alone let None (untouched input) and whitespace-only input
    # through to LIME, which then failed inside the callback.
    if not n_clicks or not text or not text.strip():
        return html.Iframe(
            srcDoc='''<div>Enter input text to see LIME explanations.</div>''',
            width='100%',
            height='100px',
            style={'border': '2px #d3d3d3 solid'},
            hidden=True,
        )

    class_names = list(school_label_dict.keys())
    explainer = lime_text.LimeTextExplainer(class_names=class_names)
    # Explain every class so the rendered report covers all schools.
    exp = explainer.explain_instance(
        text,
        pipeline.predict,
        num_features=10,
        labels=list(range(len(class_names))),
    )
    return html.Iframe(
        srcDoc=exp.as_html(),
        width='100%',
        height='800px',
        style={'border': '2px #d3d3d3 solid'},
    )


# Run app and display result inline in the notebook
app.run_server(mode='inline')

### W2V Explorer

This app is designed to enable exploration of the texts via w2v models. Users can submit text to see how different philosophers use key terms. 

In [16]:
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
from tensorflow.keras.models import load_model

# word vectors queried by the similarity callback below
custom_vectors = KeyedVectors.load('/gdrive/MyDrive/Colab_Projects/Phil_NLP/w2v_models/test_w2v.wordvectors')

# search bar object
# the ids below are referenced by the callback's Input/State declarations
search_bar = html.Div(id="search-bar-container", children=
    [
        # placeholder describes this app (a word lookup, not a classifier)
        dbc.Input(id="search-bar", placeholder="enter a word to find similar words", type="text"),
        dbc.Button("SUBMIT", id="search-bar-submit-button", color="primary", className="mr-1", n_clicks=0)
    ])


# the app itself
# layout: title, search bar, and an initially empty div that the callback fills
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("Word Similarity Search"),
    search_bar,
    html.Div(id="search-bar-output", children=[])
])


# callback for search bar
@app.callback(Output(component_id="search-bar-output", component_property="children"),
              [Input(component_id="search-bar-submit-button", component_property="n_clicks")],
              [State(component_id="search-bar", component_property="value")])
def generate_explainer_html(n_clicks, text):
    """Look up the words most similar to ``text`` in ``custom_vectors``.

    Parameters
    ----------
    n_clicks : int or None
        Click count of the submit button (the callback trigger).
    text : str or None
        Current value of the search bar. Dash passes ``None`` for an input
        that has never been typed into.

    Returns
    -------
    html.Iframe, list of str, or str
        A hidden placeholder frame when there is nothing to look up; a list
        of ``"Word, score"`` strings on success; an apology message when the
        query is not in the vocabulary.
    """
    # Normalise None / surrounding whitespace before deciding there is a query.
    query = text.strip() if text else ''
    if not n_clicks or not query:
        return html.Iframe(
            srcDoc='''<div>Enter input text to see LIME explanations.</div>''',
            width='100%',
            height='100px',
            style={'border': '2px #d3d3d3 solid'},
            hidden=True,
        )

    try:
        similar_words = custom_vectors.most_similar(query)
    except KeyError:
        # Only an out-of-vocabulary lookup is an expected failure; anything
        # else should surface instead of being reported as a missing word.
        return 'Sorry, that word or phrase is not in the vocabulary'
    return [f'{word.title()}, {round(score, 3)}\n\n' for word, score in similar_words]


# Run app in external mode: the app is served at a URL reported in the cell
# output rather than being embedded in the notebook
app.run_server(mode='external')

# app

Dash app running on:


<IPython.core.display.Javascript object>

In [None]:
from tensorflow.keras.models import load_model


In [None]:
# restore the Keras model from the same checkpoint file the classifier app uses
model_path = '/gdrive/MyDrive/Colab_Projects/Phil_NLP/checkpoints/NN_weights_epoch:07_0.7678.hdf5'
model = load_model(model_path)



In [None]:
# show the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         10844288  
_________________________________________________________________
lstm (LSTM)                  (None, 50)                35800     
_________________________________________________________________
dense (Dense)                (None, 25)                1275      
_________________________________________________________________
dropout (Dropout)            (None, 25)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                260       
Total params: 10,881,623
Trainable params: 10,881,623
Non-trainable params: 0
_________________________________________________________________


In [None]:
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only unpickle tokenizer files you created yourself
with open('/gdrive/MyDrive/Colab_Projects/Phil_NLP/baseline_tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
# sample passage fed to the classifier in the cells below
to_classify = (
    'Knowledge of the Idea of the absolute ethical order depends entirely on '
    'the establishment of perfect adequacy between intuition and concept, '
    'because the Idea itself is nothing other than the identity of the two. '
    'But if this identity is to be actually known, it must be thought as a '
    'made adequacy.'
)

In [None]:
# chain text -> token indices -> padded indices -> model so raw strings can be scored
# NOTE(review): sequencer and padder must already exist in the kernel; in this
# notebook they are only created in other cells (the app cell above and the
# class-definition cells further down), so this fails on a fresh top-to-bottom run
pipeline = make_pipeline(sequencer, padder, model)

In [None]:
# wrap the single string in a one-element Series (displayed as the cell output)
pd.Series(to_classify)

0    Knowledge of the Idea of the absolute ethical ...
dtype: object

In [None]:
# manual step 1: convert the sample text to token indices with the loaded tokenizer
tokenized = tokenizer.texts_to_sequences(pd.Series(to_classify))

In [None]:
# manual step 2: pad to 450, the same length used by Padder(450) in the pipeline
# NOTE(review): `sequence` is not imported in any visible cell — presumably it is
# provided by `from functions import *`; confirm
padded = sequence.pad_sequences(tokenized, maxlen=450)

In [None]:
# school name for the highest-scoring class, computed through the full pipeline
flipped_dict[pipeline.predict(pd.Series(to_classify)).argmax()]

In [None]:
# manual step 3: score the padded indices with the model directly
prediction_num = model.predict(padded, verbose=1)



In [None]:
# index of the largest value in the flattened prediction array
prediction_num.argmax()

6

In [None]:
# translate that index back into a school name
flipped_dict[prediction_num.argmax()]

In [None]:
# school names in dictionary (label-index) order, as passed to LIME as class_names
list(school_label_dict.keys())

['analytic',
 'aristotle',
 'capitalism',
 'communism',
 'continental',
 'empiricism',
 'german_idealism',
 'phenomenology',
 'plato',
 'rationalism']

In [None]:
# Everyday scheduling email used as a second sample input (presumably to see how
# the classifier behaves on non-philosophical text). The names are placeholders:
# do not paste real correspondence or personal names into the notebook.
to_classify = """Hi Alex and Sam-

Happy New Year to you both!

Was wondering if we could schedule a short touch base in the next few weeks? No concerns—rather—just want to get some feedback on the student’s progress.

Could you suggest a few times that could work for you both?

thanks"""

In [None]:
# build a LIME text explainer whose class names follow the label-index order
explainer = lime_text.LimeTextExplainer(class_names=list(school_label_dict.keys()))
# request explanations for all ten class indices, with num_features=10
exp = explainer.explain_instance(to_classify, pipeline.predict, num_features=10, labels=[0,1,2,3,4,5,6,7,8,9])

# render the explanation in the cell output
exp.show_in_notebook(text=True)

In [None]:
# print the school name of the top-scoring class for each row of prediction_num
for i in prediction_num.argmax(axis=1):
  print(flipped_dict[i])

In [None]:
# Map each school name to its class index (position in the list below) ...
school_label_dict = {
    school: index
    for index, school in enumerate([
        'analytic',
        'aristotle',
        'capitalism',
        'communism',
        'continental',
        'empiricism',
        'german_idealism',
        'phenomenology',
        'plato',
        'rationalism',
    ])
}
# ... and build the inverse lookup (class index -> school name).
flipped_dict = dict(zip(school_label_dict.values(), school_label_dict.keys()))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.pipeline import TransformerMixin
from sklearn.base import BaseEstimator

class TextsToSequences(BaseEstimator, TransformerMixin):
    """Sklearn transformer to convert texts to lists of token indices
    (e.g. ["the cute cat", "the dog"] -> [[1, 2, 3], [1, 4]]).

    Parameters
    ----------
    tokenizer : object
        Object exposing ``texts_to_sequences(texts)``; it is used as-is and
        is not fitted by this transformer.
    **kwargs
        Accepted for backward compatibility and ignored.
    """

    def __init__(self, tokenizer, **kwargs):
        self.tokenizer = tokenizer

    def fit(self, texts, y=None):
        """No-op; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, texts, y=None):
        """Return one index sequence per input text as a NumPy array."""
        sequences = self.tokenizer.texts_to_sequences(texts)
        # Texts rarely tokenize to equal lengths. Building an array from
        # ragged rows without dtype=object raises ValueError on
        # NumPy >= 1.24, so only let NumPy infer the dtype for rectangular
        # input (which keeps the original 2-D integer result in that case).
        if len({len(seq) for seq in sequences}) <= 1:
            return np.array(sequences)
        return np.array(sequences, dtype=object)


# Extra keyword arguments are ignored by TextsToSequences, so none are passed.
sequencer = TextsToSequences(tokenizer)

In [None]:
class Padder(BaseEstimator, TransformerMixin):
    """Pad and crop uneven lists to the same length.

    Padding and cropping are delegated to ``pad_sequences`` with
    ``maxlen=self.maxlen``.

    Attributes
    ----------
    maxlen: int
        sizes of sequences after padding
    max_index: int or None
        maximum index known by the Padder (set by ``fit``); if a higher index
        is met during transform it is transformed to a 0. While it is ``None``
        (padder not fitted) no index is replaced.
    """

    def __init__(self, maxlen=500):
        self.maxlen = maxlen
        self.max_index = None

    def fit(self, X, y=None):
        """Record the largest index present in ``X`` after padding."""
        self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
        return self

    def transform(self, X, y=None):
        """Return ``X`` padded/cropped to ``maxlen`` with unknown indices zeroed."""
        X = pad_sequences(X, maxlen=self.maxlen)
        # Comparing against None would raise, so clip only after fit().
        if self.max_index is not None:
            X[X > self.max_index] = 0
        return X


padder = Padder(450)