# Set-up

### 1. Install Prerequisite Libraries
* python-igraph  
* scikit-learn 
* umap-learn 
* plotly 
* shap 
* pandas  
* numpy 
* scipy 
* urllib3 

Also make sure Jupyter Notebook is installed with a Python 3 kernel (Python 3.6 was used here) and that you are running Python 3. See https://ipython.readthedocs.io/en/latest/install/kernel_install.html for kernel installation instructions.

### 2. Decompress texts.pkl.zip

### 3. Load Libraries

In [104]:
import igraph as ig # python-igraph
import json
import urllib3
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from collections import Counter
import numpy as np
import pandas as pd
from umap import UMAP
from sklearn.decomposition import KernelPCA as KPCA
from sklearn.pipeline import Pipeline
import plotly.graph_objs as go
import plotly.offline as py
import shap
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Demo 1: 3-D Network Visualization of Les Miserables Characters

### 1. Load Network Graph Data

In [105]:
# Download the Les Miserables character graph as JSON; later cells read its
# 'nodes' and 'links' entries.
http = urllib3.PoolManager()
req = http.request('GET',"https://raw.githubusercontent.com/plotly/datasets/master/miserables.json")
# Fail loudly on a bad response instead of handing an error page to the JSON parser.
if req.status != 200:
    raise RuntimeError('Download of miserables.json failed with HTTP status %d' % req.status)
data = json.loads(req.data.decode('utf-8'))





### 2. Generate Graph

In [106]:
# generate_graph is defined in the "Functions" cell at the end of this notebook;
# it builds an undirected igraph graph from data['links'].
G = generate_graph(data)

### 3. Plot Les Miserables Network

In [107]:
# Saves the 3-D network figure to Les-Miserables.html; the helper passes
# auto_open=False, so no browser window is launched.
plot_les_miserables(data, G, 'Les-Miserables.html')
    

# Demo 2: 3-D Book Clustering By Shared Words 
Each book is a data point; the closer two books are, the more content they share.

### 1. Load Texts

In [10]:
# NOTE(review): unpickling can execute arbitrary code; only load texts.pkl from a trusted source.
# Later cells read the 'author', 'title' and 'text' columns of this frame.
txts=pd.read_pickle("./texts.pkl")

### 2. Create TFIDF matrix

In [11]:
# tfidf_matrix: TF-IDF weights of the texts; words: the vocabulary array returned with it.
tfidf_matrix, words = return_tfidf_matrix(txts)
# `words` is read as a module-level global by return_top_words_per_author,
# so it must stay defined under this name.
words=np.squeeze(words)

### 3. Transform Data using UMAP

In [108]:
# Embed the TF-IDF matrix into three UMAP components (the x, y, z columns used below).
t_data = transform(tfidf_matrix)



invalid value encountered in sqrt



### 4. Load Transformed Data into DataFrame

In [109]:
# Build the frame column by column so the UMAP coordinates stay numeric;
# stacking them together with the author/title strings would coerce
# x, y and z to object dtype.
t_data_df = pd.DataFrame(np.asarray(t_data), columns=['x','y','z'])
t_data_df.insert(0, 'title', txts['title'].values)
t_data_df.insert(0, 'author', txts['author'].values)


### 5. Plot Document Clusters

In [110]:
# Positional arguments: axes_off=False, text_column='title', color_column='author'
# (one trace per author, each point labelled with its title).
plotly_plot(t_data_df, 'book_clusters.html', False, 'title', 'author')


# Demo 3: Find Words that Differentiate Authors' Works
Make a bar plot of the most important words that define each author.
This is an interactive tutorial on creating a simple plotly plot.

### 1. Return Data Frame Containing top words and scores per author

In [96]:
# Fits a random forest on the TF-IDF matrix, computes SHAP values and keeps the
# highest mean-SHAP words per author. Relies on the global `words` defined above.
top_words_per_author=return_top_words_per_author_pipeline(txts, tfidf_matrix)

### 2. Visualize Head of Data Frame

In [97]:
# Preview the first rows: one row per (author, word) with its score.
top_words_per_author.head()

Unnamed: 0,author,top_words,top_scores
0,"Balzac, Honoré de",balzac,0.018426
1,"Balzac, Honoré de",provincial,0.012286
2,"Balzac, Honoré de",honore,0.011195
3,"Balzac, Honoré de",addendum,0.008491
4,"Balzac, Honoré de",courtesan,0.007811


### 3. Create a function that takes an author name and a data frame and returns a plotly `Bar` object

In [None]:
# Fill in
def create_plot_object(author,df):
    plot = go.Bar(x=,y=,name=,text=)
    return plot

In [98]:
# Answer
def create_plot_object(author, df):
    """Build one plotly Bar trace plotting an author's top words against their scores.

    ``author`` becomes the trace name; ``df`` must provide the ``'top_words'``
    and ``'top_scores'`` columns.
    """
    top_words = df['top_words']
    return go.Bar(
        x=top_words,
        y=df['top_scores'],
        name=author,
        text=top_words,
    )

### 4. Create list of bar plot objects

In [None]:
plots = []

for author,df in top_words_per_author.groupby('author'):
    # Fill in here

In [99]:
# Answer

# Collect one Bar trace per author.
plots = []

# groupby('author') yields (author, sub-frame) pairs, one per distinct author.
for author,df in top_words_per_author.groupby('author'):
    plot = create_plot_object(author,df)
    plots.append(plot)
    

### 5. Generate Layout Object that Formats Plots

In [None]:
# Fill in
layout = go.Layout(
    barmode='group',
    title=,
    yaxis={'title':},
    xaxis={'title':},
)

In [100]:
# Answer
# Figure-level formatting shared by all bar traces.
layout = go.Layout(
    # 'group' places the authors' bars side by side instead of stacking them.
    barmode='group',
    title='Top Word Importances for Each Author',
    # The scores are per-author means of SHAP values (see return_top_words_per_author).
    yaxis={'title':'Mean SHAP Score'},
    xaxis={'title':'Words'},
)


### 6. Create Figure Object that stores data (bar plot objects) and layout object

In [None]:
# Fill in
fig = go.Figure(data=, layout=)


In [101]:
# Answer
# Combine the per-author bar traces with the layout into a single figure.
fig = go.Figure(data=plots, layout=layout)

### 7. Plot Figure Object

In [None]:
# Fill in
py.plot(, filename=, auto_open=False)

In [102]:
# Answer
# Writes the figure to an HTML file without opening it (auto_open=False);
# the cell output below is the file URL that the call returns.
py.plot(fig, filename='SHAP-Top-Words-Authors.html', auto_open=False)

'file:///Users/joshualevy/Documents/GitHub/plotly_presentation/SHAP-Top-Words-Authors.html'

# Functions

Run this cell before the demo cells above; they call the helper functions defined here.

In [95]:
def generate_graph(data):
    """Build an undirected igraph graph from node-link style data.

    Parameters
    ----------
    data : dict
        Mapping with a ``'links'`` list whose items carry ``'source'`` and
        ``'target'`` vertex indices. An optional ``'nodes'`` list, when
        present, sets the minimum number of vertices.

    Returns
    -------
    igraph.Graph
        Undirected graph with one edge per link.
    """
    edges = [(link['source'], link['target']) for link in data['links']]
    # Passing the node count keeps isolated nodes in the graph, so a layout of
    # the graph has a position for every node; igraph still grows the vertex
    # set when an edge references a larger index.
    node_count = len(data.get('nodes', ()))
    return ig.Graph(n=node_count, edges=edges, directed=False)

def plot_les_miserables(data, G, filename='Les-Miserables.html'):
    """Render the character network as a 3-D plotly figure saved to ``filename``.

    ``data`` supplies ``'nodes'`` (each with ``'name'`` and ``'group'``) and
    ``'links'`` (each with ``'source'`` and ``'target'`` indices); ``G`` is the
    graph whose 3-D ``'kk'`` layout positions the nodes. Nothing is returned.
    """
    nodes = data['nodes']
    node_count = len(nodes)
    edge_pairs = [(link['source'], link['target']) for link in data['links']]
    names = [node['name'] for node in nodes]
    groups = [node['group'] for node in nodes]

    positions = G.layout('kk', dim=3)

    # Node coordinates, one entry per node in data['nodes'] order.
    node_xyz = [positions[i] for i in range(node_count)]
    node_x = [point[0] for point in node_xyz]
    node_y = [point[1] for point in node_xyz]
    node_z = [point[2] for point in node_xyz]

    # Edge coordinates: start, end, then None so consecutive edges are not joined.
    edge_x, edge_y, edge_z = [], [], []
    for src, dst in edge_pairs:
        start, end = positions[src], positions[dst]
        edge_x.extend((start[0], end[0], None))
        edge_y.extend((start[1], end[1], None))
        edge_z.extend((start[2], end[2], None))

    edge_trace = go.Scatter3d(
        x=edge_x,
        y=edge_y,
        z=edge_z,
        mode='lines',
        line={'color': 'rgb(125,125,125)', 'width': 1},
        hoverinfo='none',
    )

    node_trace = go.Scatter3d(
        x=node_x,
        y=node_y,
        z=node_z,
        mode='markers',
        name='actors',
        marker={
            'symbol': 'circle',
            'size': 6,
            'color': groups,
            'colorscale': 'Viridis',
            'line': {'color': 'rgb(50,50,50)', 'width': 0.5},
        },
        text=names,
        hoverinfo='text',
    )

    # Shared settings that hide every visual element of an axis.
    hidden_axis = {
        'showbackground': False,
        'showline': False,
        'zeroline': False,
        'showgrid': False,
        'showticklabels': False,
        'title': '',
    }

    source_note = {
        'showarrow': False,
        'text': "Data source: <a href='http://bost.ocks.org/mike/miserables/miserables.json'>[1] miserables.json</a>",
        'xref': 'paper',
        'yref': 'paper',
        'x': 0,
        'y': 0.1,
        'xanchor': 'left',
        'yanchor': 'bottom',
        'font': {'size': 14},
    }

    layout = go.Layout(
        title="Network of coappearances of characters in Victor Hugo's novel<br> Les Miserables (3D visualization)",
        width=1000,
        height=1000,
        showlegend=False,
        scene={
            'xaxis': dict(hidden_axis),
            'yaxis': dict(hidden_axis),
            'zaxis': dict(hidden_axis),
        },
        margin={'t': 100},
        hovermode='closest',
        annotations=[source_note],
    )

    figure = go.Figure(data=[edge_trace, node_trace], layout=layout)
    py.plot(figure, filename=filename, auto_open=False)
    
def return_tfidf_matrix(txts):
    """Compute a TF-IDF matrix and its vocabulary from the ``'text'`` column.

    Parameters
    ----------
    txts : mapping or DataFrame
        Object whose ``'text'`` entry is an iterable of raw documents.

    Returns
    -------
    tuple
        ``(tfidf_matrix, feature_names)``: the TF-IDF weighted term counts
        (English stop words removed) and a NumPy array of the terms, in
        column order.
    """
    count_vectorizer = CountVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and counts terms in one pass
    # instead of a separate fit followed by transform.
    initial_count_matrix = count_vectorizer.fit_transform(txts['text'])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back only on releases that predate it.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        vocabulary = count_vectorizer.get_feature_names_out()
    else:
        vocabulary = count_vectorizer.get_feature_names()
    # Rebuild from a list so the array is a string array on either code path.
    feature_names = np.array(list(vocabulary))
    tfidf_matrix = TfidfTransformer().fit_transform(initial_count_matrix)
    return tfidf_matrix, feature_names

def transform(tfidf_matrix, n_components=3, n_neighbors=10, min_dist=.4, random_state=None):
    """Embed the rows of ``tfidf_matrix`` with UMAP.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix
        Feature matrix to embed, one row per sample.
    n_components : int, optional
        Number of embedding dimensions (default 3, matching the x/y/z plots).
    n_neighbors : int, optional
        UMAP ``n_neighbors`` setting (default 10).
    min_dist : float, optional
        UMAP ``min_dist`` setting (default 0.4).
    random_state : int or None, optional
        Seed forwarded to UMAP; the default ``None`` leaves the embedding
        unseeded, so it can vary between runs.

    Returns
    -------
    The embedding returned by the pipeline's ``fit_transform``.
    """
    umap_step = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    )
    transformer_pipeline = Pipeline([('umap', umap_step)])
    return transformer_pipeline.fit_transform(tfidf_matrix)

def plotly_plot(t_data_df, output_fname, axes_off=False, text_column='text', color_column=None):
    """Write a 3-D scatter plot of ``t_data_df`` to an HTML file.

    Parameters
    ----------
    t_data_df : pandas.DataFrame
        Frame with ``'x'``, ``'y'`` and ``'z'`` columns plus the text and
        (optional) colour columns named below.
    output_fname : str
        Path of the HTML file to write.
    axes_off : bool, optional
        When true, hide axis titles, grids, lines and tick labels.
    text_column : str, optional
        Column passed to each trace as its ``text``.
    color_column : str or None, optional
        Column whose distinct values each get their own trace and colour.
        With ``None`` all points go into a single uncoloured trace.

    Returns
    -------
    None
    """
    plots = []
    if color_column is None:
        # No grouping column: indexing the frame with None would raise
        # KeyError, so draw one trace with plotly's default marker colour.
        plots.append(
            go.Scatter3d(x=t_data_df['x'], y=t_data_df['y'],
                         z=t_data_df['z'],
                         name='', mode='markers',
                         marker=dict(size=2), text=t_data_df[text_column]))
    else:
        categories = sorted(t_data_df[color_column].unique())
        # Two extra steps keep every hue in use below 360 degrees, which
        # would otherwise coincide with hue 0.
        hues = np.linspace(0, 360, len(categories) + 2)
        for name, hue in zip(categories, hues):
            color = 'hsl(' + str(hue) + ',50%' + ',50%)'
            # Compute the row mask once per category and reuse it.
            subset = t_data_df[t_data_df[color_column] == name]
            plots.append(
                go.Scatter3d(x=subset['x'], y=subset['y'],
                             z=subset['z'],
                             name=str(name), mode='markers',
                             marker=dict(color=color, size=2), text=subset[text_column]))
    if axes_off:
        hidden_axis = dict(title='', autorange=True, showgrid=False, zeroline=False,
                           showline=False, ticks='', showticklabels=False)
        scene = dict(xaxis=dict(hidden_axis), yaxis=dict(hidden_axis), zaxis=dict(hidden_axis))
        fig = go.Figure(data=plots, layout=go.Layout(scene=scene))
    else:
        fig = go.Figure(data=plots)
    py.plot(fig, filename=output_fname, auto_open=False)
    
def generate_model(txts, tfidf_matrix):
    """Fit a 100-tree random forest that predicts each text's author.

    Author names from ``txts['author']`` are encoded as integer labels and
    used as targets for the rows of ``tfidf_matrix``. Returns the fitted
    classifier together with the fitted LabelEncoder.
    """
    authors = txts['author'].values
    label_encoder = LabelEncoder()
    label_encoder.fit(authors)
    targets = label_encoder.transform(authors)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(tfidf_matrix, targets)
    return forest, label_encoder

def return_top_word_importances(tfidf_matrix, model):
    """Compute SHAP values of ``model`` for every row of ``tfidf_matrix``.

    The matrix is densified with ``toarray()`` before it is handed to
    ``shap.TreeExplainer``; the explainer's output is returned unchanged.
    """
    tree_explainer = shap.TreeExplainer(model)
    dense_features = tfidf_matrix.toarray()
    return tree_explainer.shap_values(dense_features)

def _select_class_shap(shap_matrix, class_index, n_classes):
    """Return the per-sample SHAP rows that belong to one class."""
    # NOTE(review): recent shap releases appear to return one
    # (n_samples, n_features, n_classes) array instead of a per-class list.
    # That layout is assumed only when the leading axis cannot be the class
    # axis; otherwise the original per-class indexing is kept.
    if (
        isinstance(shap_matrix, np.ndarray)
        and shap_matrix.ndim == 3
        and shap_matrix.shape[0] != n_classes
        and shap_matrix.shape[-1] == n_classes
    ):
        return shap_matrix[:, :, class_index]
    return shap_matrix[class_index]


def return_top_words_per_author(shap_matrix, txts, le, words=None, n_top=30):
    """Rank words per author by mean SHAP value over that author's texts.

    Parameters
    ----------
    shap_matrix : sequence of arrays or ndarray
        SHAP values indexable by encoded class, each entry holding one row
        per text and one column per word.
    txts : pandas.DataFrame
        Frame with an ``'author'`` column aligned with the SHAP rows.
    le : fitted label encoder
        Object whose ``transform`` maps an author name to its class index.
    words : array-like of str, optional
        Vocabulary aligned with the SHAP columns. When omitted, the
        module-level ``words`` variable is used, as earlier callers expect.
    n_top : int, optional
        Number of words kept per author (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``author``, ``top_words`` and ``top_scores``, with each
        author's words in descending score order. Empty when ``txts`` has
        no authors.

    Raises
    ------
    NameError
        If ``words`` is not given and no module-level ``words`` exists.
    """
    if words is None:
        # Backward compatibility: the vocabulary used to be read from a
        # global instead of being passed in explicitly.
        try:
            words = globals()['words']
        except KeyError:
            raise NameError("name 'words' is not defined; pass words=...") from None
    vocabulary = np.asarray(words).ravel()
    n_classes = len(getattr(le, 'classes_', ()))

    columns = ['author', 'top_words', 'top_scores']
    author_column = txts['author']
    frames = []
    for author in author_column.unique():
        class_index = le.transform([author])[0]
        in_author = np.isin(author_column, author)
        class_shap = _select_class_shap(shap_matrix, class_index, n_classes)
        # asarray/squeeze flatten the column means to a plain 1-D vector.
        mean_shap = np.atleast_1d(np.squeeze(np.asarray(class_shap[in_author, :].mean(0))))
        # Negating the scores makes argsort return descending order.
        order = np.argsort(mean_shap * -1)[:n_top]
        frame = pd.DataFrame(dict(top_words=vocabulary[order], top_scores=mean_shap[order]))
        frame['author'] = author
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)[columns]

def return_top_words_per_author_pipeline(txts, tfidf_matrix):
    """Run the full model -> SHAP -> per-author ranking chain.

    Fits the author classifier, computes its SHAP values on ``tfidf_matrix``
    and returns the data frame produced by ``return_top_words_per_author``.
    """
    fitted_model, author_encoder = generate_model(txts, tfidf_matrix)
    shap_values = return_top_word_importances(tfidf_matrix, fitted_model)
    return return_top_words_per_author(shap_values, txts, author_encoder)