In [6]:
import itertools
import time
import cmocean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.plotly as py
import networkx as nx

from nltk.corpus import stopwords
from bokeh.sampledata.autompg import autompg
from bokeh.models import LinearColorMapper, ColorBar
from bokeh.palettes import Viridis256
from bokeh.plotting import figure, save
from bokeh.models import ColumnDataSource, LabelSet, HoverTool, mappers
from bokeh.io import output_notebook, show
from bokeh.transform import factor_cmap
from bokeh import palettes
from plotly.graph_objs import *

from utils_lotr import *


%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 8
plt.style.use('ggplot')

In [7]:
def cmocean_to_plotly(cmap, pl_entries):
    h = 1.0/(pl_entries-1)
    pl_colorscale = []
    
    for k in range(pl_entries):
        C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
        pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])
        
    return pl_colorscale

In [2]:
scrape = False
save = False
base_url = 'http://www.tk421.net/lotr/film/'


n_top_words = 7
n_topics = 15
n_iter = 500
tsne_components = 2
tsne_perplexity = 20

vect_mode = 'Count'
transform_mode = 'LDA'
threshold_confidence = True
threshold = 0.5

cv_params = {
    'stop_words': 'english', 
    'min_df': 2, 
    'max_df': 0.9,
    'ngram_range': (1,3),
    'analyzer': 'word',
}

stops = set(stopwords.words('english'))

In [3]:
script_subsets = scrape_lotr(base_url, scrape, save)
df = clean_transcript(script_subsets)
df.drop_duplicates(['text'], inplace=True)

df = lowercase(df)
df = remove_stops(df, stops)
df['text'] = df.text.apply(lambda x: ' '.join(x))
df['character'] = df.character.apply(lambda x: ' '.join(x))

df_interactions, interact_matrix = get_interactions_df(df)


X_topics, reducer, cv = transform_text_data(df, n_topics, cv_params, 
                                            vect_mode=vect_mode,
                                            transform_mode=transform_mode)

LoTR transcripts scraped, time it took: 0.450
Interactions computed, time it took: 13.997



n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21





LDA transformation done, time it took: 6.790


In [4]:
df_interactions, interact_matrix = get_interactions_df(df)


df_interactions['character1'] = df_interactions.apply(lambda x: x['characters'].split('_')[0], axis=1)
df_interactions['character2'] = df_interactions.apply(lambda x: x['characters'].split('_')[1], axis=1)
df_interactions.reset_index(inplace=True, drop=True)


characters = df.character.unique()

df_inter_group = df_interactions.groupby('character1').sum()
df_inter_group.num_interactions.fillna(0, inplace=True)
df_inter_group['character'] = df_inter_group.index
df_inter_group2 = df_interactions.groupby('character2').sum()
df_inter_group2.num_interactions.fillna(0, inplace=True)
df_inter_group2['character'] = df_inter_group2.index
df_inter_group_full = df_inter_group.merge(df_inter_group2, on='character', how='outer')
df_inter_group_full['num_interactions'] = df_inter_group_full.num_interactions_x.values + df_inter_group_full.num_interactions_y.values
df_inter_group_full['num_interactions'] = df_inter_group_full.max(axis=1)
df_inter_group_full = df_inter_group_full[df_inter_group_full.character != 'narrator']

df_inter_narrator = df_interactions[df_interactions.character1 == 'narrator']

df_inter_highest_group = df_inter_group_full[df_inter_group_full.num_interactions >= 5].reset_index(
    drop=True).sort_values('num_interactions', ascending=False).reset_index(drop=True)


N = df_inter_highest_group.character.nunique()
popular_chars = df_inter_highest_group.character.unique()

Interactions computed, time it took: 14.105


In [9]:
dfclean = df[df.character.isin(popular_chars)]
dfn = get_network_interactions_df(dfclean)

dfn.drop(['target'], axis=1, inplace=True)
s = dfn.apply(lambda x: pd.Series(x['targets']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'target'
dfn = dfn.drop('targets', axis=1).join(s)

G = nx.from_pandas_dataframe(dfn, 'character', 'target', ['text'])
pos=nx.kamada_kawai_layout(G) 
nx.set_node_attributes(G, pos, 'pos')

max_len = len(G.nodes())
matter = cmocean_to_plotly(cmocean.cm.matter, max_len)


dmin=1
ncenter=0
for n in pos:
    x,y=pos[n]
    d=(x-0.5)**2+(y-0.5)**2
    if d<dmin:
        ncenter=n
        dmin=d

edge_trace = Scatter(
    x=[],
    y=[],
    line=Line(width=0.5,color='#888'),
    hoverinfo='character',
    mode='lines')

for edge in G.edges():
    x0, y0 = G.node[edge[0]]['pos']
    x1, y1 = G.node[edge[1]]['pos']
    edge_trace['x'] += [x0, x1, None]
    edge_trace['y'] += [y0, y1, None]

node_trace = Scatter(
    x=[],
    y=[],
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=Marker(
        showscale=True,
        # colorscale options
        # 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' | 'Portland' |
        # Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YIOrRd' | 'YIGnBu'
        colorscale=matter,
        reversescale=True,
        color=[],
        size=[],
        opacity=0.95,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=2)))

char_list = []
for node in G.nodes():
    x, y = G.node[node]['pos']
    node_trace['x'].append(x)
    node_trace['y'].append(y)
    char_list.append(node)
    
for node, adjacencies in G.adjacency():
    node_trace['marker']['color'].append(len(adjacencies))
    node_trace['marker']['size'].append(len(adjacencies))
    node_info = '{} - number of connections: {}'.format(node, str(len(adjacencies)))
    node_trace['text'].append(node_info)


fig = Figure(data=Data([edge_trace, node_trace]),
             layout=Layout(
                title='Lord of the Rings Connections Graph',
                titlefont=dict(size=18),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False),
             autosize=False,
             width=1200,
             height=900))


py.iplot(fig, filename='LotrNetworkX_V22')