# Synopsis

# Configuration

In [1]:
slug = 'moby'  # base name of the exported graph file ("<slug>.gexf")
db_file = 'moby.db'  # SQLite file the tables are read from and results are written to
# OCHO = ['chap_num', 'para_num', 'sent_num', 'token_num']
vocab_weight = 'tfidf_sum'  # vocab column used to rank and filter terms
vocab_weight_quantile = .94  # keep terms whose weight is above this quantile of the weight column
vocab_min_n = 3  # keep terms whose vocab `n` value is at least this
kde_kernel = 'gaussian'  # `kernel` argument given to KernelDensity
kde_bandwidth = 5000  # `bandwidth` argument given to KernelDensity (alternative noted here: 2000)
# Number of evenly spaced points at which each term's KDE is evaluated.
kde_samples = 1000

# Libraries

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.neighbors import KernelDensity as KDE

# Pragmas

In [3]:
from IPython.display import display, HTML

In [4]:
%matplotlib inline
%pwd

'/Users/leonardramsey/Dropbox/DS5559/Notes'

# Functions

In [5]:
def get_term_id(vocab, term_str):
    """Return the index label of the first `vocab` row whose term_str equals `term_str`.

    Raises IndexError when no row matches.
    """
    is_match = (vocab.term_str == term_str).values
    return vocab.index[is_match][0]

def get_term_str(vocab, term_id):
    """Return the term_str stored in `vocab` under the index label `term_id`.

    Raises KeyError when `term_id` is not in the index.
    """
    entry = vocab.loc[term_id]
    return entry['term_str']

# Process

## Import tables from database

In [6]:
# sqlite3's connection context manager only wraps a transaction; it does not
# close the connection. Close it explicitly once both tables are loaded.
db = sqlite3.connect(db_file)
try:
    # Vocabulary rows with stop = 0, indexed by term_id.
    V = pd.read_sql("SELECT * FROM vocab WHERE stop = 0", db, index_col='term_id')
    # Token rows with term_id >= 0; the default integer index is kept.
    K = pd.read_sql("SELECT term_id, term_str FROM token WHERE term_id >= 0", db)
finally:
    db.close()

## Prepare Tokens index

We don't have to flatten the index because we selected only the `term_id` and `term_str` columns from the token table; the default integer index is simply named `offset`.

In [7]:
# Label the token frame's index as 'offset'.
K = K.rename_axis('offset')

In [8]:
K.head(11).T

offset,0,1,2,3,4,5,6,7,8,9,10
term_id,5005,14441,1987,0,8383,3099,15981,15076,0,6389,12718
term_str,etymology,supplied,by,a,late,consumptive,usher,to,a,grammar,school


## Filter Vocab

In [9]:
# Fail early, listing the available columns, when the configured weight
# column is absent from the vocab table.
if vocab_weight not in V.columns:
    raise KeyError(
        "vocab_weight column {!r} not found in vocab table; available columns: {}".format(
            vocab_weight, list(V.columns)
        )
    )

# Keep terms whose weight is above the configured quantile and whose `n`
# reaches the configured minimum.
weight_cutoff = V[vocab_weight].quantile(vocab_weight_quantile)
V1 = V[(V[vocab_weight] > weight_cutoff) & (V.n >= vocab_min_n)]

KeyError: 'tfidf_sum'

In [None]:
V1.shape[0]

In [None]:
V1.sort_values(vocab_weight, ascending=False).head(10).T

## Get list of top terms

We'll use this later.

In [None]:
# Rank within the filtered vocab (V1) so the top terms are limited to the
# terms that the token filter keeps for the rest of the analysis.
TOP_TERMS = V1.sort_values(vocab_weight, ascending=False).term_str.head(20).tolist()

In [None]:
TOP_TERMS

## Filter Tokens by Vocab

In [None]:
# Keep only tokens whose term_id is in the filtered vocab.
K = K.loc[K['term_id'].isin(V1.index)]

In [None]:
K.head(11).T

## Create arrays of offsets for each term

In [None]:
# One row per term_str, holding the list of offsets at which the term occurs.
B = (
    K.reset_index()
     .groupby(['term_str'])['offset']
     .apply(lambda offs: offs.tolist())
     .to_frame()
)

In [None]:
# Column vector (n, 1) of each term's offsets, the shape KDE.fit is given below.
B['x'] = B.apply(lambda term: np.array(term.offset).reshape(-1, 1), axis=1)

In [None]:
B.head()

## Get KDE for each term

In [None]:
# THIS IS CRUCIAL: the sampling grid runs from 0 to the largest offset in K.
scale_max = K.index.max()
x_axis = np.linspace(0, scale_max, num=kde_samples).reshape(-1, 1)

# Fit one estimator per term on its offsets, then evaluate its log-density
# at every grid point.
B['kde'] = B.apply(
    lambda term: KDE(kernel=kde_kernel, bandwidth=kde_bandwidth).fit(term.x),
    axis=1,
)
B['scores'] = B.apply(lambda term: term.kde.score_samples(x_axis), axis=1)

## Visualize KDE plots

In [None]:
# One row per term: exponentiated scores multiplied by scale_max / kde_samples.
plot_scale = scale_max / kde_samples
PLOTS = B.apply(lambda term: pd.Series(np.exp(term.scores) * plot_scale), axis=1)

In [None]:
FIG = dict(figsize=(15, 5))

In [None]:
PLOTS.loc['ahab'].plot(**FIG)
PLOTS.loc['whale'].plot(**FIG)

In [None]:
PLOTS.loc['ahab'].plot(**FIG)
PLOTS.loc['deck'].plot(**FIG)

In [None]:
PLOTS.loc['ocean'].plot(**FIG)
PLOTS.loc['boat'].plot(**FIG)

## Score Pairs

We generate only unique combinations of terms, not permutations; i.e., the pair `a,b` is treated as the same pair as `b,a`.

In [None]:
# Every (x, y) pair of term labels with x strictly before y.
pairs = pd.DataFrame(
    [(left, right) for left in B.index for right in B.index if left < right],
    columns=['x', 'y'],
)

In [None]:
pairs.head(10).T

## Compute overlap

This takes a while to run.

In [None]:
def overlap(row):
    """Return the trapezoidal area under the pointwise minimum of two PLOTS rows.

    `row.x` and `row.y` are labels looked up in the module-level `PLOTS` frame.
    """
    curve_x = PLOTS.loc[row.x]
    curve_y = PLOTS.loc[row.y]
    shared = np.minimum(curve_x, curve_y)
    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return integrate(shared)

In [None]:
pairs['overlap'] = pairs.apply(overlap, axis=1)

In [None]:
pairs[pairs.x == 'whalemen'].sort_values('overlap', ascending=False).head(10)

In [None]:
def paircorr(row):
    """Return the Pearson correlation between the PLOTS rows labelled row.x and row.y."""
    # Select the two rows directly instead of transposing all of PLOTS on
    # every call.
    return PLOTS.loc[row.x].corr(PLOTS.loc[row.y])

In [None]:
pairs['corr'] = pairs.apply(paircorr, axis=1)

In [None]:
# pairs3[pairs3.x.isin(TOP_TERMS)].set_index(['x','y']).sort_values(['x','overlap'], ascending=False)

In [None]:
# pairs3[pairs3.x.isin(TOP_TERMS)].groupby(['x']).overlap.apply(lambda x: x.sort_values().head(10)).to_frame()

## Skim Top Pairs

In [None]:
pairs.overlap.plot.hist()

In [None]:
pairs[pairs.overlap > .6 ].sort_values('overlap', ascending=False)

In [None]:
# Mirror every pair (x and y swapped) and stack both orientations so each
# term can be looked up in the x column.
pairs2 = pairs.rename(columns=dict(x='y', y='x'))
pairs3 = pd.concat([pairs, pairs2], sort=True)

In [None]:
pairs3.query("x == 'whale'").sort_values('overlap', ascending=False).head(10)

In [None]:
pairs3.query("x == 'ahab'").sort_values('overlap', ascending=False).head(10)

## See related terms for top terms

In [None]:
# Build the HTML table as a list of fragments and join once at the end.
detail_parts = ['<table>']
for rank, term in enumerate(TOP_TERMS, start=1):
    # The ten highest-overlap partners of this term.
    friends = pairs3[pairs3.x == term].sort_values('overlap', ascending=False).head(10)
    detail_parts.append("<tr><td colspan=1><b>{}. {}</b></td></tr>".format(rank, term))
    for friend, score in friends[['y', 'overlap']].values:
        # One '|' per rounded percentage point of overlap.
        bar = round(score * 100) * '|'
        detail_parts.append(
            "<tr><td>{}</td><td style='text-align:left;'>{} ({})</td></tr>".format(friend, bar, score)
        )
detail_parts.append("</table>")
DETAIL = ''.join(detail_parts)

In [None]:
display(HTML(DETAIL))

## Explore term correlations

In [None]:
# Term-by-term matrix of summed overlap; cells with no pair are filled with 1.
CORR = pd.crosstab(
    index=pairs3.x,
    columns=pairs3.y,
    values=pairs3.overlap,
    aggfunc='sum',
).fillna(1)

In [None]:
CORR.head()

In [None]:
def corr_plot_terms(terms, dtm, title='Foo'):
    """Draw an annotated heatmap of the pairwise correlations between columns of `dtm`.

    Parameters
    ----------
    terms : list
        Column labels of `dtm` to correlate. Labels that are not columns of
        `dtm` are skipped and reported with a printed message.
    dtm : DataFrame
        Frame with one column per term.
    title : str
        Text printed before the figure and set as the heatmap title.

    Raises
    ------
    KeyError
        If none of `terms` is a column of `dtm`.
    """
    present = [term for term in terms if term in dtm.columns]
    missing = [term for term in terms if term not in dtm.columns]
    if not present:
        raise KeyError("none of the requested terms are columns of dtm: {}".format(list(terms)))
    if missing:
        print("Skipping terms not found in dtm: {}".format(missing))

    plt.figure(figsize=(20, 20))
    print(title)
    corr = dtm[present].corr()
    ax = sns.heatmap(corr, vmax=.3, annot=True, center=0,
                     cmap='RdYlGn',
                     square=True, linewidths=.5,
                     cbar_kws={"shrink": .5})
    # Put the title on the figure too, so the plot stands on its own.
    ax.set_title(title)
    plt.show()

In [None]:
# Rank within the filtered vocab (V1): the heatmap selects these terms as
# columns of PLOTS.T, which is built from tokens filtered by V1.
terms = V1.sort_values(vocab_weight, ascending=False).term_str.head(20).tolist()

In [None]:
corr_plot_terms(terms, PLOTS.T, title='TEST')

## Export graphs

In [None]:
import networkx as nx

# Take the 1000 highest-overlap pairs and turn each into an (x, y, overlap) tuple.
top_pairs = pairs[['x', 'y', 'overlap']].sort_values('overlap', ascending=False).head(1000)
edges = top_pairs.apply(lambda pair: (pair.x, pair.y, pair.overlap), axis=1).values

# Build the weighted graph and write it to "<slug>.gexf".
G = nx.Graph()
G.add_weighted_edges_from(edges)
nx.write_gexf(G, "{}.gexf".format(slug))

# Save

In [None]:
# sqlite3's connection context manager does not close the connection, so
# open it explicitly and close it in `finally`.
db = sqlite3.connect(db_file)
try:
    # `with db` commits the writes on success and rolls back on error.
    with db:
        pairs.to_sql('term_pair', db, if_exists='replace', index=True)
        PLOTS.T.to_sql('term_kde', db, if_exists='replace', index=True)
#         vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()

In [None]:
# END