In [1]:
import sys
import os

In [2]:
from importlib import reload

In [3]:
# Put <two directories above the working directory>/src on the import path so
# the project modules imported further down can be found.
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))

In [4]:
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import figure_factory as FF
import plotly.graph_objs as pogo
from plotly.graph_objs import Marker, Line, Data

# Prepare plotly for inline output in this notebook (iplot is used further down).
# NOTE(review): connected=True presumably loads plotly.js from the web instead of
# embedding it in the notebook -- confirm against the installed plotly version.
init_notebook_mode(connected=True)

In [5]:
from IPython.display import display, HTML

In [6]:
# Absolute path of the repository root, taken as two levels above the working
# directory; later cells use it to locate the report output folder.
main_repo_dir = os.path.abspath('../..')

# Make <repo>/src importable. The append is skipped when the entry is already
# present, so re-running this cell does not pile up duplicate sys.path entries.
src_dir = os.path.join(main_repo_dir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [7]:
import numpy
import scipy
import pandas

In [8]:
import mysql_utils
import build_local_collection
import find_best_related_docs
import graph_plot_utils

In [9]:
# Re-import the module so that edits made to its source file since the first
# import are picked up without restarting the kernel.
reload(build_local_collection)

<module 'build_local_collection' from '/home/immersinn/gits/rssfeed_link_collector/src/build_local_collection.py'>

In [10]:
# Re-import the module so that edits made to its source file since the first
# import are picked up without restarting the kernel.
reload(find_best_related_docs)

<module 'find_best_related_docs' from '/home/immersinn/gits/rssfeed_link_collector/src/find_best_related_docs.py'>

## Get the Data, Run the Process

In [11]:
# Address of the article under study; it is only used as the "Document Source"
# link in the generated HTML report.
url = "http://www.cnn.com/2017/02/23/politics/fbi-refused-white-house-request-to-knock-down-recent-trump-russia-stories/index.html"

In [12]:
# The query document: a title plus a one-paragraph summary. The summary is a
# single string assembled from adjacent string literals inside the parentheses.
doc = {"title" : "FBI refused White House request to knock down recent Trump-Russia stories",
       "summary" : ("Washington (CNN) The FBI rejected a recent White House request to publicly knock " 
       "down media reports about communications between Donald Trump's associates and Russians known "
       "to US intelligence during the 2016 presidential campaign, multiple US officials briefed on the "
       "matter tell CNN.")}

In [94]:
# Build the local collection around `doc`. Of the four returned objects, docs,
# words and word_info are indexed by level key ('l01' / 'l02') in later cells.
# NOTE(review): the cutoff/restriction values are passed through as-is; their
# exact meaning is defined in build_local_collection -- confirm there.
docs, words, word_info, bow = build_local_collection.get_localWordsAndDocs(
    doc,
    l01_ndoc_cutoff=50,
    l02_restrict={
        'n_docs_bg': {'max': 50},
        'n_docs_qw': {'min': 3},
        'ratio': 25,
    },
    verbose=True,
)

Init-ing tools...
Starting Level 0...
Starting Level 1...
Starting Level 2...
Collecting data, getting BOWs and finishing up...


In [95]:
# Size of the level-1 ('l01') document collection.
len(docs['l01'])

97

In [96]:
# Size of the level-2 ('l02') document collection.
len(docs['l02'])

98

In [97]:
# Run both ranking approaches on the collected documents. Later cells read the
# 'jms_score' and 'spectral' entries of the returned mapping.
# NOTE(review): L, cutoff, min_grp_size and graph_edge_cutoff are tuning values
# whose semantics live in find_best_related_docs -- confirm there.
results = find_best_related_docs.compare_methods(
    docs,
    words,
    bow,
    L=1.1,
    cutoff=0.03,
    min_grp_size=10,
    use_orig_bow_words=True,
    verbose=True,
    graph_edge_cutoff=0.1,
)

Collecting words and creating BOWS...
Calculating scores ...
Finding best docs...
	JMS Method...
	Spectral Method...


## Formatting Tools

In [59]:
def format_title(title, max_len):
    """Shorten `title` for display.

    Returns `title` unchanged when it has at most `max_len` characters;
    otherwise returns its first `max_len` characters followed by "...".
    """
    fits = len(title) <= max_len
    return title if fits else title[:max_len] + "..."


def ft(tit):
    """Shorten a title to 80 characters (plus "..." when it was cut)."""
    return format_title(tit, 80)


def fs(val):
    """Format a numeric score as text with three decimal places."""
    return "{:.3f}".format(val)

In [60]:
def adjust_column_widths(ff_table, n_cols=2, new_xs=[1.]):
    """Reposition the text annotations of a table figure, in place.

    The entries of ``ff_table['layout']['annotations']`` are walked in order
    and the position of each entry modulo `n_cols` is treated as its column.
    Entries in column 0 are left alone; an entry in column ``c > 0`` gets its
    ``'x'`` value replaced by ``new_xs[c - 1]``.

    `new_xs` is only read, never modified. It needs at least ``n_cols - 1``
    values, otherwise indexing it raises IndexError. Returns None.
    """
    annotations = ff_table['layout']['annotations']
    for position, cell in enumerate(annotations):
        column = position % n_cols
        if column == 0:
            # First column keeps whatever x position it already has.
            continue
        cell['x'] = new_xs[column - 1]

In [61]:
def score_table_prep(df, filename):
    """Write a two-column Title/Score plotly table for `df` to an HTML file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide `title` and `score` columns. It is not modified; the
        formatted columns are built on a copy.
    filename : str
        Output target handed to plotly's offline `plot`.

    Returns
    -------
    str
        The value returned by `plot` with its first seven characters removed
        (presumably the leading 'file://' scheme -- confirm against the
        installed plotly version).
    """
    # `assign` returns a new frame, so the caller's DataFrame does not gain
    # 'Title' / 'Score' columns as a side effect of rendering the table.
    display_df = df.assign(Title=df.title.apply(ft),
                           Score=df.score.apply(fs))[['Title', 'Score']]

    table = FF.create_table(display_df)
    adjust_column_widths(table)

    # auto_open=False: only write the file, do not launch a browser.
    url = plot(table, filename=filename, auto_open=False)
    return url[7:]

# Content Format

In [62]:
def word_ids_2_df(word_ids):
    """Look up the words for `word_ids` and return them as a DataFrame.

    `word_ids` is converted to a list and passed to
    ``mysql_utils.query_idWordLookup``. The result is iterated with
    ``.items()``, so it is expected to be a mapping of id -> word.

    Returns a DataFrame with a single 'word' column, indexed by 'word_id'.
    """
    id_to_word = mysql_utils.query_idWordLookup(list(word_ids))

    rows = []
    for word_id, word in id_to_word.items():
        rows.append({'word_id': word_id, 'word': word})

    lookup = pandas.DataFrame(rows)
    lookup.index = lookup['word_id']
    return lookup[['word']]

## Main Article

In [98]:
# Convenience aliases for the query article's title and summary text.
orig_article_title = doc['title']
orig_article_content = doc['summary']

In [99]:
# Level-1 word statistics keyed by word id. `set_index` returns a new frame, so
# the table stored in word_info['l01'] keeps its original index, and the
# 'word_id' column moves into the index instead of remaining a data column.
wil01 = word_info['l01'].set_index('word_id')

In [100]:
# Fetch the word text for every word id present in the level-1 statistics.
words_l01 = word_ids_2_df(list(wil01.index))

In [101]:
# Put the word text next to the per-word counts (rows are matched on the
# word_id index). A 'word' column left over from an earlier run is dropped
# first, so executing this cell again does not fail on overlapping columns.
wil01 = wil01.drop('word', axis=1, errors='ignore').join(words_l01)

In [128]:
# Keep only the columns shown in the report, with the word text first.
wil01 = wil01[['word', 'n_docs', 'n_total', 'used']]

In [129]:
# Peek at the first rows of the level-1 word table.
wil01.head()

Unnamed: 0_level_0,word,n_docs,n_total,used
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
27973,knock,11,13,True
12578,briefed,13,15,True
6327,publicly,35,36,True
4148,trumprussia,15,15,True
2900,russians,34,40,True


## Level 1 Information

In [130]:
# Word text for the level-1 query words (the ids stored under words['l01']).
qwl01 = word_ids_2_df(words['l01'])
# Display the lookup result.
qwl01

Unnamed: 0_level_0,word
word_id,Unnamed: 1_level_1
12578,briefed
4148,trumprussia
27973,knock
2294,associates
6327,publicly
2900,russians


In [131]:
# Flag each level-1 word by whether it is one of the level-1 query words, then
# list the flagged words first. The set of query words is built once and `isin`
# tests membership in a single pass; `assign` returns a new frame rather than
# writing a column into one that may be a slice of another frame.
wil01 = (wil01
         .assign(used=wil01.word.isin(set(qwl01.word)))
         .sort_values(by='used', ascending=False))

In [132]:
# Render the level-1 word table with plotly's figure factory and write it to an
# HTML file without opening a browser (auto_open=False).
wil01_table = FF.create_table(wil01)
wil01_url = plot(wil01_table, filename="images/wil01_table_table.html", auto_open=False)
# Drop the first seven characters of the returned location -- presumably the
# 'file://' scheme; confirm against the installed plotly version.
wil01_url = wil01_url[7:]

In [133]:
# Number of level-1 documents.
nl01_docs = len(docs['l01'])

In [134]:
# Level-2 word statistics indexed by background word id. drop=False keeps
# 'word_id_bg' available as a regular column as well, and because `set_index`
# returns a new frame the table held in word_info['l02'] is not re-indexed.
wil02 = word_info['l02'].set_index('word_id_bg', drop=False)
wil02.head()

Unnamed: 0_level_0,word_id_bg,n_docs_bg,n_total_bg,frac_bg,word_id_qw,n_docs_qw,n_total_qw,frac_qw,ratio
word_id_bg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,3,7586,11670,0.168867,16.0,1.0,1,0.008929,0.052873
11,11,803,932,0.017875,65.0,4.0,4,0.035714,1.997999
14,14,635,694,0.014135,71.0,1.0,1,0.008929,0.631651
16,16,435,563,0.009683,75.0,4.0,4,0.035714,3.688259
18,18,364,390,0.008103,82.0,1.0,1,0.008929,1.101918


In [135]:
# (rows, columns) of the level-2 word statistics table.
wil02.shape

(1984, 9)

In [136]:
# Word text for the level-2 query words (the ids stored under words['l02']).
qwl02 = word_ids_2_df(words['l02'])

In [137]:
# Display the level-2 query words.
qwl02

Unnamed: 0_level_0,word
word_id,Unnamed: 1_level_1
1279,selfishness
1189,handed
183,columnist
344,barker
455,jacob
1207,newsmaking
1630,worldviews
143,communication


In [138]:
# Attach the level-2 statistics to each query word. The join matches on the
# index, and how='left' keeps every query word even without a matching row.
qwl02 = qwl02.join(wil02, how='left')

In [139]:
# Turn the numeric ratio / fraction columns into fixed-precision text for
# display, then keep only the columns shown in the report table.
for column, template in (('ratio', "{:.1f}"),
                         ('frac_bg', "{:.4f}"),
                         ('frac_qw', "{:.4f}")):
    qwl02[column] = qwl02[column].apply(template.format)
qwl02 = qwl02[['word', 'n_docs_bg', 'n_total_bg', 'frac_bg', 'n_docs_qw', 'n_total_qw', 'frac_qw', 'ratio']]

In [140]:
# Display the formatted level-2 query-word table.
qwl02

Unnamed: 0_level_0,word,n_docs_bg,n_total_bg,frac_bg,n_docs_qw,n_total_qw,frac_qw,ratio
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1279,selfishness,2,2,0.0,1.0,1,0.0089,200.5
1189,handed,21,22,0.0005,9.0,9,0.0804,171.9
183,columnist,41,42,0.0009,11.0,11,0.0982,107.6
344,barker,3,3,0.0001,2.0,2,0.0179,267.4
455,jacob,7,7,0.0002,10.0,10,0.0893,573.0
1207,newsmaking,2,2,0.0,1.0,1,0.0089,200.5
1630,worldviews,2,2,0.0,1.0,1,0.0089,200.5
143,communication,47,56,0.001,22.0,24,0.1964,187.7


In [143]:
# Render the level-2 query-word table with plotly's figure factory and write it
# to an HTML file without opening a browser (auto_open=False).
qwl02_table = FF.create_table(qwl02)
qwl02_url = plot(qwl02_table, filename="images/qwl02_table_table.html", auto_open=False)
# Drop the first seven characters of the returned location -- presumably the
# 'file://' scheme; confirm against the installed plotly version.
qwl02_url = qwl02_url[7:]

## Section 3: Results

### JMS Smoothing Results

In [79]:
# Per-document results of the JMS method (title / summary / score columns are
# read from this frame below).
jms_docs = results['jms_score']['docs_info']

In [80]:
# Number of rows (documents) in the JMS result; also printed in the report.
jms_n_docs = jms_docs.shape[0]
jms_n_docs

219

In [81]:
# For several cut-offs, count how many of the first `limit` JMS result ids are
# absent from the level-1 document set.
for limit in (5, 10, 15, 25, 50):
    top_ids = jms_docs.index[:limit]
    jms_oo1 = sum(1 for doc_id in top_ids if doc_id not in docs['l01'])
    print("Nbr of docs in top {} not from 1st wave: {}".format(limit, jms_oo1))

Nbr of docs in top 5 not from 1st wave: 1
Nbr of docs in top 10 not from 1st wave: 1
Nbr of docs in top 15 not from 1st wave: 2
Nbr of docs in top 25 not from 1st wave: 5
Nbr of docs in top 50 not from 1st wave: 18


In [82]:
# Show title, summary and score for the first ten rows of the JMS result.
jms_docs[['title', 'summary', 'score']].head(10)

Unnamed: 0_level_0,title,summary,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36469,FBI refused White House request to knock down ...,The FBI rejected a recent White House request ...,1.0
15773,Trump aides spoke regularly to Russian officia...,High-level advisers close to then-presidential...,0.326538
966,Donald Trump Denounces ‘Un-American’ Intellige...,"<a href=""http://www.breitbart.com/big-governme...",0.319277
15552,US officials corroborate aspects of Russia dos...,"For the first time, US investigators say they ...",0.318796
15698,White House was warned Flynn could be blackmai...,The Justice Department warned the Trump admini...,0.256764
15880,Former Clinton aides apoplectic over Trump cam...,Hillary Clinton's former campaign aides are ap...,0.256621
672,CNN’s Acosta: Trump Only Calling on Conservati...,"<a href=""http://www.breitbart.com/video/2017/0...",0.245172
8145,"Dems see disparity in handling of Clinton, Rus...",WASHINGTON (AP) -- Democrats are critic...,0.227264
115,"Maxine Waters: Trump’s Administration, Associa...","<a href=""http://www.breitbart.com/video/2017/0...",0.218727
274,KLEIN – Trump-Russia Claims Rethought: How the...,"<a href=""http://www.breitbart.com/jerusalem/20...",0.204384


In [83]:
# Write the JMS Title/Score table to an HTML file and keep the returned
# location; the report embeds it in an <iframe>.
jms_dt_url = score_table_prep(jms_docs, filename="images/jms_docs_table.html")

### Spectral Results

In [84]:
# Unpack the spectral method's outputs: per-document results, cluster info,
# the graph object handed to the plotting helper below, and the entry stored
# under 'A'.
# NOTE(review): 'A' is presumably a matrix (e.g. adjacency) -- confirm in
# find_best_related_docs. spec_out and A are not used by the visible cells.
spec_docs = results['spectral']['docs_info']
spec_out = results['spectral']['cluster_info']
ig = results['spectral']['igraph']
A = results['spectral']['A']

In [85]:
# Presumably the number of edges in the graph (`es` looks like igraph's edge
# sequence) -- confirm against the igraph API.
len(ig.es())

1276

In [86]:
# Number of rows (documents) in the spectral result; also printed in the report.
spec_n_docs = spec_docs.shape[0]
spec_n_docs

109

In [87]:
# For several cut-offs, count how many of the first `limit` spectral result ids
# are absent from the level-1 document set.
for limit in (5, 10, 15, 25, 50):
    top_ids = spec_docs.index[:limit]
    spec_oo1 = sum(1 for doc_id in top_ids if doc_id not in docs['l01'])
    print("Nbr of docs in top {} not from 1st wave: {}".format(limit, spec_oo1))

Nbr of docs in top 5 not from 1st wave: 1
Nbr of docs in top 10 not from 1st wave: 1
Nbr of docs in top 15 not from 1st wave: 2
Nbr of docs in top 25 not from 1st wave: 5
Nbr of docs in top 50 not from 1st wave: 19


In [88]:
# Show title, summary and score for the first ten rows of the spectral result.
spec_docs[['title', 'summary', 'score']].head(10)

Unnamed: 0_level_0,title,summary,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36469,FBI refused White House request to knock down ...,The FBI rejected a recent White House request ...,1.0
15773,Trump aides spoke regularly to Russian officia...,High-level advisers close to then-presidential...,0.326538
966,Donald Trump Denounces ‘Un-American’ Intellige...,"<a href=""http://www.breitbart.com/big-governme...",0.319277
15552,US officials corroborate aspects of Russia dos...,"For the first time, US investigators say they ...",0.318796
15698,White House was warned Flynn could be blackmai...,The Justice Department warned the Trump admini...,0.256764
15880,Former Clinton aides apoplectic over Trump cam...,Hillary Clinton's former campaign aides are ap...,0.256621
672,CNN’s Acosta: Trump Only Calling on Conservati...,"<a href=""http://www.breitbart.com/video/2017/0...",0.245172
8145,"Dems see disparity in handling of Clinton, Rus...",WASHINGTON (AP) -- Democrats are critic...,0.227264
115,"Maxine Waters: Trump’s Administration, Associa...","<a href=""http://www.breitbart.com/video/2017/0...",0.218727
274,KLEIN – Trump-Russia Claims Rethought: How the...,"<a href=""http://www.breitbart.com/jerusalem/20...",0.204384


In [89]:
# Write the spectral Title/Score table to an HTML file and keep the returned
# location; the report embeds it in an <iframe>.
spec_dt_url = score_table_prep(spec_docs, filename="images/spec_docs_table.html")

In [90]:
# Build the plotly figure for the graph with the project helper.
# NOTE(review): show_groups=True presumably marks the detected groups in the
# drawing -- confirm in graph_plot_utils.
fig = graph_plot_utils.create_graph_fig(ig, show_groups=True)

# Show the figure inline in the notebook.
iplot(fig)

In [91]:
# Write the graph figure to an HTML file without opening a browser.
graph_plot_url = plot(fig, filename='images/spec23_groupsGraph.html', auto_open=False,)
# Drop the first seven characters of the returned location -- presumably the
# 'file://' scheme; confirm against the installed plotly version.
graph_plot_url = graph_plot_url[7:]

## Generate HTML as String and Write to File

In [144]:
# Assemble the dashboard page as one HTML string. The article link, title and
# summary are spliced in directly, and the plotly tables / graph written by the
# earlier cells are embedded through <iframe> tags whose src values come from
# the *_url variables. A backslash at the end of a line inside the literal
# joins it with the next line, so each <iframe ...> opening tag is emitted as a
# single line of HTML.
html_string = '''
<html>
    <head>
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
        <style>body{ margin:0 100; background:whitesmoke; }</style>
    </head>
    <body>
        <h1>Comparison of JMS and Spectral Graph Methods</h1>
        
        <!-- *** Section 1 *** --->
        <h2>Document Overview</h2>
        
            
            <h4>Document Source:</h4>
                <a href="''' + url + '''">Link</a>
            <h4>Document Title:</h4>
                <p><b>''' + doc['title'] + '''</b></p>
            <h4>Document Content:</h4>
                <p>"''' + doc['summary'] + '''"</p>
                
        <h2>Words Used</h2>
        
            <h3>Words from Document</h3>
        
            <iframe style="padding:40px" width="100%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="left"\
    src="''' + wil01_url + '''"></iframe>
    
            <h3>Words from Query Level 2</h3>

            <iframe style="padding:40px" width="100%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="right"\
    src="''' + qwl02_url + '''"></iframe>
        
            

        <!-- *** Section 3 *** --->
        <h2>"Top Docs" Review</h2>
        
                
            <table width="100%">
                <tr>
                    <td width="100%"><h3>JMS Top Docs:</h3></td>
                </tr>
                <tr>
                    <td width="100%"><h4>Total Docs: ''' + str(jms_n_docs) + '''</h4></td>
                </tr>
            </table>


            <iframe style="padding:40px" width="100%" height="480" frameborder="0" seamless="seamless" scrolling="yes"\
    src="''' + jms_dt_url + '''"></iframe>
    
    
            <table width="100%">
                <tr>
                    <td width="100%"><h3>Spectral Cluster Top Docs:</h3></td>
                </tr>
                <tr>
                    <td width="100%"><h4>Total Docs: ''' + str(spec_n_docs) + '''</h4></td>
                </tr>
            </table>


            <iframe style="padding:40px" width="100%" height="480" frameborder="0" seamless="seamless" scrolling="yes" \
    src="''' + spec_dt_url + '''"></iframe>
    
    
    
            <h3>Spectral23 Communities Graph</h3>
    
            <iframe style="padding:40px" width="1500" height="1500" frameborder="0" seamless="seamless" scrolling="no"\
    src="''' + graph_plot_url + '''"></iframe>


    
    </body>
</html>'''

In [145]:
# Write the assembled page into the repository's dashboards folder. The `with`
# block closes the file even if the write raises.
report_path = os.path.join(main_repo_dir, 'reports/dashboards/MethCompare.html')
with open(report_path, 'w') as f:
    f.write(html_string)