In [1]:
import html
import os
import sys

In [2]:
from importlib import reload

In [3]:
# Make the repository's `src` directory (two levels above the working
# directory) importable from this notebook.
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))

In [4]:
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import figure_factory as FF
import plotly.graph_objs as pogo
from plotly.graph_objs import Marker, Line, Data

init_notebook_mode(connected=True)

In [5]:
from IPython.display import display, HTML

In [6]:
# Absolute path of the repository root (two levels above the working directory);
# reused later to build the output path of the generated report.
main_repo_dir = os.path.abspath(os.path.join('../..'))

# An earlier cell already appends this same `src` directory, and re-running this
# cell would append it again; only add it when it is not on the path yet.
src_dir = os.path.join(main_repo_dir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [7]:
import numpy
import scipy
import pandas

In [8]:
import mysql_utils
import build_local_collection
import find_best_related_docs
import graph_plot_utils

In [9]:
reload(build_local_collection)

<module 'build_local_collection' from '/home/immersinn/gits/rssfeed_link_collector/src/build_local_collection.py'>

In [10]:
reload(find_best_related_docs)

<module 'find_best_related_docs' from '/home/immersinn/gits/rssfeed_link_collector/src/find_best_related_docs.py'>

## Get the Data, Run the Process

In [11]:
url = "http://www.cnn.com/2017/02/23/politics/fbi-refused-white-house-request-to-knock-down-recent-trump-russia-stories/index.html"

In [12]:
# Title and summary text of the article that is fed into the pipeline below.
doc = dict(
    title="FBI refused White House request to knock down recent Trump-Russia stories",
    summary=(
        "Washington (CNN) The FBI rejected a recent White House request to publicly knock "
        "down media reports about communications between Donald Trump's associates and Russians known "
        "to US intelligence during the 2016 presidential campaign, multiple US officials briefed on the "
        "matter tell CNN."
    ),
)

In [13]:
# Build the local collection around `doc`. The call returns four objects that the
# rest of the notebook indexes by level ('l01', 'l02').
# NOTE(review): the meaning of the cutoff / restriction values is defined in
# build_local_collection, not here — confirm before tuning them.
docs, words, word_info, bow = build_local_collection.get_localWordsAndDocs(
    doc,
    l01_ndoc_cutoff=50,
    l02_restrict=dict(
        n_docs_bg=dict(max=50),
        n_docs_qw=dict(min=3),
        ratio=10,
    ),
    verbose=True,
)

Init-ing tools...
Starting Level 0...
Starting Level 1...
Starting Level 2...
Collecting data, getting BOWs and finishing up...


In [14]:
len(docs['l01'])

97

In [15]:
len(docs['l02'])

225

In [16]:
# Run the method comparison; `results` is later indexed by 'jms_score' and 'spectral'.
# NOTE(review): L, cutoff and min_grp_size are tuning values interpreted by
# find_best_related_docs.compare_methods — their semantics are not visible here.
results = find_best_related_docs.compare_methods(
    docs,
    words,
    bow,
    L=1.1,
    cutoff=0.029,
    min_grp_size=10,
    use_orig_bow_words=True,
    verbose=True,
)

Collecting words and creating BOWS...
Calculating scores ...
Finding best docs...
	JMS Method...
	Spectral Method...


## Formatting Tools

In [17]:
def format_title(title, max_len):
    """Return `title` unchanged when it has at most `max_len` characters;
    otherwise return its first `max_len` characters followed by "..."."""
    if len(title) <= max_len:
        return title
    return title[:max_len] + "..."


def ft(tit):
    """Shorten a title to its first 80 characters (plus "...") when it is longer."""
    return format_title(tit, 80)


def fs(val):
    """Format a number as a string with three decimal places."""
    return "{:.3f}".format(val)

In [18]:
def adjust_column_widths(ff_table, n_cols=2, new_xs=[1.]):
    """Overwrite the 'x' value of annotations outside the first column, in place.

    The annotation at position i of ff_table['layout']['annotations'] is treated
    as belonging to column i % n_cols. Column 0 is left untouched; column c > 0
    gets x = new_xs[c - 1]. Nothing is returned.
    """
    annotations = ff_table['layout']['annotations']
    for position, annotation in enumerate(annotations):
        column = position % n_cols
        if column == 0:
            continue
        annotation['x'] = new_xs[column - 1]

In [19]:
def score_table_prep(df, filename):
    """Write a two-column (Title, Score) plotly table for `df` to `filename`.

    Parameters
    ----------
    df : DataFrame with `title` and `score` columns. It is not modified.
    filename : output path handed to plotly's offline `plot`.

    Returns
    -------
    The string returned by `plot`, with a leading "file://" removed when present.
    """
    # `assign` returns a new frame, so the caller's DataFrame does not silently
    # gain 'Title' / 'Score' columns every time a table is rendered.
    display_df = df.assign(Title=df.title.apply(ft),
                           Score=df.score.apply(fs))[['Title', 'Score']]

    table = FF.create_table(display_df)
    adjust_column_widths(table)

    url = plot(table, filename=filename, auto_open=False)

    # Strip the URL scheme explicitly instead of blindly dropping seven
    # characters, which would corrupt any return value lacking the prefix.
    prefix = "file://"
    if url.startswith(prefix):
        url = url[len(prefix):]

    return url

# Content Format

## Section 3: Results

### JM Smoothing (JMS) Results

In [20]:
jms_docs = results['jms_score']['docs_info']

In [21]:
jms_docs.shape

(223, 5)

In [22]:
# For each cutoff, count how many of the first `limit` index entries of jms_docs
# fail the membership test against docs['l01'].
for limit in (5, 10, 15, 25, 50):
    jms_oo1 = sum(1 for doc_id in jms_docs.index[:limit] if doc_id not in docs['l01'])
    print("Nbr of docs in top {} not from 1st wave: {}".format(limit, jms_oo1))

Nbr of docs in top 5 not from 1st wave: 1
Nbr of docs in top 10 not from 1st wave: 1
Nbr of docs in top 15 not from 1st wave: 2
Nbr of docs in top 25 not from 1st wave: 5
Nbr of docs in top 50 not from 1st wave: 18


In [23]:
jms_docs[['title', 'summary', 'score']].head(10)

Unnamed: 0_level_0,title,summary,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36469,FBI refused White House request to knock down ...,The FBI rejected a recent White House request ...,1.0
15773,Trump aides spoke regularly to Russian officia...,High-level advisers close to then-presidential...,0.326538
966,Donald Trump Denounces ‘Un-American’ Intellige...,"<a href=""http://www.breitbart.com/big-governme...",0.319277
15552,US officials corroborate aspects of Russia dos...,"For the first time, US investigators say they ...",0.318796
15698,White House was warned Flynn could be blackmai...,The Justice Department warned the Trump admini...,0.256764
15880,Former Clinton aides apoplectic over Trump cam...,Hillary Clinton's former campaign aides are ap...,0.256621
672,CNN’s Acosta: Trump Only Calling on Conservati...,"<a href=""http://www.breitbart.com/video/2017/0...",0.245172
8145,"Dems see disparity in handling of Clinton, Rus...",WASHINGTON (AP) -- Democrats are critic...,0.227264
115,"Maxine Waters: Trump’s Administration, Associa...","<a href=""http://www.breitbart.com/video/2017/0...",0.218727
274,KLEIN – Trump-Russia Claims Rethought: How the...,"<a href=""http://www.breitbart.com/jerusalem/20...",0.204384


In [24]:
jms_dt_url = score_table_prep(jms_docs, filename="images/jms_docs_table.html")

### Spectral Results

In [25]:
spec_docs = results['spectral']['docs_info']
spec_out = results['spectral']['cluster_info']
ig = results['spectral']['igraph']

In [26]:
spec_docs.shape

(109, 5)

In [27]:
# For each cutoff, count how many of the first `limit` index entries of spec_docs
# fail the membership test against docs['l01'].
for limit in (5, 10, 15, 25, 50):
    spec_oo1 = sum(1 for doc_id in spec_docs.index[:limit] if doc_id not in docs['l01'])
    print("Nbr of docs in top {} not from 1st wave: {}".format(limit, spec_oo1))

Nbr of docs in top 5 not from 1st wave: 1
Nbr of docs in top 10 not from 1st wave: 1
Nbr of docs in top 15 not from 1st wave: 2
Nbr of docs in top 25 not from 1st wave: 5
Nbr of docs in top 50 not from 1st wave: 19


In [28]:
spec_docs[['title', 'summary', 'score']].head(10)

Unnamed: 0_level_0,title,summary,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36469,FBI refused White House request to knock down ...,The FBI rejected a recent White House request ...,1.0
15773,Trump aides spoke regularly to Russian officia...,High-level advisers close to then-presidential...,0.326538
966,Donald Trump Denounces ‘Un-American’ Intellige...,"<a href=""http://www.breitbart.com/big-governme...",0.319277
15552,US officials corroborate aspects of Russia dos...,"For the first time, US investigators say they ...",0.318796
15698,White House was warned Flynn could be blackmai...,The Justice Department warned the Trump admini...,0.256764
15880,Former Clinton aides apoplectic over Trump cam...,Hillary Clinton's former campaign aides are ap...,0.256621
672,CNN’s Acosta: Trump Only Calling on Conservati...,"<a href=""http://www.breitbart.com/video/2017/0...",0.245172
8145,"Dems see disparity in handling of Clinton, Rus...",WASHINGTON (AP) -- Democrats are critic...,0.227264
115,"Maxine Waters: Trump’s Administration, Associa...","<a href=""http://www.breitbart.com/video/2017/0...",0.218727
274,KLEIN – Trump-Russia Claims Rethought: How the...,"<a href=""http://www.breitbart.com/jerusalem/20...",0.204384


In [29]:
spec_dt_url = score_table_prep(spec_docs, filename="images/spec_docs_table.html")

In [30]:
ig.vs[0]

igraph.Vertex(<igraph.Graph object at 0x7f7e85f294f8>, 0, {'name': '0', 'label': 'Winners: Tom Brady’s Donald Trump Friendship in Spotlight as Super Bowl Looms', 'group': 0})

In [34]:
reload(graph_plot_utils)

<module 'graph_plot_utils' from '/home/immersinn/gits/rssfeed_link_collector/src/graph_plot_utils.py'>

In [35]:
fig = graph_plot_utils.create_graph_fig(ig, show_groups=True)

In [36]:
iplot(fig)

In [37]:
# Save the graph figure as a standalone HTML file and keep the returned location
# minus its first seven characters (presumably the "file://" scheme — confirm).
graph_plot_url = plot(
    fig,
    filename='images/spec23_groupsGraph.html',
    auto_open=False,
)[7:]

## Main Article

In [None]:
orig_article_title = doc['title']
orig_article_content = doc['summary']

In [None]:
wil01 = word_info['l01']
wil01.head()

## Level 1 Information

In [None]:
qwl01 = mysql_utils.query_idWordLookup(words['l01'])
qwl01

In [None]:
nl01_docs = len(docs['l01'])

In [None]:
wil02 = word_info['l02']
wil02.head()

In [None]:
# NOTE(review): `words_all` is not assigned in any cell visible in this notebook, so
# unless it is defined elsewhere this cell fails on a fresh kernel.
# `set([words_all])` builds a one-element set holding `words_all` itself rather than
# its items — confirm that this is intended.
set(bow.keys()).intersection(set([words_all]))

In [None]:
# Keys of `bow` that also occur in `words_all`.
# NOTE(review): `words_all` is not assigned in any cell visible in this notebook —
# define it before this cell or remove the cell.
set(bow.keys()).intersection(words_all)

## Generate HTML as String and Write to File

In [38]:
# Assemble the dashboard page as one string.
# Every interpolated value (article URL, title, summary, and the three iframe
# locations) goes through html.escape so that characters such as &, <, > or
# quotes in the data cannot break the markup or inject tags/attributes.
html_string = '''
<html>
    <head>
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
        <style>body{ margin:0 100; background:whitesmoke; }</style>
    </head>
    <body>
        <h1>Comparison of JMS and Spectral Graph Methods</h1>
        
        <!-- *** Section 1 *** --->
        <h2>Section 1: Document Overview</h2>
        
            
            <h4>Document Source:</h4>
                <a href="''' + html.escape(url) + '''">Link</a>
            <h4>Document Title:</h4>
                <p><b>''' + html.escape(doc['title']) + '''</b></p>
            <h4>Document Content:</h4>
                <p>"''' + html.escape(doc['summary']) + '''"</p>

        <!-- *** Section 3 *** --->
        <h2>Section 3: "Top Docs" Review</h2>
        
                
            <table width="100%">
                <tr>
                    <td width="50%"><h3>JMS Top Docs:</h3></td>
                    <td width="47%"><h3>Spectral Cluster Top Docs:</h3></td>
                </tr>
            </table>


            <iframe style="padding:40px" width="47%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="left"\
    src="''' + html.escape(jms_dt_url) + '''"></iframe>

            <iframe style="padding:40px" width="47%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="right"\
    src="''' + html.escape(spec_dt_url) + '''"></iframe>
    
            <h3>Spectral23 Communities Graph</h3>
    
            <iframe style="padding:40px" width="1000" height="1000" frameborder="0" seamless="seamless" scrolling="no"\
    src="''' + html.escape(graph_plot_url) + '''"></iframe>


    
    </body>
</html>'''

In [39]:
# Write the assembled page into the repository's dashboards folder.
# The context manager closes the file even if the write raises, which the
# manual open()/close() pair did not guarantee.
with open(os.path.join(main_repo_dir, 'reports/dashboards/MethCompare.html'), 'w') as f:
    f.write(html_string)