In [1]:
import sys
import os

In [2]:
import time
import datetime

In [3]:
import numpy
import pandas

In [4]:
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import figure_factory as FF
import plotly.graph_objs as pogo
from plotly.graph_objs import Marker, Line, Data

init_notebook_mode(connected=True)

In [5]:
from IPython.display import display, HTML

In [6]:
# Repository root = two directory levels above the current working directory.
main_repo_dir = os.path.abspath('../..')
# Make the project's ``src`` directory importable for the cells below.
sys.path.append(os.path.join(main_repo_dir, 'src'))

In [7]:
import utils
import mysql_utils

## Create Start, End Dates

In [8]:
# Today's date (UTC)
cur = datetime.datetime.utcnow().date()

# Days elapsed since the most recent Sunday (Sunday -> 0, Monday -> 1, ...)
wkday = (cur.weekday() + 1) % 7
# One extra week back, so the window starts on the Sunday before that one
shift = 7 + wkday

# Window is [start_date, end_date): midnight on that Sunday, plus seven days
start_date = datetime.datetime.combine(cur - datetime.timedelta(days=shift), datetime.time())
end_date = start_date + datetime.timedelta(weeks=1)

# Both bounds are handed on as strings; week_of_date is the date part only
start_date, end_date = (bound.strftime("%Y-%m-%d %H:%M:%S") for bound in (start_date, end_date))
week_of_date = start_date.split()[0]

print(start_date, end_date, week_of_date, sep="\n")

2017-03-12 00:00:00
2017-03-19 00:00:00
2017-03-12


## Query Data from MySQL

In [9]:
# Fetch the documents for the reporting window computed above; both bounds
# are passed as "%Y-%m-%d %H:%M:%S" strings.
# NOTE(review): presumably the result is a DataFrame with at least 'title',
# 'rss_link' and 'published' columns (they are read below) -- confirm in mysql_utils.
df = mysql_utils.query_docsByDatetime(start_dt=start_date, end_dt=end_date)

In [10]:
# Flag duplicates: 'ind' is 1 on the first row carrying a given title and 0 on
# every later row that repeats it.
# Series.duplicated() marks values already seen earlier in the column, so its
# negation is the "first occurrence" mask. This replaces a per-row Python loop
# built on the DataFrame.ix indexer, which no longer exists in current pandas.
df['ind'] = (~df['title'].duplicated()).astype('int64')

In [11]:
# Fix RSS Feed links
def repair_rsslink(rss_link):
    """Upgrade a phys.org RSS feed link from http to https.

    Links that do not start with 'http://phys.org/rss-feed/' are returned
    unchanged; for the others only the scheme part of the prefix changes and
    the remainder of the link is kept as is.
    """
    legacy_prefix = 'http://phys.org/rss-feed/'
    if not rss_link.startswith(legacy_prefix):
        return(rss_link)

    remainder = rss_link[len(legacy_prefix):]
    return('https://phys.org/rss-feed/' + remainder)

In [12]:
# Repair every link in the column. Applying over the column (Series.apply)
# avoids building a row object per record and still yields a Series when the
# frame is empty, which the row-wise DataFrame.apply(axis=1) form does not.
df['rss_link'] = df['rss_link'].apply(repair_rsslink)

## Counts

In [13]:
# Get total counts
# 'ind' is 1 for the first row of each title and 0 for repeats (set in the
# duplicate-flagging cell), so the sum is the number of distinct titles.
total_new_articles = df.ind.sum()
total_new_articles

7522

### Table 01: By RSS Feed

Counts by RSS Feed

In [14]:
# Number of rows per feed link. 'count' tallies every non-null 'ind' entry, so
# rows flagged 0 (repeated titles) are included. agg(['count']) produces a
# two-level column ('ind', 'count'), which the next cell flattens.
res = df[['ind', 'rss_link']].groupby(['rss_link']).agg(['count'])

In [15]:
# Flatten the grouped result into a plain two-column frame: the group labels
# become 'rss_link' and the tallies become 'count'.
per_feed = res['ind']
res = pandas.DataFrame(
    data={
        'rss_link': list(per_feed.index),
        'count': list(per_feed['count']),
    }
)

In [16]:
# Render the per-feed counts as a Plotly table and save it as a standalone page.
summary_table_1 = FF.create_table(res[['rss_link', 'count']])

first_table_url = plot(summary_table_1, filename='images/counts_by_rss.html', auto_open=False,)
# The report embeds a plain path, so drop a leading 'file://' scheme when
# plot() returned one. A value without the scheme is kept intact instead of
# blindly losing its first seven characters.
if first_table_url.startswith('file://'):
    first_table_url = first_table_url[len('file://'):]

### Table 02: By Topic

Counts by Topic (not very refined at the moment)

In [17]:
# Load the feed definitions from the project's utils module.
# NOTE(review): the next cell reads feed['Link'] and feed['RssName'], so this is
# presumably an iterable of dict-like records -- confirm in utils.
feeds = utils.load_feedlists_data()

In [18]:
# Lookup table: feed link -> feed name (a later duplicate link overwrites an earlier one).
feed_topic_lu = dict((feed['Link'], feed['RssName']) for feed in feeds)

In [19]:
# Attach the feed name to every row via the link lookup. A link missing from
# feed_topic_lu still raises KeyError, exactly as before. The lookup now runs
# over the column (Series.apply) rather than row by row over the whole frame,
# and therefore also works when the frame has no rows.
df['name'] = df['rss_link'].apply(lambda link: feed_topic_lu[link])

In [20]:
def topic_from_name(name):
    """Collapse an RSS feed name into a coarse topic label.

    The name is lower-cased and then checked against keyword rules in a fixed
    order. Each rule is tested against the *current* value, so once a rule has
    replaced the name with a label, later rules only fire if that label itself
    contains their keyword. Names matching no rule are returned lower-cased.

    Parameters
    ----------
    name : str
        Feed name.

    Returns
    -------
    str
        One of 'business', 'finance', 'economy', 'science', 'technology',
        'transportation', 'travel', or the lower-cased input.
    """
    # (keywords, label) pairs, in evaluation order.
    rules = (
        (('business',), 'business'),
        (('financ',), 'finance'),
        (('econ', 'market', 'currenc'), 'economy'),
        (('science',), 'science'),
        (('tech',), 'technology'),
        # Previously assigned to a misspelled variable, so this label was never applied.
        (('transport',), 'transportation'),
        (('travel',), 'travel'),
    )

    topic = name.lower()
    for keywords, label in rules:
        if any(keyword in topic for keyword in keywords):
            topic = label

    return(topic)

In [21]:
# Derive the topic from each feed name. Applying over the 'name' column
# (Series.apply) avoids the per-row overhead of DataFrame.apply(axis=1) and
# still returns a Series when the frame has no rows.
df['topic'] = df['name'].apply(topic_from_name)

In [22]:
# NOTE(review): this replaces the 0/1 first-occurrence flag stored in 'ind' by
# the duplicate-flagging cell with a constant 1. Every later read of
# df['ind'] (e.g. the time-series cell) therefore sees all ones, and re-running
# the total-count cell after this one returns the row count instead.
df['ind'] = 1
# Rows per topic; agg(['count']) yields a two-level column ('ind', 'count').
topic = df[['ind', 'topic']].groupby(['topic']).agg(['count'])
# Flatten into a plain frame with 'topic' and 'count' columns.
topic = pandas.DataFrame(data = {'topic' : list(topic['ind'].index),
                                 'count' : list(topic['ind']['count'])}
                      )

In [23]:
# Render the per-topic counts as a Plotly table and save it as a standalone page.
summary_table_2 = FF.create_table(topic[['topic', 'count']])

second_table_url = plot(summary_table_2, filename='images/counts_by_topic.html', auto_open=False,)
# The report embeds a plain path, so drop a leading 'file://' scheme when
# plot() returned one. A value without the scheme is kept intact instead of
# blindly losing its first seven characters.
if second_table_url.startswith('file://'):
    second_table_url = second_table_url[len('file://'):]

## Feed Trends with Time Series

In [24]:
# From: http://moderndata.plot.ly/generate-html-reports-with-python-pandas-and-plotly/
def moving_average(interval, window_size):
    """Smooth ``interval`` with a flat averaging kernel.

    The kernel has ``int(window_size)`` equal weights of ``1 / window_size``.
    The convolution uses numpy's 'same' mode, so the result has the length of
    the longer of the two inputs; near both ends the kernel overhangs the data,
    which is why callers may want to discard the first and last few points.
    """
    kernel = numpy.ones(int(window_size))
    kernel /= float(window_size)
    return numpy.convolve(interval, kernel, mode='same')

In [25]:
# One row per article, indexed by its 'published' value and ordered by that index.
feed_times = pandas.DataFrame(
    data={'val': list(df['ind'])},
    index=df.published,
).sort_index()

In [26]:
# Sum 'val' into hourly buckets. kind='period' asks resample for a period-based
# index rather than timestamps.
feed_times_hour = feed_times.resample('H', kind='period').sum()
# Convert each period back to a timestamp (Period.to_timestamp) so the index is
# a DatetimeIndex again for plotting.
feed_times_hour.index = pandas.DatetimeIndex([p.to_timestamp() for p in list(feed_times_hour.index)])

In [27]:
# Smooth the hourly counts with a 6-sample window; the rows are hourly
# buckets, so this is a 6-hour moving average.
feed_times_hour['ma'] = moving_average(feed_times_hour.val, 6)

In [28]:
# Raw hourly counts, drawn as individual markers.
xy_data = pogo.Scatter( x=feed_times_hour.index, y=feed_times_hour.val,
                  mode='markers', marker=Marker(size=8),
                  name='Hourly Count')
# vvv clip first and last points of convolution
# (the first 5 and last 4 rows are dropped from the smoothed trace; the
# averaging window used for 'ma' is 6 samples wide)
mov_avg = pogo.Scatter( x=feed_times_hour.index[5:-4], y=feed_times_hour.ma[5:-4], \
                  line=Line(width=2,color='red'), name='Moving average')
# Both traces share one figure.
data = Data([xy_data, mov_avg])

# Show the figure inline in the notebook.
iplot(data, filename='hourly feed count moving avg')

In [29]:
# Save the hourly-count figure as a standalone page for the HTML report.
first_plot_url = plot(data, filename='images/hourly feed count moving avg.html', auto_open=False,)
# The report embeds a plain path, so drop a leading 'file://' scheme when
# plot() returned one. A value without the scheme is kept intact instead of
# blindly losing its first seven characters.
if first_plot_url.startswith('file://'):
    first_plot_url = first_plot_url[len('file://'):]

## Log Info

In [30]:
# Read the run log from the repository root, one list element per line
# (line endings are kept).
with open(os.path.join(main_repo_dir, "main_run.log"), "r") as f:
    log = [raw_line for raw_line in f]

In [31]:
# Split each log line into its timestamp (first three whitespace-separated
# tokens) and its message (everything after them).
# Blank lines are skipped: they carry no timestamp and would otherwise produce
# an empty string that cannot be parsed as a date. Building the two tuples
# separately also keeps an empty log from failing on tuple unpacking.
log_tokens = [line.split() for line in log if line.strip()]
dates = tuple(' '.join(tokens[:3]) for tokens in log_tokens)
entries = tuple(' '.join(tokens[3:]) for tokens in log_tokens)

In [32]:
# Parse the timestamp strings: month/day/4-digit-year, then a 12-hour clock
# time followed by an AM/PM marker.
dates = [datetime.datetime.strptime(date, '%m/%d/%Y %I:%M:%S %p') for date in dates]

In [33]:
# From here on 'log' is a DataFrame (it was the list of raw lines before):
# one 'entry' column of message texts, indexed by the parsed timestamps.
log = pandas.DataFrame(data = {'entry' : entries}, index=dates)

In [34]:
# Keep only log rows inside the reporting window: start inclusive, end exclusive.
in_report_window = (log.index >= start_date) & (log.index < end_date)
log = log.loc[in_report_window]

In [35]:
# Calendar day of each log row as a 'YYYY-MM-DD' string, used as a grouping key below.
log['day'] = [ent.strftime('%Y-%m-%d') for ent in log.index]

In [36]:
# Define which entry we're in
# A run starts at each log line whose text is exactly this marker.
RUN_START_ENTRY = "Retrieving contents..."
entry_ind = -1

def update_entry(x):
    """Return the index of the run that log row ``x`` belongs to.

    Stateful: bumps the module-level ``entry_ind`` counter whenever the row's
    'entry' equals the run-start marker, then returns the counter. The result
    therefore depends on the order and number of calls. Kept for
    compatibility; the column below is computed without it.
    """
    global entry_ind
    if x['entry'] == RUN_START_ENTRY:
        entry_ind += 1
    return(entry_ind)

# Running count of start markers seen so far, minus one: rows before the first
# marker get -1, the first run gets 0, and so on. This matches what the counter
# produces on one ordered pass, without relying on how often or in which
# order DataFrame.apply invokes a side-effecting function.
log['entry_ind'] = (log['entry'] == RUN_START_ENTRY).cumsum() - 1

# Leave the module-level counter at the last run index, as a full pass would.
if len(log):
    entry_ind = int(log['entry_ind'].iloc[-1])

In [37]:
# Flag Errors
def flag_error(x):
    """Tell whether a log row reports a problem.

    ``x`` is a row exposing an ``entry`` text attribute. The check is a
    case-insensitive substring search for 'warning' or 'error'. Returns a
    plain bool.
    """
    lowered = x.entry.lower()
    return('warning' in lowered or 'error' in lowered)
    
# Evaluate the error test on every log row.
log['is_error'] = log.apply(flag_error, axis=1)

In [38]:
log.head()

Unnamed: 0,entry,day,entry_ind,is_error
2017-03-13 05:00:02,Retrieving contents...,2017-03-13,0,False
2017-03-13 05:00:02,Total feeedss to visit: 250,2017-03-13,0,False
2017-03-13 05:00:02,Creating 8 Tor openers...,2017-03-13,0,False
2017-03-13 05:00:07,IP Used: b'77.247.181.163',2017-03-13,0,False
2017-03-13 05:00:09,Total entries retrieved from https://www.thegu...,2017-03-13,0,False


In [39]:
log.tail()

Unnamed: 0,entry,day,entry_ind,is_error
2017-03-17 17:09:54,Total new links added: 0,2017-03-17,65,False
2017-03-17 17:09:57,No contents found,2017-03-17,65,False
2017-03-17 17:09:58,Total entries retrieved from http://www.ft.com...,2017-03-17,65,False
2017-03-17 17:09:59,Total new links added: 0,2017-03-17,65,False
2017-03-17 17:09:59,Total new links added from all feeds: 70,2017-03-17,65,False


In [40]:
# Distinct calendar days and distinct run indices present in the filtered log.
total_days = log['day'].unique().shape[0]
total_runs = log['entry_ind'].unique().shape[0]

### Time for each Pull

In [41]:
# One group per (calendar day, run index) pair.
day_entry_groups = log.groupby(['day', 'entry_ind'])

In [42]:
# Duration of each group in seconds: last timestamp minus first timestamp of
# the group's rows (rows keep the log's order). Wrapped in a single-column frame.
times = pandas.DataFrame(day_entry_groups.apply(lambda x: (x.index[-1] - x.index[0]).total_seconds()))

In [43]:
# One box trace per day, built from that day's run durations.
data_times = []
for day in times.index.get_level_values('day').unique():
    # .loc selects every (day, entry_ind) row of this day (the .ix indexer this
    # replaces no longer exists in current pandas). 'times' has a single
    # column, so flattening gives one duration per run.
    subd = times.loc[day].values
    trace = pogo.Box(
                     y=list(subd.ravel()),
                     name=str(day)
                    )
    data_times.append(trace)

In [44]:
# Layout for the run-time box plot: one box per day on x, seconds on y.
layout = pogo.Layout(
    title='Run Times by Day',
    xaxis=dict(
        # Tick label pattern: abbreviated month, day of month, 4-digit year.
        tickformat = "%b %e %Y",
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Time (seconds)',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
)

# Combine the per-day box traces with the layout and show the figure inline.
dt_fig = pogo.Figure(data=data_times, layout=layout)
iplot(dt_fig)

In [45]:
# Save the run-time box plot as a standalone page for the HTML report.
run_times_plot_url = plot(dt_fig, filename='images/run times box plot.html', auto_open=False,)
# The report embeds a plain path, so drop a leading 'file://' scheme when
# plot() returned one. A value without the scheme is kept intact instead of
# blindly losing its first seven characters.
if run_times_plot_url.startswith('file://'):
    run_times_plot_url = run_times_plot_url[len('file://'):]

### Errors Info

In [46]:
# Group the log rows by calendar day.
day_groups = log.groupby(['day'])

In [47]:
# attempts per day
# (number of distinct run indices seen among each day's log rows)
day_groups.apply(lambda x: len(x.entry_ind.unique()))

day
2017-03-13    14
2017-03-14    13
2017-03-15    13
2017-03-16    13
2017-03-17    13
dtype: int64

In [48]:
# Errors per day
# (summing the boolean 'is_error' flags counts the flagged rows of each day)
epd = day_groups.apply(lambda x: sum(x.is_error))

In [49]:
# One bar per day with the number of flagged log rows.
g = pogo.Bar(x=list(epd.index), y=epd.values)

# NOTE: 'data' is rebound here; it previously held the hourly-count traces.
data = pogo.Data([g])

# Shown inline only; this figure is not saved for the HTML report.
iplot(data, filename='errors per day')

#### Breakdown of Types

In [50]:
def extract_err_class(error_text):
    """Pull the exception class token out of a log message.

    Looks for text of the form ``<class '...'>:`` and returns the part from
    ``<class `` up to and including the closing ``'>`` (the trailing colon is
    dropped). Returns an empty string when no such token is present.

    The terminator is searched for only after the opening ``<class `` marker,
    so a stray ``'>:`` earlier in the message cannot cut the result short.
    """
    start = error_text.find('<class ')
    if start == -1:
        return('')

    end = error_text.find("'>:", start)
    if end == -1:
        return('')

    # end points at the quote; +2 keeps the closing "'>".
    return(error_text[start : (end + 2)])

In [51]:
# Exception class token of every log message ('' where none is present).
log['err_class'] = log.entry.apply(extract_err_class)

In [52]:
# Only the flagged rows, grouped by (exception class, calendar day).
day_err_groups = log.loc[log.is_error].groupby(['err_class', 'day'])

In [53]:
# Number of flagged rows per (exception class, day), as a single-column frame.
err_counts = pandas.DataFrame(day_err_groups.apply(len))

In [54]:
# Every day that has log rows, and every exception class that was counted.
days = times.index.get_level_values('day').unique()
errors = err_counts.index.get_level_values('err_class').unique()

# Add '0's where needed to prevent missing dates / data in plot
current = set(err_counts.index.values)

for e in errors:
    for d in days:
        if (e,d) not in current:
            # Label-based assignment to a missing (class, day) key appends a
            # new all-zero row; .loc replaces the .ix indexer, which no
            # longer exists in current pandas.
            err_counts.loc[(e,d),:] = 0

In [55]:
err_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,0
err_class,day,Unnamed: 2_level_1
<class 'UnboundLocalError'>,2017-03-13,29.0
<class 'mysql.connector.errors.DataError'>,2017-03-13,29.0
<class 'urllib.error.HTTPError'>,2017-03-13,29.0
<class 'urllib.error.HTTPError'>,2017-03-14,4.0
<class 'UnboundLocalError'>,2017-03-14,0.0
<class 'UnboundLocalError'>,2017-03-15,0.0
<class 'UnboundLocalError'>,2017-03-16,0.0
<class 'UnboundLocalError'>,2017-03-17,0.0
<class 'mysql.connector.errors.DataError'>,2017-03-14,0.0
<class 'mysql.connector.errors.DataError'>,2017-03-15,0.0


In [56]:
data_ecs = []
for et in err_counts.index.get_level_values('err_class').unique():
    # Rows of this error type, indexed by day (.loc replaces the .ix indexer,
    # which no longer exists in current pandas).
    per_day = err_counts.loc[et]
    subd = per_day.values
    # Legend label: the text between the first pair of single quotes, e.g.
    # "<class 'urllib.error.HTTPError'>" -> "urllib.error.HTTPError". Flagged
    # entries without a class token have an empty err_class; label those
    # explicitly instead of failing on the missing quote.
    if "'" in et:
        label = et.split("'")[1]
    else:
        label = et or 'unclassified'
    # Each trace is an error type
    # "x" is the day
    trace = pogo.Bar(
                     x=[str(e) for e in per_day.index],
                     y=list(subd.ravel()),
                     name=label
                    )
    data_ecs.append(trace)

In [57]:
# Layout for the stacked error-count chart: days on x, counts on y, one
# stacked segment per error type.
layout = pogo.Layout(
    title='Error Counts by Day, Breakdown by Type',
    xaxis=dict(
        # Tick label pattern: abbreviated month, day of month, 4-digit year.
        tickformat = "%b %e %Y",
        # Explicit tick positions, one per day. These belong in 'tickvals';
        # the axis 'ticks' attribute only accepts a tick-placement string
        # (outside / inside / empty), not a list of values.
        tickvals = [str(d) for d in err_counts.index.get_level_values('day').unique()],
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        ),
    ),
    yaxis=dict(
        title='Error Counts',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    # Legend anchored at the top-left corner of the plotting area.
    legend=dict(
        x=0,
        y=1.0
    ),
    barmode='stack'
)

# Combine the per-type bar traces with the layout and show the figure inline.
ec_fig = pogo.Figure(data=data_ecs, layout=layout)
iplot(ec_fig, filename='stacked-bar')

In [58]:
# Save the stacked error chart as a standalone page for the HTML report.
err_counts_plot_url = plot(ec_fig, filename='images/error counts stacked.html', auto_open=False,)
# The report embeds a plain path, so drop a leading 'file://' scheme when
# plot() returned one. A value without the scheme is kept intact instead of
# blindly losing its first seven characters.
if err_counts_plot_url.startswith('file://'):
    err_counts_plot_url = err_counts_plot_url[len('file://'):]

## Generate HTML as a String and Write to File

In [59]:
# Assemble the weekly report as a single HTML string. week_of_date and
# total_new_articles are concatenated straight into the markup, and each saved
# Plotly page is embedded through an <iframe> whose src is one of the *_url
# values computed above. A backslash at the end of a line inside the literal
# joins it with the next line, keeping each <iframe ...> tag on one line.
html_string = '''
<html>
    <head>
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
        <style>body{ margin:0 100; background:whitesmoke; }</style>
    </head>
    <body>
        <h1>Feed Summary Information for Week of ''' + week_of_date + '''</h1>

        <!-- *** Section 1 *** --->
        <h2>Section 1: Hourly Feed Count for Week</h2>
            <iframe width="1000" height="550" frameborder="0" seamless="seamless" scrolling="no"\
    src="''' + first_plot_url + '''"></iframe>
            <p>blah blah blah.</p>

            <p>Comments can be inserted here</p>
        
        <!-- *** Section 2 *** --->
        <h2>Section 2: Feed Summary Information</h2>
        
            <h4>Total new stories for the week: ''' + str(total_new_articles) + '''</h4>
                
            <table width="100%">
                <tr>
                    <td width="65%"><h3>Counts by Feed:</h3></td>
                    <td width="30%"><h3>Counts by Topic:</h3></td>
                </tr>
            </table>


            <iframe style="padding:40px" width="65%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="left"\
    src="''' + first_table_url + '''"></iframe>

            <iframe style="padding:40px" width="30%" height="480" frameborder="0" seamless="seamless" scrolling="yes" align="right"\
    src="''' + second_table_url + '''"></iframe>


        <!-- *** Section 3 *** --->
        <h2>Section 3: Log Info</h2>
        
            <h3>Run Times</h3>
            
            <iframe style="padding:40px" width="1000" height="600" frameborder="0" seamless="seamless" scrolling="no"\
    src="''' + run_times_plot_url + '''"></iframe>
            
            <h3>Error Types Encountered</h3>
            
            <iframe style="padding:40px" width="1000" height="600" frameborder="0" seamless="seamless" scrolling="no"\
    src="''' + err_counts_plot_url + '''"></iframe>
    
    </body>
</html>'''

In [60]:
# Write the assembled report into the repository's dashboards folder. The
# context manager closes the file even if the write raises, so a partial
# failure cannot leave the handle open.
with open(os.path.join(main_repo_dir,'reports/dashboards/RSSFeedWeekly.html'),'w') as f:
    f.write(html_string)