# Community Sentiment
__CONFIGURATION:__ You will need to enter an access token from [GitHub](https://github.com/settings/tokens) in the configuration cell below.

This dashboard tries to describe the sentiment of an open source project on GitHub. A project can have elements of both positive and negative sentiment. The elements that count as positive and negative sentiment are listed below.

__Positive__
* When an issue is closed
* When a pull request is closed

__Negative__
* When an issue has been idle (no activity or comments) within a given time frame.
* When a pull request has been idle (no activity or comments) within a given time frame.

<div class="alert alert-info" role="alert" style="margin-top: 10px">
<strong>Note:</strong> Requires Declarative Widgets v0.6.0+.
</div>

## Configuration

In [None]:
# Personal access token used to authenticate GitHub API requests; replace the
# placeholder with a real token before running the notebook.
access_token = 'XXXXXXXXXXXXXXXXXXXXX'

# Base URL template; the placeholder receives the resource path.
github_rest_endpoint = 'https://api.github.com{}'

# Issues collection of a repository: {0} = organization, {1} = repository.
github_repo_issues_endpoint = github_rest_endpoint.format('/repos/{0}/{1}/issues')

# Sub-resources of a single issue: {2} = issue number.
github_repo_issue_events_endpoint = github_repo_issues_endpoint + '/{2}/events'
github_repo_issue_comments_endpoint = github_repo_issues_endpoint + '/{2}/comments'

## Setup
Import modules and web components, and set up constants.

In [None]:
import json, requests
import re
from datetime import datetime, timedelta
import dateutil.parser
import os
import pickle
from declarativewidgets import channel
import pandas as pd
from pandas import DataFrame
import numpy as np
import time

In [None]:
import declarativewidgets as widgets

# Initialise declarative widgets once, before any later cell relies on the
# urth-core elements or on `channel()`.
# NOTE(review): presumably this wires up the kernel side of the extension --
# confirm against the declarativewidgets documentation.
widgets.init()

In [None]:
# Captures the URL tagged rel="last" in a GitHub `Link` response header.
# Raw strings are used so the patterns contain no invalid escape sequences.
last_page_regex = re.compile(r'<([^>]*)>; rel="last"')

# Captures the value of the `page` query parameter of a URL. The parameter
# must directly follow `?` or `&`, so `per_page=` is never matched.
page_number_regex = re.compile(r'[&?]page=(\d+)')

# Absolute path of the on-disk cache, resolved against the working directory
# at the time this cell runs.
cache_dir = os.path.join(os.getcwd(), '.dashboard_cache')

In [None]:
%%html
<link rel='import' href='urth_components/urth-viz-chart/urth-viz-chart.html' is='urth-core-import'>
<link rel='import' href='urth_components/paper-slider/paper-slider.html' package='PolymerElements/paper-slider' is='urth-core-import'>
<link rel='import' href='urth_components/paper-button/paper-button.html' package='PolymerElements/paper-button' is='urth-core-import'>
<link rel='import' href='urth_components/paper-progress/paper-progress.html' package='PolymerElements/paper-progress' is='urth-core-import'>
<link rel='import' href='urth_components/paper-toggle-button/paper-toggle-button.html' package='PolymerElements/paper-toggle-button' is='urth-core-import'>

<!-- Custom Widgets !-->
<link rel='import' is='urth-core-import' package='/home/jovyan/work/community_sentiment_demo/widgets/urth-timeline'
    href='urth_components/urth-timeline/urth-timeline.html'>

## Data Caching
To avoid rate limits, we will cache data to the file system.

In [None]:
def make_cache_dirs(org, repo):
    '''Ensures the cache directory for a github repo exists.

    The directory is `.dashboard_cache/github/<org>/<repo>`, relative to the
    current working directory. Nothing happens if it already exists.

    Args:
        org: The github organization name
        repo: The github repository name
    '''
    repo_cache_dir = os.path.join('.dashboard_cache', 'github', org, repo)
    os.makedirs(repo_cache_dir, exist_ok=True)

    
def _read_cache_file(path):
    '''Returns the cache entry pickled at path, or an empty entry if the file is missing.'''
    if not os.path.exists(path):
        return {'cache_date': None, 'data': []}
    # NOTE: unpickling can execute arbitrary code. Only load cache files that
    # were written by this notebook itself.
    with open(path, 'rb') as cache_file:
        return pickle.load(cache_file)


def load_cache(org, repo):
    '''Loads the cached issues, events and comments of a github repo.

    Each of the three cache files is checked on its own, so a partially
    written cache (e.g. only the issues file exists) yields empty entries for
    the missing parts instead of failing.

    Args:
        org: The github organization name
        repo: The github repository name

    Returns:
        cached_issues, cached_events, cached_comments: dictionaries with the
        keys 'cache_date' (None when nothing is cached) and 'data' (a list).
    '''
    # Same directory that make_cache_dirs creates; created here so that the
    # existence checks below never run against a missing parent directory.
    repo_cache_dir = os.path.join('.dashboard_cache', 'github', org, repo)
    os.makedirs(repo_cache_dir, exist_ok=True)

    cached_issues = _read_cache_file(os.path.join(repo_cache_dir, 'issues'))
    cached_events = _read_cache_file(os.path.join(repo_cache_dir, 'events'))
    cached_comments = _read_cache_file(os.path.join(repo_cache_dir, 'comments'))
    return cached_issues, cached_events, cached_comments

def cache_data(org, repo, **kwargs):
    '''Writes collections for a github repo to the file system cache.

    Every keyword argument is pickled to its own file, named after the
    keyword, under `.dashboard_cache/github/<org>/<repo>`. Each file holds a
    dictionary with the keys 'cache_date' and 'data'.

    Args:
        org: The github organization name
        repo: The github repository name
        **kwargs: name/value pairs to cache (e.g. issues=..., events=...)
    '''
    from datetime import timezone

    repo_cache_dir = os.path.join('.dashboard_cache', 'github', org, repo)
    os.makedirs(repo_cache_dir, exist_ok=True)

    # The trailing 'Z' declares the stamp to be UTC, so it has to be taken
    # from the UTC clock. A local-time stamp would be hours off and, when
    # reused as the `since` filter, could skip issues updated in that gap.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    cache_date = utc_now.isoformat() + 'Z'

    for key, value in kwargs.items():
        file_to_save = os.path.join(repo_cache_dir, key)
        # The context manager flushes and closes each file deterministically.
        with open(file_to_save, 'wb') as cache_file:
            pickle.dump({'cache_date': cache_date, 'data': value}, cache_file)

In [None]:
def parse_date(date_string):
    '''Converts an ISO date string into a naive datetime expressed in UTC.

    Args:
        date_string: An ISO date string, or None

    Returns:
        A datetime without tzinfo holding the UTC time, or np.nan when
        date_string is None.
    '''
    if date_string is not None:
        parsed = dateutil.parser.parse(date_string)
        # Going through the POSIX timestamp normalises any UTC offset.
        epoch_seconds = parsed.timestamp()
        return datetime.utcfromtimestamp(epoch_seconds)
    return np.nan

# Github

In [None]:
def _github_paged_collection(url, params, page=1, track_progress=True):
    '''Returns a collection from github by iterating over a paged resource.

    Pages are requested 100 items at a time, sorted by update time in
    ascending order, until github reports no further page.

    Args:
        url: The github endpoint of the collection
        params: A dictionary of extra query parameters; it is not modified
        page: The first page to request
        track_progress: When True, publishes 'progress-end' (number of the last
            page) and 'progress' (current page) on the channel for multi-page
            collections. When False, the channel is left untouched.

    Returns:
        An array of dictionaries

    Raises:
        requests.HTTPError: if github answers with an error status (bad
            credentials, rate limit, unknown repo, ...)
    '''
    # Work on a copy so that the caller's dictionary is left as it was.
    query = dict(params)
    query['per_page'] = 100
    query['direction'] = 'asc'
    query['sort'] = 'updated'

    # Github no longer accepts tokens passed as a query parameter; the token
    # has to be sent in the Authorization header.
    headers = {'Authorization': 'token {}'.format(access_token)}

    collected = []
    # Iterate rather than recurse, so large collections cannot exhaust the
    # recursion limit.
    while True:
        query['page'] = page
        response = requests.get(url=url, params=query, headers=headers)
        # Fail loudly here instead of returning the error payload as if it
        # were data.
        response.raise_for_status()
        values = json.loads(response.text)
        collected.extend(values)

        # Without a Link header the collection fits in a single page.
        if 'Link' not in response.headers or len(values) == 0:
            break

        if track_progress:
            if page == 1:
                last_page_links = last_page_regex.findall(response.headers['Link'])
                if last_page_links:
                    last_pages = page_number_regex.findall(last_page_links[0])
                    if last_pages:
                        channel().set('progress-end', int(last_pages[0]))
            # Set the progress to the page
            channel().set('progress', page)

        # Stop on the last page instead of requesting one more (empty) page.
        if 'next' not in response.links:
            break
        page += 1

    return collected

def get_github_issue_comments(org, repo, issue):
    '''Returns all of the comments, as an array of dictionaries, for a github issue

    Args:
        org: The github organization name
        repo: The github repository name
        issue: The number of the issue

    Returns:
        An array of dictionaries, one per comment
    '''
    url = github_repo_issue_comments_endpoint.format(org, repo, issue)
    # Page tracking is turned off, as it is for issue events: while comments
    # are collected the progress bar counts issues, and a multi-page comment
    # list would otherwise overwrite 'progress-end' with its own page count.
    return _github_paged_collection(url, {}, track_progress=False)


def get_github_issue_events(org, repo, issue):
    '''Fetches every event of one github issue.

    Args:
        org: The github organization name
        repo: The github repository name
        issue: The number of the issue

    Returns:
        An array of dictionaries, one per event
    '''
    events_url = github_repo_issue_events_endpoint.format(org, repo, issue)
    return _github_paged_collection(events_url, {}, track_progress=False)
        
def get_github_issues(org, repo, since=None):
    '''Fetches the issues of a github repo, whatever their state.

    Args:
        org: The github organization name
        repo: The github repository name
        since: Optional value for the `since` query parameter; it is only sent
            when truthy

    Returns:
        An array of dictionaries, one per issue
    '''
    query = {'state': 'all'}
    if since:
        query['since'] = since

    progress_message = 'Collecting issues for {}/{}'.format(org, repo)
    channel().set('progress-message', progress_message)

    issues_url = github_repo_issues_endpoint.format(org, repo)
    return _github_paged_collection(issues_url, query)

def load_github_data(org, repo):
    '''Loads a github repo's issues, events, and comments for a github repo.

    Only issues updated since the last cache date are requested from github.
    Their events and comments are fetched again and replace the cached
    records of the same issue number; all other cached records are reused.
    The merged result is written back to the cache.

    Progress is published on the channel through 'show-progress', 'progress',
    'progress-end' and 'progress-message'.

    Args:
        org: The github organization name
        repo: The github repository name

    Returns:
        issues, events, comments: The issues, events for all issues, and comments for all issues of the github repo
        as dataframes. Date columns hold naive UTC datetimes.
    '''
    channel().set('show-progress', True)
    channel().set('progress', 0)

    try:
        cached_issues, cached_events, cached_comments = load_cache(org, repo)
        issues = get_github_issues(org, repo, since=cached_issues['cache_date'])
        events = []
        comments = []

        total_issues = len(issues)
        if total_issues > 0:
            channel().set('progress-end', total_issues)
            # Count from 1 so the message reads "issue 1 of N" for the first issue.
            for position, issue in enumerate(issues, start=1):
                issue_number = issue['number']
                channel().set('progress-message', 'Getting info for issue {} of {}'.format(position, total_issues))

                # Tag every event/comment with its issue number so that it can
                # be joined with the issues later on.
                issue_events = get_github_issue_events(org, repo, issue_number)
                for event in issue_events:
                    event['number'] = issue_number
                events.extend(issue_events)

                issue_comments = get_github_issue_comments(org, repo, issue_number)
                for comment in issue_comments:
                    comment['number'] = issue_number
                comments.extend(issue_comments)

                channel().set('progress', position)

        # Cached records of re-fetched issues are stale and are dropped.
        # A set keeps the membership test constant-time per record.
        updated_issue_ids = {issue['number'] for issue in issues}
        issues = issues + [x for x in cached_issues['data'] if x['number'] not in updated_issue_ids]
        events = events + [x for x in cached_events['data'] if x['number'] not in updated_issue_ids]
        comments = comments + [x for x in cached_comments['data'] if x['number'] not in updated_issue_ids]
        cache_data(org, repo, events=events, issues=issues, comments=comments)

        issues_df = DataFrame(issues)
        issue_events_df = DataFrame(events)
        issue_comments_df = DataFrame(comments)

        # Parse the dates only if we have issues, events, or comments for the respective dataframe
        # and add any attributes used for filtering, so we can just pass the empty dataframe along
        if len(issues_df) > 0:
            issues_df.closed_at = issues_df.closed_at.apply(parse_date)
            issues_df.created_at = issues_df.created_at.apply(parse_date)
            issues_df.updated_at = issues_df.updated_at.apply(parse_date)
        else:
            issues_df['closed_at'] = datetime.utcfromtimestamp(0)
            issues_df['created_at'] = datetime.utcfromtimestamp(0)
            issues_df['updated_at'] = datetime.utcfromtimestamp(0)
            issues_df['number'] = 0

        # If no pull requests have been made then the pull request field will
        # not be in the dataframe and will break our filters
        if 'pull_request' not in issues_df:
            issues_df['pull_request'] = np.nan

        if len(issue_events_df) > 0:
            issue_events_df.created_at = issue_events_df.created_at.apply(parse_date)
        else:
            issue_events_df['created_at'] = datetime.utcfromtimestamp(0)
            issue_events_df['number'] = 0
            issue_events_df['event'] = ''

        if len(issue_comments_df) > 0:
            issue_comments_df.created_at = issue_comments_df.created_at.apply(parse_date)
        else:
            issue_comments_df['created_at'] = datetime.utcfromtimestamp(0)
            issue_comments_df['number'] = 0

        return issues_df, issue_events_df, issue_comments_df
    finally:
        # Hide the progress bar even when collecting the data failed.
        channel().set('show-progress', False)

## Filter/Classification Methods

In [None]:
def filter_pull_requests_items(items):
    '''Keeps only the items which are pull requests.

    An item counts as a pull request when its `pull_request` value is a dict.

    Args:
        items: a dataframe of github items

    Returns:
        A dataframe of pull requests. An empty dataframe is returned as is.
    '''
    if len(items) == 0:
        return items

    is_pull_request = items['pull_request'].apply(lambda value: type(value) is dict)
    return items[is_pull_request]

def filter_issue_items(items):
    '''Keeps only the items which are plain issues.

    An item counts as an issue when its `pull_request` value is not a dict and
    is NaN.

    Args:
        items: A dataframe of github items

    Returns:
        A dataframe of issues. An empty dataframe is returned as is.
    '''
    if len(items) == 0:
        return items

    def _is_plain_issue(value):
        return type(value) != dict and np.isnan(value)

    issue_mask = items['pull_request'].apply(_is_plain_issue)
    return items[issue_mask]

def filter_idle_items_during(items, events, comments, start_date, end_date):
    '''Keeps only the items that were idle between start_date and end_date.

    An item is dropped, i.e. not idle, when any of the following holds:
        It has an event strictly between start_date and end_date
        It has a 'closed' event at or before start_date
        It was created after end_date
        It has a comment strictly between start_date and end_date

    Args:
        items: A dataframe containing the items for a github repo
        events: A dataframe containing the events for a github repo
        comments: A dataframe containing the comments for items in a github repo
        start_date: A datetime to represent the beginning of the window to consider when items are idle
        end_date: A datetime to represent the end of the window to consider when items are idle

    Returns:
        A dataframe of items idle during the given time frame.
    '''
    # One row per (item, event) pair; items without events keep a single row
    merged = pd.merge(items, events, how='outer', on='number', suffixes=('_item', '_event'))

    # Drop every item that has at least one event inside the window
    event_in_window = (merged.created_at_event > start_date) & (merged.created_at_event < end_date)
    active_numbers = merged[event_in_window].drop_duplicates('number').number
    candidates = merged[~merged.number.isin(active_numbers)]

    # Drop items that were closed by the time the window starts
    closed_early = candidates[
        (candidates.event == 'closed') & (candidates.created_at_event <= start_date)
    ]
    candidates = candidates[~candidates.number.isin(closed_early.number)]

    # Drop items that did not exist yet when the window ended
    opened_late = candidates[candidates.created_at_item > end_date]
    candidates = candidates[~candidates.number.isin(opened_late.number)]

    # Drop items that received a comment inside the window
    comment_in_window = (comments.created_at > start_date) & (comments.created_at < end_date)
    commented_numbers = comments[comment_in_window].number
    candidates = candidates[~candidates.number.isin(commented_numbers)]

    idle_numbers = candidates.drop_duplicates('number').number
    return items[items.number.isin(idle_numbers)]


def filter_closed_items_during(items, events, start_date, end_date):
    '''Keeps only the items that were closed between start_date and end_date.

    An item qualifies when it has a 'closed' event strictly after start_date
    and strictly before end_date.

    Args:
        items: A dataframe containing the items for a github repo
        events: A dataframe containing the events for a github repo
        start_date: A datetime to represent the beginning of the window to consider when items are closed
        end_date: A datetime to represent the end of the window to consider when items are closed

    Returns:
        A dataframe of items closed during the given time frame.
    '''
    in_window = (events.created_at > start_date) & (events.created_at < end_date)
    is_close_event = events.event == 'closed'
    closed_numbers = events[in_window & is_close_event].number
    return items[items.number.isin(closed_numbers)]

## Data Access

In [None]:
# Issues
def get_closed_issues_during(items, events, start_date, end_date):
    '''Selects the issues (not pull requests) closed between start_date and end_date.

    Returns: a dataframe containing the closed issues during the given time frame.
    '''
    issues_only = filter_issue_items(items)
    return filter_closed_items_during(issues_only, events, start_date, end_date)

def get_closed_issue_count_during(items, events, start_date, end_date):
    '''Counts the issues closed between start_date and end_date.

    Returns: total number of issues closed between start_date and end_date.
    '''
    closed_issues = get_closed_issues_during(items, events, start_date, end_date)
    return len(closed_issues)

def get_idle_issues_during(items, events, comments, start_date, end_date):
    '''Selects the issues (not pull requests) idle between start_date and end_date.

    Returns: a dataframe containing the idle issues during the given time frame.
    '''
    issues_only = filter_issue_items(items)
    return filter_idle_items_during(issues_only, events, comments, start_date, end_date)

def get_idle_issue_count_during(items, events, comments, start_date, end_date):
    '''Counts the issues idle between start_date and end_date.

    Returns: total number of issues idle between start_date and end_date.
    '''
    idle_issues = get_idle_issues_during(items, events, comments, start_date, end_date)
    return len(idle_issues)

# Pull requests
def get_closed_pull_requests_during(items, events, start_date, end_date):
    '''Selects the pull requests closed between start_date and end_date.

    Returns: a dataframe containing the closed pull requests during the given time frame.
    '''
    pull_requests_only = filter_pull_requests_items(items)
    return filter_closed_items_during(pull_requests_only, events, start_date, end_date)

def get_closed_pull_request_count_during(items, events, start_date, end_date):
    '''Counts the pull requests closed between start_date and end_date.

    Returns: total number of pull requests closed between start_date and end_date.
    '''
    closed_pull_requests = get_closed_pull_requests_during(items, events, start_date, end_date)
    return len(closed_pull_requests)

def get_idle_pull_requests_during(items, events, comments, start_date, end_date):
    '''Selects the pull requests idle between start_date and end_date.

    Returns: a dataframe containing the idle pull requests during the given time frame.
    '''
    pull_requests_only = filter_pull_requests_items(items)
    return filter_idle_items_during(pull_requests_only, events, comments, start_date, end_date)

def get_idle_pull_request_count_during(items, events, comments, start_date, end_date):
    '''Counts the pull requests idle between start_date and end_date.

    Returns: total number of pull requests idle between start_date and end_date.
    '''
    idle_pull_requests = get_idle_pull_requests_during(items, events, comments, start_date, end_date)
    return len(idle_pull_requests)

In [None]:
def get_sentiment_counts(org, repo):
    '''Returns the counts for individual components of sentiment analysis.

    The counts cover the seven days leading up to now.

    Args:
        org: The github organization name
        repo: The github repository name

    Returns:
        A tuple of the following values: closed issues, idle issues, closed pull requests, idle pull requests
    '''
    from datetime import timezone

    items, events, comments = load_github_data(org, repo)

    # All timestamps in the dataframes are naive datetimes expressed in UTC
    # (see parse_date), so the window has to be expressed in UTC as well.
    # Using the local clock would shift the window by the local UTC offset.
    end_date = datetime.now(timezone.utc).replace(tzinfo=None)
    start_date = end_date - timedelta(days=7)

    closed_issues = get_closed_issue_count_during(items, events, start_date, end_date)
    idle_issues = get_idle_issue_count_during(items, events, comments, start_date, end_date)
    closed_prs = get_closed_pull_request_count_during(items, events, start_date, end_date)
    idle_prs = get_idle_pull_request_count_during(items, events, comments, start_date, end_date)
    return (closed_issues, idle_issues, closed_prs, idle_prs)
    
def get_individual_sentiment_data(org, repo):
    '''Returns the sentiment split by component, one row for issues and one for pull requests.

    Returns:
        A dictionary with 'rows' ([label, positive count, negative count]) and 'columns'.
    '''
    counts = get_sentiment_counts(org, repo)
    closed_issues, idle_issues, closed_prs, idle_prs = counts

    issue_row = ['Issues', closed_issues, idle_issues]
    pull_request_row = ['Pull Requests', closed_prs, idle_prs]
    return {
        'rows': [issue_row, pull_request_row],
        'columns': ['Index', 'Positive', 'Negative']
    }


def get_total_sentiment_data(org, repo):
    '''Returns the overall positive and negative sentiment as a single row.

    Positive is closed issues plus closed pull requests; negative is idle
    issues plus idle pull requests.

    Returns:
        A dictionary with 'rows' ([label, positive count, negative count]) and 'columns'.
    '''
    closed_issues, idle_issues, closed_prs, idle_prs = get_sentiment_counts(org, repo)
    positive_total = closed_issues + closed_prs
    negative_total = idle_issues + idle_prs
    return {
        'rows': [['Total', positive_total, negative_total]],
        'columns': ['Index', 'Positive', 'Negative']
    }

def get_historical_sentiment_data(org, repo, start_date, end_date):
    '''Returns a dictionary representing the individual sentiment components between the start and end dates.

    The window is split into 30 equal intervals. Rows are produced from the
    newest interval back to the oldest, and each row is labelled with the
    start of its interval.

    Args:
        org: The github organization to collect data for
        repo: The github repository to collect data for
        start_date: An ISO standard date string representing the start date of the historical window.
        end_date: An ISO standard date string representing the end date of the historical window.

    Returns:
        A dictionary with 'rows', 'columns' and 'metadata'; metadata holds the
        interval length in days. 'rows' is empty when the window is empty,
        inverted, or too short to be split.
    '''
    items, events, comments = load_github_data(org, repo)
    start_date = parse_date(start_date)
    end_date = parse_date(end_date)
    delta = (end_date - start_date) / 30

    rows = []
    # A window of only a few microseconds makes the step round to zero; the
    # loop below would then never reach start_date, so it is skipped.
    if delta > timedelta(0):
        curr_time = end_date
        while curr_time > start_date:
            prev_time = curr_time
            curr_time = curr_time - delta
            rows.append([
                curr_time.isoformat(),
                get_closed_issue_count_during(items, events, curr_time, prev_time),
                get_idle_issue_count_during(items, events, comments, curr_time, prev_time),
                get_closed_pull_request_count_during(items, events, curr_time, prev_time),
                get_idle_pull_request_count_during(items, events, comments, curr_time, prev_time)
            ])

    # Interval length in days, with the fractional day rounded to 2 decimals
    interval = delta.days + round(delta.seconds/(60*60*24), 2)
    return {
        'rows': rows,
        'columns': ['Index', 'Closed Issues', 'Idle Issues', 'Closed PRs', 'Idle PRs'],
        'metadata' : { 'interval' : interval}
    }

# UI


In [None]:
%%html
<style>
paper-progress, paper-slider {
    display: block;
    width: 100%;
    margin: 20px 0;
}
</style>

In [None]:
%%html
<template is="urth-core-bind">
    <urth-core-function id="get_individual_sentiment_data" ref="get_individual_sentiment_data" 
        arg-org="{{org}}"  
        arg-repo="{{repo}}"  
        result='{{individual_sentiment_data}}'>
    </urth-core-function> 
    <urth-core-function id="get_total_sentiment_data" ref="get_total_sentiment_data" 
        arg-org="{{org}}"  
        arg-repo="{{repo}}"  
        result='{{total_sentiment_data}}'>
    </urth-core-function> 
    <urth-core-function id="get_historical_sentiment_data" ref="get_historical_sentiment_data" 
        arg-org="{{org}}"  
        arg-repo="{{repo}}"  
        arg-start_date='{{window_start}}'
        arg-end_date='{{window_end}}'
        result='{{historical_data}}'>
    </urth-core-function>
    <urth-core-channel id="defaultChannel"></urth-core-channel>
    <script>
        // Re-runs the three sentiment functions, in declaration order, so that
        // every chart is refreshed.
        function get_data(){
            [
                'get_individual_sentiment_data',
                'get_total_sentiment_data',
                'get_historical_sentiment_data'
            ].forEach(function(functionId){
                document.getElementById(functionId).invoke();
            });
        }
        // Set the start and end for the timeline widget: the timeline covers
        // the 30 days leading up to the moment this script runs.
        var timeline_end = new Date();
        var timeline_start = new Date(timeline_end.getTime() - 1000*60*60*24*30);
        var channel = document.getElementById('defaultChannel');
        channel.set('timeline_start', timeline_start);
        channel.set('timeline_end', timeline_end);
        // Both ends of the historical window start out as "now" (ISO strings).
        // NOTE(review): presumably the timeline widget replaces them through
        // its window-changed event before they are used -- confirm.
        channel.set('window_end', new Date().toISOString());
        channel.set('window_start', new Date().toISOString());
        // Colours shared by every chart that binds to {{palette}}.
        channel.set('palette', ["#4575b4","#d73027", "#313695", "#a50026",])
    </script>
</template>

In [None]:
%%html
<template id='banner' is="urth-core-bind">
    <h1>Community Project Sentiment</h1>
    <paper-item>
        <p>
            This dashboard tries to describe the sentiment of an open source project on github. 
            A project can have elements of both positive and negative sentiment.
            The elements which define positive and negative sentiment are defined below.
        </p>
    </paper-item>
    <paper-item>
            <ul>
                <li>
                    Positive
                    <ul>
                        <li>When an issue is closed</li>
                        <li>When a pull request is closed</li>
                        
                    </ul>
                </li>
                <li>
                    Negative
                    <ul>
                        <li>When an issue has been idle (no activity or comments) within a given time frame.</li>
                        <li>When a pull request has been idle (no activity or comments) within a given time frame.</li>
                    </ul>
                </li>
            </ul>
    </paper-item>
    <paper-input label='Organization' value="{{org}}"></paper-input>
    <paper-input label='Repository' value="{{repo}}"></paper-input>
    <paper-button raised onclick='get_data();'>Collect Data</paper-button>
    <paper-toggle-button id='stream-toggle' on-change='toggleStream'>Stream</paper-toggle-button>
    <template is='dom-if' if='{{show-progress}}'>
        <paper-progress class='blue' value="{{progress}}" min='0' max='{{progress-end}}'></paper-progress>
        <paper-item>
            <p style='text-align:center; width:100%'>{{progress-message}}</p>
        </paper-item>
    </template>
    <script>
        // Handle of the running refresh timer, or null while streaming is off.
        var streamInterval = null;

        // Starts or stops the periodic data refresh (every 7 seconds) to match
        // the state of the 'stream-toggle' button.
        banner.toggleStream = function(e){
            // Always stop the current timer first, so that two "on" events in
            // a row can never leave an orphaned timer running.
            if(streamInterval !== null) {
                clearInterval(streamInterval);
                streamInterval = null;
            }
            if(document.getElementById('stream-toggle').active) {
                streamInterval = setInterval(get_data, 7000);
            }
        };
    </script>
</template>

In [None]:
%%html
<template is="urth-core-bind">
    <h2 style='text-align:center;'>Overall Project Sentiment (Past 7 Days)</h2>
    <urth-viz-chart type='bar' palette='{{palette}}' 
        datarows='{{total_sentiment_data.rows}}' 
        columns='{{total_sentiment_data.columns}}'>
    </urth-viz-chart>
</template>

In [None]:
%%html
<template is="urth-core-bind">
        <h2 style='text-align:center;'>Sentiment Breakdown (Past 7 Days)</h2>
        <urth-viz-chart type='bar' palette='{{palette}}' 
            datarows='{{individual_sentiment_data.rows}}' 
            columns='{{individual_sentiment_data.columns}}'>
        </urth-viz-chart>
</template>

In [None]:
%%html
<template id='timeline_bind' is='urth-core-bind'>
    <h2 style='text-align:center;'>
        Historical Sentiment 
    </h2>
    <h3 style='text-align:center;'>
    <span id='startLabel'></span> to <span id='endLabel'></span> every <span>{{historical_data.metadata.interval}}</span> days
    </h3>
    <urth-viz-chart type='line' palette='{{palette}}' 
        datarows='{{historical_data.rows}}' 
        columns='{{historical_data.columns}}'>
        <urth-viz-col index="0" type="date" ></urth-viz-col>
    </urth-viz-chart>
    <div style='padding-left:60px;'>
        <urth-timeline 
            start-date='{{timeline_start}}'
            end-date='{{timeline_end}}'
            on-window-changed='update'>
        </urth-timeline>
    </div> 
    <script>
        // Handles the timeline's window-changed event: publishes the new
        // window on the channel, updates the heading labels and re-runs the
        // historical sentiment function.
        timeline_bind.update = function(e){
            var detail = e.detail;
            var channel = document.getElementById('defaultChannel');

            var startIso = detail.window_start.toISOString();
            channel.set('window_start', startIso);
            var endIso = detail.window_end.toISOString();
            channel.set('window_end', endIso);

            document.getElementById('startLabel').innerText = detail.window_start.toLocaleDateString();
            document.getElementById('endLabel').innerText = detail.window_end.toLocaleDateString();

            // A work around for declarative widgets issue 35
            // https://github.com/jupyter-incubator/declarativewidgets/issues/35
            var historical_function = document.getElementById('get_historical_sentiment_data');
            historical_function.args['start_date'] = startIso;
            historical_function.args['end_date'] = endIso;
            if(!historical_function.isValid()) {
                return;
            }
            historical_function.invoke();
        }
    </script>
</template>