\# Developer: Ali Hashaam (ali.hashaam@initos.com) <br>
\# 2nd March 2019 <br>

\# © 2019 initOS GmbH <br>
\# License MIT <br>

This notebook scrapes issue descriptions and comments from the Apache Jira tracker for the following projects: <br>
- lucene <br>
- jackrabbit <br> 
- httpclient <br>

In [1]:
import urllib2, json, re
import pandas as pd
from bs4 import BeautifulSoup
from __future__ import unicode_literals
import logging
import logging.handlers

In [2]:
def establish_logger(plateform):
    """
    Create (or fetch) the file logger used while scrapping one plateform.

    > Parameters:
    plateform: str- Name of plateform; used in the logger name and in the log file name

    > Returns:
    logger: logging.Logger- logger named '<plateform>_scrapper' at INFO level, writing to
        'scrapping_logs/<plateform>_scrapping_output.log'
    """
    import os  # local import: the notebook's import cell does not provide os

    logger = logging.getLogger(plateform+"_scrapper")
    logger.setLevel(logging.INFO)
    log_path = 'scrapping_logs/'+plateform+"_scrapping_output.log"

    # logging.getLogger returns the same object for the same name, so calling
    # this function twice would otherwise attach a second handler and write
    # every message twice.
    target = os.path.abspath(log_path)
    for handler in logger.handlers:
        if getattr(handler, 'baseFilename', None) == target:
            return logger

    # FileHandler does not create missing parent directories.
    log_dir = os.path.dirname(log_path)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

In [3]:
def scrap_data(issue_id, description): #LUCENE-754
    """
    Scrap summary, description and comments of one issue from issues.apache.org.

    > Parameters:
    issue_id: str- issue key appended to the browse URL, e.g. 'LUCENE-754'
    description: list- accumulator; one dict with keys 'id', 'summary' and 'description'
        is appended to it in place

    > Returns:
    comments: list- one dict per comment with keys 'id', 'comment_id', 'comment_head' and
        'comment_body'; a single placeholder dict with empty values when the activity panel
        holds no comment blocks; empty list when the page has no activity panel at all
    description: list- the same list object that was passed in

    Commas are stripped from all extracted text. Network and parsing errors propagate
    to the caller.
    """
    comments = []
    results = urllib2.urlopen('https://issues.apache.org/jira/browse/'+issue_id)
    try:
        page = BeautifulSoup(results, "lxml").body
    finally:
        # release the connection even if parsing fails
        results.close()
    issue_id = page.find(id="key-val").text
    issue_summary = page.find(id="summary-val").text.strip().replace(',','')
    description_html = page.find(id="descriptionmodule")
    if description_html:
        issue_description = description_html.find(id="description-val").text.strip().replace(',','')
    else:
        issue_description = ''

    # The comments are embedded as an escaped HTML string inside a <script> tag.
    # Start from '' so a page without that script is handled by the check below
    # instead of failing on an unbound name.
    html = ''
    for script_tag in page.find_all('script'):
        if 'activity-panel-pipe-id' in script_tag.text:
            # Python 2 idiom (this notebook uses urllib2): un-escape the JS string
            raw = script_tag.string.decode('unicode-escape')
            html = raw[raw.find('<div'):raw.rfind('/div>')+5].replace('\\n', ''
                                                                    ).replace('\\"', '"').replace('\\/', '/')
            break

    if html:
        panel = BeautifulSoup(html, "lxml")
        container = panel.find(id='issue_actions_container')
        # a panel without the container is treated like a panel without comments
        content = container.find_all('div', 'issue-data-block') if container is not None else []
        if content:
            for con in content:
                head = con.find('div','action-head')
                action_body = con.find('div','action-body')
                paragraph = action_body.p if action_body is not None else None
                comments.append({
                    'id': issue_id,
                    'comment_id': con['id'],
                    'comment_head': head.find('div','action-details').text.strip().replace(',',''),
                    # always emit the key so every row has the same columns
                    'comment_body': paragraph.text.replace(',','') if paragraph else '',
                })
        else:
            comments.append({'id':issue_id, 'comment_id':'','comment_head':'','comment_body':''})
    description.append({'id':issue_id, 'summary':issue_summary, 'description': issue_description})
    return comments, description

In [4]:
def collect_data(lookup_df, logger, df):
    """
    Scrap every issue listed in lookup_df and collect comments and descriptions.

    > Parameters:
    lookup_df: Pandas Dataframe- must contain a column 'ID' holding the issue keys
    logger: logging.Logger- receives one message per issue (success or failure)
    df: Pandas Dataframe- comments collected so far; scrapped comments are appended to it

    > Returns:
    df: Pandas Dataframe- input df followed by the comments of all scrapped issues
    description_df: Pandas Dataframe- one row per successfully scrapped issue
    ignored_ids: list- issue keys whose scrapping raised an error
    """
    description = []
    ignored_ids = []
    frames = [df]
    for ids in lookup_df['ID']:
        try:
            data, description = scrap_data(ids, description)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops a long run;
            # logger.exception keeps the traceback next to the message.
            ignored_ids.append(ids)
            logger.exception('%s not scrapped because of errors.', ids)
            continue
        frames.append(pd.DataFrame(data))
        logger.info('Data for %s scrapped', ids)
    # one concat at the end instead of re-copying the growing frame per issue
    df = pd.concat(frames, ignore_index=True)
    description_df = pd.DataFrame(description)
    return df, description_df, ignored_ids

In [5]:
def scrapper_main(plateform): #lucene, jackrabbit, httpclient
    """
    Scrap all issues of one plateform and store the results as CSV files.

    > Parameters:
    plateform: str- Name of plateform; selects the lookup CSV that is read and names the
        two CSV files that are written under '../scrapped_data/'

    > Returns:
    ignored_ids: list- issue keys that could not be scrapped
    logger: logging.Logger- the logger created for this plateform
    """
    data_prefix = '../scrapped_data/'+plateform
    logger = establish_logger(plateform)
    empty_comments = pd.DataFrame(columns=['id', 'comment_id', 'comment_head', 'comment_body'])
    lookup_df = pd.read_csv(data_prefix+'_classification_vs_type.csv')
    df, description_df, ignored_ids = collect_data(lookup_df, logger, empty_comments)
    failed = ','.join(ignored_ids)
    print('for plateform '+plateform+' error occured for: '+failed)
    for frame, suffix in ((df, '_issues_comments.csv'), (description_df, '_issues_description.csv')):
        frame.to_csv(data_prefix+suffix, encoding = 'utf-8', index=False)
    return ignored_ids, logger

In [19]:
def scrap_failed_ids(plateform, ignored_ids, logger):
    """
    Retry the issues that failed earlier and add them to the stored CSV files.

    > Parameters:
    plateform: str- Name of plateform whose CSV files under '../scrapped_data/' are updated
    ignored_ids: list- issue keys to scrap again
    logger: logging.Logger- receives one message per scrapped issue

    > Returns:
    None. Errors raised while scrapping propagate; in that case the CSV files are left untouched.
    """
    comments_path = '../scrapped_data/'+plateform+'_issues_comments.csv'
    description_path = '../scrapped_data/'+plateform+'_issues_description.csv'
    df = pd.read_csv(comments_path)
    df_description = pd.read_csv(description_path)
    print('%d %d' % (len(df), len(df_description)))
    if not ignored_ids:
        # nothing to retry: leave both files exactly as they are
        return
    description = []
    frames = [df]
    for ids in ignored_ids:
        data, description = scrap_data(ids, description)
        frames.append(pd.DataFrame(data))
        logger.info('Data for %s scrapped', ids)
    df = pd.concat(frames, ignore_index=True)
    # Append to the stored descriptions. Writing only the retried ones would
    # overwrite the file and lose every description scrapped before.
    description_df = pd.concat([df_description, pd.DataFrame(description)], ignore_index=True)
    df.to_csv(comments_path, encoding = 'utf-8', index=False)
    description_df.to_csv(description_path, encoding = 'utf-8', index=False)

In [None]:
def merge_all_comments(plateform):
    """
    merge all comments of an issue into one column and attach them to its description

    Reads '../scrapped_data/<plateform>_issues_comments.csv' and
    '../scrapped_data/<plateform>_issues_description.csv' and writes the result to
    '../datasets/<plateform>_issues.csv'.

    > Parameters:
    plateform: str- Name of plateform whose textual data is to be dealt with

    > Returns:
    df: Pandas Dataframe- the columns of the description file plus a column 'comments' holding
        the comment bodies of that id joined with '. ' (empty string when an id has no comments)
    """
    df_comments = pd.read_csv('../scrapped_data/'+plateform+'_issues_comments.csv')
    # empty cells are read back as NaN, which str.join cannot handle
    df_comments = df_comments.fillna('')
    df_comments = df_comments.groupby(['id'])['comment_body'].apply('. '.join).to_frame('comments').reset_index()
    df_description = pd.read_csv('../scrapped_data/'+plateform+'_issues_description.csv')
    # left join: keep every described issue, even one without any comment row
    df = pd.merge(df_description, df_comments, how='left', left_on='id', right_on='id')
    # NaN and '' are both written as an empty CSV cell, so the file is unchanged
    df['comments'] = df['comments'].fillna('')
    df.to_csv('../datasets/'+plateform+'_issues.csv', encoding = 'utf-8', index=False)
    return df

In [6]:
# Full pipeline per plateform: scrap everything, retry the failures, then
# merge comments and descriptions into the final dataset.
plateforms = ('lucene', 'jackrabbit', 'httpclient')
for plateform in plateforms:
    ignored_ids, logger = scrapper_main(plateform)
    scrap_failed_ids(plateform, ignored_ids, logger)
    merge_all_comments(plateform)

for plateform jackrabbit error occured for: JCR-145,JCR-209
for plateform httpclient error occured for: HTTPCLIENT-1087


# Testing

In [41]:
#with open('httpclient_scrapping_output.log') as fp:
#    lines = (line.rstrip() for line in fp) # All lines including the blank ones
#    lines = (line for line in lines if line) # Non-blank lines
#    for line in lines:
#        if "not scrapped because of errors." in line:
#            print line

In [46]:
#for plateform in ['lucene', 'jackrabbit', 'httpclient']:
#    lookup_df = pd.read_csv(plateform+'_classification_vs_type.csv')
#    df = pd.read_csv(plateform+'_issues_comments.csv')
#    description_df = pd.read_csv(plateform+'_issues_description.csv')
#    print plateform
#    print len(lookup_df[~(lookup_df['ID'].isin(df['id']))])
#    print len(lookup_df[~(lookup_df['ID'].isin(description_df['id']))])