# GitHub Repo Scraper

In [1]:
#import libraries
import os
import re
import bs4
import csv
import time
import requests
import json, codecs
from pprint import pprint
from github import Github
from bs4 import BeautifulSoup

In [2]:
#write results to a file in JSON format
def write_json(page_objects,page):
    """Write the scraped issues of one result page to a JSON file.

    The file is ``data/data_word_emb<page>.json``, relative to the current
    working directory; the ``data`` directory is created when missing.

    Args:
        page_objects: JSON-serialisable object (the list of issue
            dictionaries collected for the page).
        page: Page number; converted with ``str`` and used as the suffix of
            the file name.
    """
    path_name = 'data'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path_name,exist_ok=True)

    file = 'data_word_emb' + str(page) + '.json'

    # Open with 'w' rather than 'a': appending a second JSON document to an
    # already existing file (e.g. when the notebook is re-run) produces a
    # file that json.load can no longer parse.
    with open(os.path.join(path_name,file),'w',encoding='utf-8') as f:
        json.dump(page_objects,f)

In [3]:
#define functions 
def find_tags(tags,each_issue):
    """Collect the text of every link inside one issue element into ``tags``.

    Args:
        tags: List that is extended in place with one string per link.
        each_issue: Parsed element of a single issue; only its
            ``find_all('a')`` method is used.

    Links whose ``.string`` is ``None`` are skipped. After collection the
    strings are cleaned in place by ``remove_special_ch``.
    """
    # Iterate over the element that was passed in; the previous body read a
    # global loop variable instead of this parameter.
    for link in each_issue.find_all('a'):
        # Test for None directly: comparing str(link.string) with "None"
        # also dropped links whose visible text is literally "None".
        if link.string is not None:
            tags.append(str(link.string))

    remove_special_ch(tags)

In [4]:
#TODO: maybe fold this into the find_desc function to reduce iterations.
def remove_special_ch(text):
    """Clean every string of the list ``text`` in place.

    Newlines and tabs are turned into single spaces, then every ``<...>``
    span (matched non-greedily) is replaced by one space. The list object
    passed in is modified; nothing is returned.
    """
    for position, raw in enumerate(text):
        flattened = raw.replace('\n',' ').replace('\t',' ')
        # A possible alternative pattern is <[^<>]+>, i.e. one or more
        # characters other than < or > enclosed in angle brackets.
        text[position] = re.sub(r"<.*?>"," ",flattened)

In [5]:
def search_all_page(stack_trace,page):
    """Scan the comment blocks of ``page`` until a stack trace is found.

    Each ``div`` with class ``edit-comment-hide`` is visited in document
    order and its first ``td`` is handed to ``calculate_desc``. Scanning
    stops as soon as ``stack_trace`` is no longer empty.

    Args:
        stack_trace: List that receives the stack-trace lines in place.
        page: Parsed document to search.
    """
    for comment in page.find_all('div',class_='edit-comment-hide'):
        # flag 0: only stack-trace lines are collected, so the description
        # list handed over here is a throw-away.
        calculate_desc(comment.find('td'),[],stack_trace,0)
        if stack_trace != []:
            return

In [6]:
def find_desc(each,base_url,description,stack_trace,is_bug):
    """Fetch one issue page and extract description, stack trace and closer.

    Args:
        each: Parsed element of the issue; the ``href`` of its first link is
            appended to ``base_url`` to build the URL that is requested.
        base_url: Prefix prepended to the issue link.
        description: List extended in place with the description lines.
        stack_trace: List extended in place with the stack-trace lines.
        is_bug: When ``1`` and the first comment yields no stack trace, the
            remaining comments of the page are searched as well.

    Returns:
        The text of the author link of the last timeline entry whose text
        contains ``closed this``, or ``"none"`` when no such entry with an
        author link exists.

    Raises:
        requests.HTTPError: If the issue page answers with an error status.
    """
    next_url = base_url+each.find('a')['href']
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(next_url,timeout=30)
    # Report the HTTP status itself instead of failing later with an
    # AttributeError while parsing an error page.
    response.raise_for_status()
    dom = BeautifulSoup(response.content,'html.parser')

    first_comment = dom.find('div',class_ = 'edit-comment-hide')
    body_cell = first_comment.find('td') if first_comment is not None else None

    #find description and the stack trace
    # Skipped when the expected markup is missing, so that the closer can
    # still be recorded; description and stack_trace then stay as passed in.
    if body_cell is not None:
        calculate_desc(body_cell,description,stack_trace,1)

    #search all the page for a stack trace
    if stack_trace == [] and is_bug == 1:
        search_all_page(stack_trace,dom)

    #find the person who closed the issue
    who_closed_it = "none"
    for entry in dom.find_all('div',class_='TimelineItem-body'):
        if 'closed this' not in entry.text:
            continue
        # Look the author link up once; later matches overwrite earlier
        # ones, so the last matching entry wins.
        author = entry.find('a',class_='author Link--primary text-bold')
        if author is not None:
            who_closed_it = author.text

    #remove special characters from the description and stack trace
    remove_special_ch(description)

    if stack_trace != []:
        remove_special_ch(stack_trace)

    return who_closed_it

In [7]:
def _is_stack_trace_line(text):
    """Return True when ``text`` contains one of the stack-trace markers."""
    return any(marker in text for marker in ('.java:','java.','AndroidRuntime:'))


def calculate_desc(html_text,description,stack_trace,flag):
    """Split the content of an HTML node into description and stack trace.

    Args:
        html_text: Node whose ``contents`` list is examined.
        description: List extended in place with non-stack-trace text; only
            filled when ``flag`` is ``1``.
        stack_trace: List extended in place with every piece of text that
            contains ``.java:``, ``java.`` or ``AndroidRuntime:``.
        flag: ``1`` to collect description lines too; any other value
            collects stack-trace lines only.

    A node with exactly one child is handled directly: the child is
    stringified and classified, and a child whose string starts with
    ``<code>`` is never added to the description. Otherwise each child is
    visited: plain text longer than four characters is classified, and tags
    are processed recursively.
    """
    children = html_text.contents

    if len(children) == 1:
        only_child = str(children[0])
        if _is_stack_trace_line(only_child):
            stack_trace.append(only_child)
        elif flag == 1 and not only_child.startswith('<code>'):
            description.append(only_child)
        return

    for child in children:
        # Exact type check on purpose: subclasses of NavigableString (such
        # as comments) are not treated as text.
        if type(child) is bs4.element.NavigableString:
            #avoid writing empty lines and html tags <br>
            if len(child) > 4:
                line = str(child)
                if _is_stack_trace_line(line):
                    stack_trace.append(line)
                elif flag == 1:
                    description.append(line)
        # The previous test was type(child is Tag), which is always truthy
        # and therefore recursed into nodes that have no .contents.
        elif isinstance(child,bs4.element.Tag):
            #call recursively the function till length is 1
            calculate_desc(child,description,stack_trace,flag)
        

In [8]:
#search all closed issues

#define some variables
# Running totals updated by the scraping loop.
total_issues  = 0
total_traces  = 0
total_bugs    = 0
pages_counter = 0
issues        = []


#initialize important url's
base_url = "https://github.com/"
# Plain string literal: the URL contains no placeholders, so no f-string.
query_url = "https://github.com/cgeo/cgeo/issues?page=1&q=is%3Aissue+is%3Aclosed"

#authorization in order to make more requests.
token = os.getenv("GITHUB_TOKEN")
# Send the header only when a token is configured; otherwise the request
# would carry the literal value "token None".
headers = {'Authorization': f'token {token}'} if token else {}

# First result page; the timeout keeps a stalled connection from blocking
# the notebook indefinitely.
response = requests.get(query_url, headers = headers, timeout = 30)

In [9]:
#loop through all pages

# Walk the result pages one by one: scrape every issue listed on the current
# page, write the page to disk, then follow the "next page" link until there
# is none.
while True:
    if response.status_code != 200:
        raise Exception(f"Error Occured (HTTP status {response.status_code})")

    dom = BeautifulSoup(response.content,'html.parser')

    #find all issues in every page
    # (named issue_rows so that the builtin all() is not shadowed)
    issue_rows = dom.find_all('div', class_='flex-auto min-width-0 p-2 pr-3 pr-md-2')

    page_objects = []
    pages_counter += 1

    #real scraping begins
    #search all issues per page
    for each in issue_rows:

        #find tags and who open the issue
        tags=[]
        find_tags(tags,each)

        # Without any link text there is neither a name nor an opener to
        # record; skip the row instead of failing on tags[0] or appending
        # the previous issue's dictionary a second time.
        if not tags:
            print("skipping an issue without link text on page",pages_counter)
            continue

        #flag that is activated if the issue is bug
        # Counted once per issue, even if "Bug" occurs more than once.
        is_bug = 1 if "Bug" in tags else 0
        total_bugs += is_bug

        #find description, stack trace and who closed
        description   = []
        stack_trace   = []
        who_closed_it = find_desc(each,base_url,description,stack_trace,is_bug)

        total_issues += 1
        if stack_trace:
            total_traces += 1

        print(total_issues,tags[0])

        #write dictionary
        # First entry is stored as the name, last one as the opener and
        # everything in between as the tags.
        issue_object = {'name':tags[0],'tags':tags[1:-1],'opened_by':tags[-1],
                        'description':description,'stack_trace':stack_trace,'closed_by':who_closed_it}
        page_objects.append(issue_object)

    #write issues by page
    write_json(page_objects,pages_counter)

    #combine all issues to a global list
    #issues.extend(page_objects)

    #visit the next page if exists
    end = dom.find_all('a',class_='next_page')
    if not end:
        break

    # Take the last match: identical to end[1] when two links are present,
    # and still valid when the page contains only one.
    next_url = base_url + end[-1]['href']
    # Send the same headers as for the first page.
    response = requests.get(next_url, headers = headers)

    ##for debugging
    if total_issues % 25 != 0:
        print("##########################\nTHE PAGE##############\n ",pages_counter,"HAS NOT 25 ISSUES",)
    ##

    # Sleep for 60 seconds
    time.sleep(60)

1 Question: Do routing profiles still work as we now have localized raw folders?
2 String search_filter_info_message contains a failure
3 Exclude new markdown changelog translations from merging
4 Android 5 users with SAF problems
5 [New filter framework] Filter for "Hidden" date
6 Container size iso. GC-Code in map view
7 Move filter buttons to ActionBar 
8 Use different icon for LC caches
9 Respect selected update interval for BRouter on first startup
10 more unshortened LC codes hit the UI
11 Routing profiles not set in menu after first tile downloads
12 Improve internal routing first use (routing profiles)
13 Backup with account data but empty database not possible
14 Comma in formula to parse is not recognized
15 Tap on notification status opens in Chrome Webview
16 Use short geocode on target name
17 [New Filters] Add filter "Attributes to Exclude" to suppress caches with given attributes
18 ALC live map loading needs optimization
19 Filter screen wrong colored on white theme
20 

164 Unresponsive/Slow app when buffering themes on SD
165 Check on existing final coords before navigation is started
166 NullPointerException in Geopoint.distanceTo
167 [Nightly] Syc'ing themes folder to internal storage does not work
168 Meaning of string folder_process_status_currentfile
169 SAF: Picture attached via Google Fotos gets attached twice
170 replace com.yqritc.recyclerviewflexibledivider
171 use of JCenter is deprecated
172 SAF: Behavior of migration wizard if folder selection is aborted
173 custom cache icon can only be changed for a whole list
174 Make use of enhanced filter possibilities
175 Next beta/feature release (SAF!)
176 Crash when trying to attach image
177 [Nightly] Save / Refresh Mystery does not work - sometimes
178 Question: Offline Log Deletion on Cache Deletion ok?
179 Offline find counter [NUMBER] is inaccurate and getting worse
180 tests fail on testSearchTrackable()
181 ANR on first startup and when opening About c:geo
182 Calculating waypoints with v

331 Map resets to GoogleMap after version upgrade
332 send logfile to support
333 crash when adding current position WP to not offline stored caches 
334 NPE in InitializeMap
335 ArrayIndexOutOfBoundsException in CacheListAdapter
336 [Livemap] open in browser for UDC's 
337 Show if you did not find a cache
338 waypoint type edit field is missing
339 Access of stacked caches
340 Error during log /w picture
341 dialog theming bugs when using light theme
342 Crash when opening live map
343 Webview - No other browser option if Chrome installed
344 OSM Attribution Format
345 try to reload non-existing gpx
346 Localization missing for map download dialog
347 Copy of waypoint to clipboard
348 ArithmeticException in ImageUtils
349 Purge deleted UDC from DB immediately (to be able to "change" UDC coords)
350 Set activity transition for splash activity
351 Distance info constantly shown as "0,00m" on GMaps
352 Offline map download shouldn't use mobile data
353 Ask for file name on exporting indi

498 [Nightly] Edit of found log leads to new note
499 Unified handling for backing up/restoring caches & settings
500 Log.d() does no longer work on "activate debug log" setting
501 Enhance signature number template 
502 [Nightly] Incorrect Date of Offline Log
503 Made offline log count in log templates optional
504 Status of map downloader if target is not writeable
505 Problem with tb actions while logging
506 ClassCast Exception on launching NewMap or CGeoMap
507 Downloading offline maps - stops sporadically, after completion there is no map
508 Explain how to find Offline logs
509 NPE reported on Google pre launch report 
510 [Nightly] Crash when tap on Saved Log
511 Support color-coded cache categories
512 Send nearby notifications to Android notification interface
513 Support new gc.com attributes
514 Offer c:geo for navigation intents
515 Where is map_trail_hide?
516 Info, cachewiki (German)
517 Map: scale jump
518 Add meaning of c:geo to FAQs
519 Map crashes on aborting "load i

662 Whereyougo links not recognized
663 Support importing bookmarks like pocket queries directly
664 [Nightly] Changing width of customized lines behaves weird
665 History trail active on GoogleMaps altough deactivated
666 cgeo.geocaching.activity.ProgressTest.testProgressWrapper failing randomly
667 Line feeds in log text missing in log view after submit
668 Routed distance no longer shown
669 Accuracy circle on GMv2 incorrectly colored
670 GPX track: Add option to unload track
671 GPX track: Crash if selected track file removed
672 GPX track: Crash on loading invalid GPX track file
673 Crash on map while track is shown
674 Default line appearances have changed
675 Open links from cache listing in separate task
676 Add elevation and timestamp info to exported history trail GPX
677 Limit file types in intents
680 Authentication towards OC.de failing
681 Evaluate new location permission with targetSDK
682 Rework storage framework acc. to Android 11 requirements
683 Empty list has wrong 

833 Cache map jumps from cache position to my position if rotation is enabled
834 Hint field for coordinates incorrect?
835 checkstyle configuration for static imports
836 Next bugfix release
837 Fix remaining lint issues and define baseline
838 Better UI for rotating maps feature
839 Rotating map not rotating as expected
840 Support trails
841 Wrong log date on geocaching.su
842 Support new locationless cache type
843 Codacy GitHub integration
844  GitHub will discontinue password authentication to the API
845 "Caches hidden" fails to load with customized "cache owner" name
846 Update AndroidAnnotations to 4.7 when available
847 Remove "Force to english"
848 Different dir chooser dialogs in use
849 Harmonize maps - Add selection dialog for overlaying icons
850 Google Maps v2: disable screen rotation
851 Strange Google satellite view
852 Delete stale branches / branches no longer used
853 Default to OSM:Map if Google Map cant be used
854 Next bugfix release
855 Harmonize maps - Add tar

1006 Add utility app to keep GPS on
1007 Search for geocode working without active connectors
1008 Getting started link to service settings not working
1009 Endless loop after denying storage authorization
1010 build fails with gradle error ... can't find referenced class ... ActivityCompat21
1011 Sendto c:geo asks for registrate first
1012 Merging of disabled status between map and saved caches
1013 Create account option not working for some OC platforms
1014 missing option to create account for geocaching.su
1015 Next feature release
1016 Move data dir to SD crashes the app
1017 Trackable "status" seems not localized
1018 Support direct offline map download links
1019 Found status not correctly merged between Live Map and Database
1020 waypoints not shown
1021 Remove map strategy
1023 Zooming in is bumpy on mapsforge 0.11
1024 Have share option after field note export
1025 cannot install from F-Droid 
1026 UI frozen while loading trackables on log page
1027 Use hybrid map as satellit

KeyboardInterrupt: 

In [None]:
#print some statistics
# One value per line: bug count, issues with a stack trace, all issues.
print(total_bugs,total_traces,total_issues,sep='\n')
#must be 6106 (presumably the expected total issue count - confirm)

In [None]:
# To ask:
# I make small changes to the text, and I discard code that is not a stack trace. Is that OK?