In [44]:
import requests
import json

def get_repo_data(target_stars, date_since, file_name):
    """
    Fetch the most-starred GitHub repositories and save them as JSON.

    Queries the GitHub repository search endpoint for repositories created
    after ``date_since`` with more than ``target_stars`` stars, sorted by
    stars in descending order, requesting pages 1 through 10 with 100
    results per page.

    Args:
        target_stars: Exclusive lower bound on the star count.
        date_since: Exclusive lower bound on the creation date, as the
            string that is placed in the ``created:>`` qualifier
            (the notebook passes 'YYYY-MM-DD').
        file_name: Path of the output file; it is overwritten.

    Returns:
        None. The collected items are written to ``file_name`` as
        ``{"data": [...]}``.

    Raises:
        requests.exceptions.HTTPError: If a page request returns an HTTP
            error status.
    """
    json_content = {'data': []}
    # range() instead of xrange() so the loop runs on Python 2 and 3 alike.
    for page in range(1, 11):
        url = 'https://api.github.com/search/repositories?q=created:%3E{}+stars:%3E{}&sort=stars&order=desc&per_page=100&page={}'.format(date_since, target_stars, page)
        res = requests.get(url)
        # Fail with the HTTP error itself instead of an opaque KeyError on
        # 'items' when the API answers with an error document.
        res.raise_for_status()
        items = res.json()['items']
        if not items:
            # An empty page means there is nothing left to fetch.
            break
        json_content['data'] += items
    # saves the data to a file in json format
    with open(file_name, 'w') as f:
        json.dump(json_content, f, sort_keys=True, indent=4, separators=(',', ': '))

In [45]:
# Fetch the master list of "popular" repositories.
def get_repo_master_data(target_stars=5000, date_since='2000-11-06',
                         file_name='repo_master_data_json.txt'):
    """
    Defines what 'popular' for a repo means and fetches those repos.

    Called without arguments it behaves as before: repositories with more
    than 5000 stars created after 2000-11-06 are saved to
    'repo_master_data_json.txt'.

    Args:
        target_stars: Star threshold forwarded to ``get_repo_data``.
        date_since: Creation-date threshold forwarded to ``get_repo_data``.
        file_name: File where repo info is stored.

    Returns:
        None.
    """
    get_repo_data(target_stars, date_since, file_name)

In [54]:
# UNCOMMENT the line below to fetch info for popular repos

# get_repo_master_data()

In [55]:
file_name = 'repo_master_data_json.txt' # File where most popular repo info is stored

def test_popular_repos_file(file_name, num):
    """
    Print the number of repos in the master file and pretty-print one entry.

    Args:
        file_name: Path of a JSON file whose top-level object has a 'data'
            list.
        num: Index of the entry of that list to print.

    Raises:
        IndexError: If ``num`` is outside the list (the count is printed
            before the lookup happens).
    """
    with open(file_name, 'r') as f:
        data = json.load(f)
    entries = data['data']
    # print(...) with a single argument is valid on both Python 2 and 3.
    print(len(entries))
    print(json.dumps(entries[num], sort_keys=True, indent=4, separators=(',', ': ')))

# UNCOMMENT below to test any entry in the popular repos file    
    
# test_popular_repos_file(file_name, 0)

In [None]:
from configobj import ConfigObj

# Github API keys, read from the 'config.ini' file next to the notebook.
config = ConfigObj('config.ini')
client_id, client_secret = config['github_api_id'], config['github_api_secret']

# Query-string suffix carrying both keys; the fetch functions append it to
# the API URLs they request.
# NOTE(review): this puts the client secret into every request URL - confirm
# the API still accepts credentials passed this way.
GITHUB_AUTH = '?client_id={}&client_secret={}'.format(client_id, client_secret)

In [56]:
def get_repo_commit_data(file_name, repo_master_file_name):
    """
    Gets commit data for each repo inside the popular repos file.

    For every entry of the master file's 'data' list, requests the repo's
    commits URL (with ``GITHUB_AUTH`` appended) and stores the decoded JSON
    response, whatever its HTTP status, under the repo id.

    Args:
        file_name: Output path; receives a JSON object mapping repo id to
            ``{'name': ..., 'commits': ...}``. It is overwritten.
        repo_master_file_name: Path of the popular-repos JSON file.

    Returns:
        None. One progress line (name, repos done, HTTP status) is printed
        per repo.

    Note:
        A later cell of this notebook defines another function under this
        same name; once that cell has run, this definition is shadowed.
    """
    # Read the master list up front so the file is closed before the
    # (slow) network loop starts.
    with open(repo_master_file_name, 'r') as master_file:
        master_data = json.load(master_file)  # most popular repos, as a dict

    commits_data = {}
    for repo in master_data['data']:
        # The original code dropped the last 6 characters of 'commits_url',
        # presumably a trailing '{/sha}' placeholder; cutting at the first
        # '{' does the same without the magic number.
        commits_url = repo['commits_url'].split('{', 1)[0] + GITHUB_AUTH
        res = requests.get(commits_url)
        commits_data[repo['id']] = {'name': repo['name'], 'commits': res.json()}
        # Same text as the old print statement, valid on Python 2 and 3.
        print('%s %s %s' % (repo['name'], len(commits_data), res.status_code))

    # writes the commits_data dict to a file
    with open(file_name, 'w') as out_file:
        json.dump(commits_data, out_file, sort_keys=True, indent=4, separators=(',', ': '))

In [57]:
# UNCOMMENT the line below to fetch commit info for popular repos

#get_repo_commit_data('repo_commit_data_json.txt', 'repo_master_data_json.txt')

In [60]:
def test_count_commits_info_repos(file_name):
    """
    Print the number of top-level entries stored in a JSON file.

    Args:
        file_name: Path of the JSON file to inspect.
    """
    with open(file_name, 'r') as f:
        commit_data = json.load(f)
    # print(...) with a single argument is valid on both Python 2 and 3.
    print(len(commit_data))
        
# UNCOMMENT below to test the count
        
# file_name = 'repo_commit_data_json.txt'
# test_count_commits_info_repos(file_name)

1000



def get_repo_contributor_data(file_name, repo_master_file_name):
    """
    Get all the contributor information.

    For every entry of the master file's 'data' list, requests the repo's
    'contributors_url' (with ``GITHUB_AUTH`` appended) and stores the
    decoded JSON response, whatever its HTTP status, under the repo id.

    Args:
        file_name: Output path; receives a JSON object mapping repo id to
            ``{'name': ..., 'contributors': ...}``. It is overwritten.
        repo_master_file_name: Path of the popular-repos JSON file.

    Returns:
        None. One progress line (id, name, repos done, HTTP status) is
        printed per repo.
    """
    # Read the master list up front so the file is closed before the
    # (slow) network loop starts.
    with open(repo_master_file_name, 'r') as master_file:
        master_data = json.load(master_file)

    contributor_data = {}
    for repo in master_data['data']:
        url = repo['contributors_url'] + GITHUB_AUTH
        res = requests.get(url)
        contributor_data[repo['id']] = {'name': repo['name'], 'contributors': res.json()}
        # Same text as the old print statement, valid on Python 2 and 3.
        print('%s %s %s %s' % (repo['id'], repo['name'], len(contributor_data), res.status_code))

    with open(file_name, 'w') as out_file:
        json.dump(contributor_data, out_file, sort_keys=True, indent=4, separators=(',', ': '))


# This cell originally defined the function above under the name
# 'get_repo_commit_data'. The old name is kept as an alias so existing calls
# keep working; as before, running this cell rebinds that name away from the
# commits fetcher defined earlier in the notebook.
get_repo_commit_data = get_repo_contributor_data

In [62]:
# UNCOMMENT the line below to fetch contributor info for popular repos

#get_repo_commit_data('repo_contributor_data_json.txt', 'repo_master_data_json.txt')

In [None]:
"""
Get all the issue comments data
"""
# Load the master list first so the file is closed before the network loop.
with open('repo_master_data_json.txt', 'r') as f:
    master_data = json.load(f)

issue_comment_data = {}
for repo in master_data['data']:
    # The original code dropped the last 9 characters of the URL,
    # presumably a trailing '{/number}' placeholder; cut at the first '{'.
    url = repo['issue_comment_url'].split('{', 1)[0]
    url = url + '?client_id={}&client_secret={}'.format(client_id, client_secret)
    res = requests.get(url)
    payload = res.json()
    # Only a JSON list can hold comment objects. Anything else (for example
    # an error document) is recorded as "no comments" instead of raising a
    # TypeError that would abort the run and lose everything collected.
    if isinstance(payload, list):
        comments = [comment['body'] for comment in payload]
    else:
        comments = []
    issue_comment_data[repo['id']] = {'name': repo['name'], 'comments': comments}
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s %s %s' % (repo['id'], repo['name'], len(issue_comment_data), res.status_code))

with open('repo_issue_comments_json.txt', 'w') as f:
    json.dump(issue_comment_data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
"""
Get all the issues data
"""
# Load the master list first so the file is closed before the network loop.
with open('repo_master_data_json.txt', 'r') as f:
    master_data = json.load(f)

# The name is inherited from the issue-comments cell; here it holds issues.
issue_comment_data = {}
for repo in master_data['data']:
    # The original code dropped the last 9 characters of the URL,
    # presumably a trailing '{/number}' placeholder; cut at the first '{'.
    url = repo['issues_url'].split('{', 1)[0]
    url = url + '?client_id={}&client_secret={}'.format(client_id, client_secret)
    res = requests.get(url)
    payload = res.json()
    # Only a JSON list can hold issue objects. Anything else (for example
    # an error document) is recorded as "no issues" instead of raising a
    # TypeError that would abort the run and lose everything collected.
    if isinstance(payload, list):
        contents = [entry['body'] for entry in payload]
    else:
        contents = []
    issue_comment_data[repo['id']] = {'name': repo['name'], 'contents': contents}
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s %s %s' % (repo['id'], repo['name'], len(issue_comment_data), res.status_code))

with open('repo_issue_json.txt', 'w') as f:
    json.dump(issue_comment_data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
"""
Get all the languages data
"""
# Load the master list first so the file is closed before the network loop.
with open('repo_master_data_json.txt', 'r') as f:
    master_data = json.load(f)

# The name is inherited from the issue-comments cell; here it holds the
# decoded 'languages_url' response of each repo.
issue_comment_data = {}
for repo in master_data['data']:
    url = repo['languages_url']
    url = url + '?client_id={}&client_secret={}'.format(client_id, client_secret)
    res = requests.get(url)
    # The response body is stored as-is, whatever the HTTP status.
    issue_comment_data[repo['id']] = {'name': repo['name'], 'languages': res.json()}
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s %s %s' % (repo['id'], repo['name'], len(issue_comment_data), res.status_code))

with open('repo_languages_json.txt', 'w') as f:
    json.dump(issue_comment_data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
"""
Get all the milestone data
"""
# Load the master list first so the file is closed before the network loop.
with open('repo_master_data_json.txt', 'r') as f:
    master_data = json.load(f)

# The name is inherited from the issue-comments cell; here it holds milestones.
issue_comment_data = {}
for repo in master_data['data']:
    # The original code dropped the last 9 characters of the URL,
    # presumably a trailing '{/number}' placeholder; cut at the first '{'.
    url = repo['milestones_url'].split('{', 1)[0]
    url = url + '?client_id={}&client_secret={}'.format(client_id, client_secret)
    res = requests.get(url)
    payload = res.json()
    # Only a JSON list can hold milestone objects. Anything else (for
    # example an error document) is recorded as "no milestones" instead of
    # raising a TypeError that would abort the run.
    contents = []
    if isinstance(payload, list):
        for entry in payload:
            contents.append({'title': entry['title'],
                             'description': entry['description'],
                             'open_issues': entry['open_issues'],
                             'closed_issues': entry['closed_issues']})
    issue_comment_data[repo['id']] = {'name': repo['name'], 'milestones': contents}
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s %s %s' % (repo['id'], repo['name'], len(issue_comment_data), res.status_code))

with open('repo_milestones_json.txt', 'w') as f:
    json.dump(issue_comment_data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
"""
get concrete contributors data
"""
from collections import defaultdict

# Load both inputs and close the files before the network loop starts.
with open('repo_contributor_data_json.txt') as f, open('repo_master_data_json.txt') as master:
    contributor_data = json.load(f)
    master_data = json.load(master)['data'][:201]  # first 201 master entries only

# Built once instead of once per URL.
auth_query = '?client_id={}&client_secret={}'.format(client_id, client_secret)

concrete_contributor = defaultdict(list)
for repo in master_data:
    if repo['name'] == 'linux':
        continue
    # JSON object keys are strings, hence str() around the integer repo id.
    contributors = contributor_data[str(repo['id'])]['contributors']
    # Slicing already copes with short lists. A non-list value (for example
    # an error document saved in place of the list) yields no contributors
    # instead of a TypeError that would abort the run.
    top_contributors = contributors[:5] if isinstance(contributors, list) else []
    for individual in top_contributors:
        endpoints = [
            ('followers', individual['followers_url']),
            # The original code dropped the last 15 characters of
            # 'starred_url', presumably trailing '{...}' placeholders.
            ('starred', individual['starred_url'].split('{', 1)[0]),
            ('organizations', individual['organizations_url']),
            ('repos', individual['repos_url']),
        ]
        status_codes = []
        counts = {}
        for label, endpoint in endpoints:
            res = requests.get(endpoint + auth_query)
            status_codes.append(str(res.status_code))
            # NOTE(review): this counts the entries of one response only; if
            # the API paginates, these are not totals - confirm.
            counts[label] = len(res.json())
        # Same text as the old print statement, valid on Python 2 and 3.
        print('%s %s %s' % (repo['name'], individual['id'], ' '.join(status_codes)))
        concrete_contributor[repo['id']].append({'individual_id': individual['id'],
                                                 'login': individual['login'],
                                                 'followers': counts['followers'],
                                                 'starred': counts['starred'],
                                                 'repos': counts['repos'],
                                                 'organizations': counts['organizations']})
    print('%s %s %s' % (repo['id'], repo['name'], len(concrete_contributor)))

with open('repo_concrete_contributors_json.txt', 'w') as ff:
    json.dump(concrete_contributor, ff, sort_keys=True, indent=4, separators=(',', ': '))


In [None]:
from bs4 import BeautifulSoup
def clean_up(s):
    """
    Convert a counter label such as '42', '1,234' or '1.5k' to an int.

    Args:
        s: The label; it is passed through ``str()`` first, so numbers are
            accepted too.

    Returns:
        The integer value. A trailing 'k' or 'K' multiplies the number in
        front of it by 1000; thousands separators (',') and surrounding
        whitespace are ignored.

    Raises:
        ValueError: If the remaining text is not a number.
    """
    text = str(s).strip().replace(',', '')
    if text[-1:] in ('k', 'K'):
        # round() before int(): a product such as 1000.9999999999999 must
        # become 1001, not be truncated to 1000.
        return int(round(float(text[:-1]) * 1000))
    return int(text)
    
# Refresh the per-contributor numbers with the counters scraped from each
# contributor's GitHub profile page.
with open('repo_concrete_contributors_json2.txt', 'r') as f:
    concrete_contributor = json.load(f)

for values in concrete_contributor.values():
    for value in values:
        print(value['login'])
        try:
            html = requests.get('https://github.com/' + value['login'] + '/').content
            soup = BeautifulSoup(html)
            data = soup.find_all('span', {'class' : 'counter'}, text=True)
            # The code relies on the first four counters being, in order:
            # repositories, stars, followers, following.
            repositories = data[0].get_text().strip()
            stars = data[1].get_text().strip()
            followers = data[2].get_text().strip()
            following = data[3].get_text().strip()
            # Same text as the old print statement, valid on Python 2 and 3.
            print('%s %s %s %s' % (repositories, stars, followers, following))
        except Exception:
            # Still best-effort (skip this contributor), but unlike a bare
            # 'except:' this lets KeyboardInterrupt / SystemExit through so
            # a long scrape can be stopped.
            print("^^^^^^^^error above name")
            continue
        value['followers'] = clean_up(followers)
        value['following'] = clean_up(following)
        value['starred'] = clean_up(stars)
        value['repos'] = clean_up(repositories)

with open('repo_concrete_contributors_json3.txt', 'w') as ff:
    json.dump(concrete_contributor, ff, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
from bs4 import BeautifulSoup
import requests
import json
from collections import defaultdict
def clean_up(s):
    """
    Convert a counter label such as '42', '1,234' or '1.5k' to an int.

    Args:
        s: The label; it is passed through ``str()`` first, so numbers are
            accepted too.

    Returns:
        The integer value. A trailing 'k' or 'K' multiplies the number in
        front of it by 1000; thousands separators (',') and surrounding
        whitespace are ignored.

    Raises:
        ValueError: If the remaining text is not a number.
    """
    text = str(s).strip().replace(',', '')
    if text[-1:] in ('k', 'K'):
        # round() before int(): a product such as 1000.9999999999999 must
        # become 1001, not be truncated to 1000.
        return int(round(float(text[:-1]) * 1000))
    return int(text)
    
# Scrape the profile-page counters of every contributor listed in the
# contributor data file.
with open('repo_contributor_data_json.txt', 'r') as f:
    contributors = json.load(f)

concrete_contributors = defaultdict(list)
# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for key, values in contributors.items():
    members = values['contributors']
    if not isinstance(members, list):
        # Not a contributor list (for example an error document saved in its
        # place): skip the repo instead of failing on value['login'].
        continue
    for value in members:
        print(value['login'])
        try:
            html = requests.get('https://github.com/' + value['login'] + '/').content
            soup = BeautifulSoup(html)
            data = soup.find_all('span', {'class' : 'counter'}, text=True)
            # The code relies on the first four counters being, in order:
            # repositories, stars, followers, following.
            repositories = data[0].get_text().strip()
            stars = data[1].get_text().strip()
            followers = data[2].get_text().strip()
            following = data[3].get_text().strip()
            # Same text as the old print statement, valid on Python 2 and 3.
            print('%s %s %s %s' % (repositories, stars, followers, following))
        except IndexError:
            # Fewer than four counters on the page: skip this contributor.
            print("^^^^^^^^error above name")
            continue
        concrete_contributors[key].append({
                'login': value['login'],
                'id': value['id'],
                'followers': clean_up(followers),
                'following': clean_up(following),
                'starred': clean_up(stars),
                'repos': clean_up(repositories)
            })

with open('repo_concrete_contributors_json.txt', 'w') as ff:
    json.dump(concrete_contributors, ff, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
# Dump the scraped contributor data; print(...) with a single argument is
# valid on both Python 2 and 3.
print(concrete_contributors)

In [None]:
from bs4 import BeautifulSoup
import requests
import json
from collections import defaultdict
def clean_and_get_text_data(s):
    """
    Parse the integer shown inside a scraped element.

    Args:
        s: An object with a ``get_text()`` method returning the element's
            text (``scrape_repo_info`` passes the elements it found with
            ``find_all``).

    Returns:
        The integer value of the text after stripping whitespace and
        removing ',' and '+' characters, or -1 when what remains is not an
        integer.
    """
    try:
        return int(s.get_text().strip().replace(',', '').replace('+', ''))
    except ValueError:
        # int() reports unparsable text as UnicodeEncodeError on Python 2
        # (for non-ASCII text) and as plain ValueError on Python 3.
        # UnicodeEncodeError is a ValueError subclass, so this one clause
        # keeps the -1 fallback working on both.
        return -1

def scrape_repo_info(username, reponame):
    """
    Scrape summary counters from a repository's GitHub web page.

    Args:
        username: Owner login; first path segment of the page URL.
        reponame: Repository name; second path segment of the page URL.

    Returns:
        dict with the integer keys 'commits', 'branches', 'release',
        'contributors', 'issues', 'pull_requests' and 'projects'. A value
        is -1 when its text could not be parsed, and 'issues' is 0 when the
        page shows exactly two 'counter' spans.

    Raises:
        IndexError: If the page has fewer counter elements than expected;
            callers in this notebook catch it to build their retry list.
    """
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s' % (username, reponame))
    html = requests.get('https://github.com/' + username + '/' + reponame).content
    soup = BeautifulSoup(html)
    data_1 = soup.find_all('span', {'class' : 'num text-emphasized'}, text=True)
    data_2 = soup.find_all('span', {'class' : 'counter'}, text=True)

    # The first four 'num' spans are read identically in both layouts.
    # Explicit indexing (not slicing) keeps the IndexError contract.
    commits = clean_and_get_text_data(data_1[0])
    branches = clean_and_get_text_data(data_1[1])
    release = clean_and_get_text_data(data_1[2])
    contributors = clean_and_get_text_data(data_1[3])

    if len(data_2) == 2:
        # Two counters: read as pull requests and projects; no issues value.
        issues = 0
        pull_requests = clean_and_get_text_data(data_2[0])
        projects = clean_and_get_text_data(data_2[1])
    else:
        # Otherwise the first three counters are read as issues, pull
        # requests and projects.
        issues = clean_and_get_text_data(data_2[0])
        pull_requests = clean_and_get_text_data(data_2[1])
        projects = clean_and_get_text_data(data_2[2])

    data = {'commits': commits,
            'branches': branches,
            'release': release,
            'contributors': contributors,
            'issues': issues,
            'pull_requests': pull_requests,
            'projects': projects
           }
    print(data)
    return data

"""
with open('repo_master_data_json.txt','r') as f:
    master_data = json.load(f)['data']
    repo_concrete_data = {}
    error_list = []
    for repo in master_data:
        username = repo['owner']['login']
        reponame = repo['name']
        try:
            repo_concrete_data[repo['id']] = scrape_repo_info(username, reponame)
        except:
            error_list.append((repo['id'], username, reponame))
    with open('repo_concrete_data_json.txt', 'w') as ff:
        ff.write(json.dumps(repo_concrete_data, sort_keys=True, indent=4, separators=(',', ': ')))
    print error_list
"""

In [None]:
# Repos to (re-)scrape, as (repo id, owner login, repo name) tuples.
# The retry cell below unpacks each tuple in that order. Some entries appear
# more than once. Presumably pasted from the error output of an earlier
# scraping run - confirm.
error_list = [(5344375, u'textmate', u'textmate'), (58836534, u'parkjs814', u'AlgorithmVisualizer'), (1549138, u'CocoaLumberjack', u'CocoaLumberjack'), (15953199, u'HubSpot', u'youmightnotneedjquery'), (24817507, u'moklick', u'frontend-stuff'), (132321, u'facebookarchive', u'three20'), (2281775, u'marcuswestin', u'WebViewJavascriptBridge'), (41592744, u'AllThingsSmitty', u'css-protips'), (20799673, u'kesenhoo', u'android-training-course-in-chinese'), (14454268, u'briangonzalez', u'jquery.adaptive-backgrounds.js'), (50301368, u'p-e-w', u'maybe'), (5894096, u'mdo', u'code-guide'), (21109196, u'PureLayout', u'PureLayout'), (48175620, u'google', u'agera'), (16466596, u'michaelvillar', u'dynamics.js'), (11423758, u'mame', u'quine-relay'), (12977854, u'breach', u'breach_core'), (25212911, u'davidsonfellipe', u'awesome-wpo'), (3159966, u'ubuwaits', u'beautiful-web-type'), (31125362, u'bendc', u'frontend-guidelines'), (5094437, u'ducksboard', u'gridster.js'), (14454268, u'briangonzalez', u'jquery.adaptive-backgrounds.js'), (50301368, u'p-e-w', u'maybe'), (5894096, u'mdo', u'code-guide'), (20167283, u'sbstjn', u'timesheet.js'), (21109196, u'PureLayout', u'PureLayout'), (55026106, u'huluoyang', u'freecodecamp.cn'), (48175620, u'google', u'agera'), (16466596, u'michaelvillar', u'dynamics.js'), (11423758, u'mame', u'quine-relay'), (12977854, u'breach', u'breach_core'), (25212911, u'davidsonfellipe', u'awesome-wpo'), (63484632, u'facebookresearch', u'fastText'), (3605299, u'baconjs', u'bacon.js'), (51994692, u'geeeeeeeeek', u'electronic-wechat')]

In [None]:
import json
# Retry the scrape for every repo in error_list; whatever fails again with an
# IndexError is collected in error_list2.
error_list2 = []
repo_concrete_data = {}
for ID, username, reponame in error_list:
    try:
        repo_concrete_data[ID] = scrape_repo_info(username, reponame)
    except IndexError:
        error_list2.append((ID, username, reponame))

# Only the repos retried in this run are written to this file.
with open('repo_concrete_data_json2.txt', 'w') as ff:
    json.dump(repo_concrete_data, ff, sort_keys=True, indent=4, separators=(',', ': '))

# error_list is replaced by the still-failing repos, so re-running this cell
# retries only those.
error_list = error_list2
# print(...) with a single argument is valid on both Python 2 and 3.
print(len(error_list))
print(error_list)

In [None]:
# List each failing repo once; set() removes the duplicate tuples.
for ID, username, reponame in set(error_list):
    # Same text as the old print statement, valid on Python 2 and 3.
    print('%s %s %s' % (ID, username, reponame))

In [None]:
"""
Combining the data to a giant json file
"""
import json

# (key in the combined record, key in the master record) for every field
# that is copied over unchanged.
COPIED_FIELDS = [
    ('id', 'id'),
    ('name', 'name'),
    ('created_at', 'created_at'),
    ('description', 'description'),
    ('forks_count', 'forks_count'),
    ('has_downloads', 'has_downloads'),
    ('has_issues', 'has_issues'),
    ('has_pages', 'has_pages'),
    ('has_wiki', 'has_wiki'),
    ('primary_language', 'language'),
    ('open_issues_count', 'open_issues_count'),
    ('size', 'size'),
    ('stars_count', 'stargazers_count'),
    ('watchers_count', 'watchers_count'),
]

# Combined records, keyed by repo id; later cells add more fields to them.
all_data = {}
with open('repo_master_data_json.txt', 'r') as master_file:
    master_data = json.load(master_file)['data']
    for repo in master_data:
        data = dict((target, repo[source]) for target, source in COPIED_FIELDS)
        # Only whether a homepage is set is kept, not the URL itself.
        data['has_homepage'] = bool(repo['homepage'])
        all_data[repo['id']] = data

In [None]:
# Attach the per-language data to every combined record.
with open('repo_languages_json.txt', 'r') as f:
    language_data = json.load(f)

# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for ID, repo in all_data.items():
    # JSON object keys are strings, hence str() around the integer repo id.
    key = str(ID)
    # A repo missing from the languages file gets an empty mapping, in line
    # with the other merge cells, which also fall back to a default instead
    # of raising KeyError.
    repo['languages'] = language_data[key]['languages'] if key in language_data else {}

In [None]:
# Attach the scraped contributor records to every combined record and note
# which repos have none.
with open('repo_concrete_contributors_json.txt', 'r') as f:
    contributors_data = json.load(f)

missing_list = []  # (repo id as str, repo name) of repos without contributor data
# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for ID, repo in all_data.items():
    # JSON object keys are strings, hence str() around the integer repo id.
    key = str(ID)
    if key in contributors_data:
        repo['contributors'] = contributors_data[key]
    else:
        repo['contributors'] = []
        missing_list.append((key, repo['name']))

In [None]:
# Attach the scraped page counters to every combined record.
with open('repo_concrete_data_json.txt', 'r') as f:
    other_data = json.load(f)

# (key in the combined record, key in the scraped record).
count_fields = [
    ('branches_count', 'branches'),
    ('commits_count', 'commits'),
    ('contributors_count', 'contributors'),
    ('projects_count', 'projects'),
    ('pull_requests_count', 'pull_requests'),
    ('release_count', 'release'),
]

# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for ID, repo in all_data.items():
    # JSON object keys are strings, hence str() around the integer repo id.
    key = str(ID)
    scraped = other_data[key] if key in other_data else None
    for target, source in count_fields:
        # -1 marks "no scraped data for this repo".
        repo[target] = scraped[source] if scraped is not None else -1

In [None]:
# Attach the issue comments to every combined record.
with open('repo_issue_comments_json.txt', 'r') as f:
    comments_data = json.load(f)

# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for ID, repo in all_data.items():
    # JSON object keys are strings, hence str() around the integer repo id.
    key = str(ID)
    # NOTE(review): despite the '_count' suffix this stores the list found
    # under 'comments' (default: empty list), not a number - confirm what
    # the consumers of all_data expect.
    repo['issue_comments_count'] = comments_data[key]['comments'] if key in comments_data else []

In [None]:
# Attach the number of milestones to every combined record.
with open('repo_milestones_json.txt', 'r') as f:
    milestones_data = json.load(f)

# .items() instead of .iteritems() so the loop runs on Python 2 and 3 alike.
for ID, repo in all_data.items():
    # JSON object keys are strings, hence str() around the integer repo id.
    key = str(ID)
    if key in milestones_data:
        repo['milestones_count'] = len(milestones_data[key]['milestones'])
    else:
        # -1 marks "no milestone data for this repo".
        repo['milestones_count'] = -1

In [None]:
# Preview two combined records; list() is needed because dict.values() is a
# non-sliceable view on Python 3.
list(all_data.values())[:2]

In [None]:
# Save the combined records as one JSON object keyed by repo id.
with open('all_data.json', 'w') as ff:
    serialized = json.dumps(all_data, sort_keys=True, indent=4, separators=(',', ': '))
    ff.write(serialized)

In [None]:
# Save the combined records as a JSON array (without the repo-id keys).
with open('all_data_array.json', 'w') as ff:
    # list(): on Python 3 dict.values() is a view object, which json cannot
    # serialize.
    records = list(all_data.values())
    ff.write(json.dumps(records, sort_keys=True, indent=4, separators=(',', ': ')))

**INSIGHTS WE PLAN ON DISPLAYING**
   - What language the most popular repositories use
   - Most used language for these repositories
   - When languages die: the creation timestamps of the repositories that use each language
   - Is there a correlation between stars and number of commits?
   - Is there a correlation between stars and number of contributors?
   - Is there a correlation between number of contributors and number of commits?