## 1. Imports

In [2]:
import os
import pandas as pd
import progressbar as pbar
from credentials import username, password
from github import Github

## 2. Github authentication

The login details for GitHub are imported from a separate file for security reasons. The credentials file is a Python script and has to be located in the /scripts directory. It defines two functions, username() and password(), each of which must return only the username or the password, respectively, as a string.

In [3]:
# Log in to GitHub with the credentials supplied by the local `credentials` module.
g = Github(username(), password())

# Iterable over every repository associated with the logged-in profile,
# including repositories that belong to organizations.
account = g.get_user()
repos = account.get_repos()

## 3. Assignment and staff details

A string variable holds the assignment name to look for on GitHub. The pair number is not included, so the repositories of all pairs are scraped at once. To retrieve the pair number by indexing, the length of the string is needed; because I want the result to include the word "pair", the length is taken of a slightly shortened string. Finally, a list contains the GitHub usernames of all the staff; it is needed so that staff are excluded as collaborators and commits made by staff are not counted in the student repositories.

In [7]:
# Name prefix shared by every pair repository of this assignment (pair number left off).
ha_1 = '2is50-2019-2020-homework-assignment-1-pair-'

# Index at which the "pair-<number>" part of such a repository name starts.
ha_len = len(ha_1) - len('pair-')

# GitHub logins of the course staff; used to tell staff apart from students.
staff_ids = [
    'sakce',
    'kvidelov',
    'mackees',
    'sakehl',
    'thatmariia',
    'wstomv',
    'sansteTUe',
    'HDylanTV',
    'gzwaan',
    'dmarinissen',
]

## 4. Scraping the API

### Useful commands

- repo.get_commits() returns all commit logs as an iterable

- repo.get_collaborators() returns all collaborators of that repo as an iterable

- commit.author returns information about the commit author

- commit.author.login returns the commit author username


### 4.1. Commit Authors

There are three possible author cases: the commit is made with a student account, with a staff account, or with an anonymous account. In the third case commit.author returns None, as far as I know; in the other two cases it returns the account. To get just the username I use commit.author.login, which cannot be used on None.

In [22]:
def filterCollaborators(repository, staff=None):
    """Return the logins of a repository's collaborators that are not staff.

    Parameters
    ----------
    repository
        Object whose ``get_collaborators()`` yields items with a ``login`` attribute.
    staff : iterable of str, optional
        Logins to leave out. When omitted, the notebook-level ``staff_ids`` is used.

    Returns
    -------
    list of str
        Logins of the remaining collaborators, in the order they were yielded.
    """
    # Resolve the default at call time so later edits to `staff_ids` are picked up.
    excluded = set(staff_ids if staff is None else staff)

    return [
        collaborator.login
        for collaborator in repository.get_collaborators()
        if collaborator.login not in excluded
    ]

In [58]:
# Parallel lists, one entry per kept commit; they become the table columns later on.
pair_numb = []
authors = []
emails = []
date_time = []
commit_messages = []

for repo in repos:  # every repository associated with the login account

    # The pair number is sliced off right after the prefix, so the prefix must
    # sit at the very start of the name; a match elsewhere would mis-slice.
    if not repo.name.startswith(ha_1):
        continue  # not an assignment repository

    try:
        pair_id = int(repo.name[len(ha_1):])  # pair number from the repository name
    except ValueError:
        # The suffix is not a plain number; skip it instead of aborting the whole scrape.
        print('Skipping repository with unexpected name: ', repo.name)
        continue

    for commit in repo.get_commits():  # logs of each commit in the repo

        git_author = commit.commit.author  # author as recorded in the git commit itself
        commit_date = git_author.date  # date and time of the commit

        if commit.author is None:
            # No GitHub account attached to the commit: treated as anonymous.
            author = 'Anonymous'
            print(commit_date, ' Anonymous commit - Pair: ', pair_id)
        elif commit.author.login in staff_ids:
            continue  # staff commits are not counted
        else:
            author = commit.author.login  # commit made by a student account

        pair_numb.append(pair_id)
        date_time.append(commit_date)
        authors.append(author)  # author username
        emails.append(git_author.email)  # author email
        commit_messages.append(commit.commit.message)  # commit message

2020-05-14 06:00:19  Anonymous commit - Pair:  28
2020-05-13 04:04:19  Anonymous commit - Pair:  28
2020-05-11 07:47:32  Anonymous commit - Pair:  28
2020-05-11 01:06:16  Anonymous commit - Pair:  28
2020-05-06 01:45:38  Anonymous commit - Pair:  28
2020-05-06 01:38:14  Anonymous commit - Pair:  28
2020-05-05 02:23:00  Anonymous commit - Pair:  28
2020-05-05 00:49:08  Anonymous commit - Pair:  28
2020-05-04 19:26:12  Anonymous commit - Pair:  28
2020-05-04 19:23:42  Anonymous commit - Pair:  28
2020-05-04 19:23:37  Anonymous commit - Pair:  28
2020-05-04 19:20:46  Anonymous commit - Pair:  28
2020-05-04 19:17:04  Anonymous commit - Pair:  28
2020-05-01 15:14:41  Anonymous commit - Pair:  28
2020-05-01 14:56:41  Anonymous commit - Pair:  28
2020-05-14 10:13:14  Anonymous commit - Pair:  29
2020-05-14 09:11:18  Anonymous commit - Pair:  29
2020-05-14 09:09:51  Anonymous commit - Pair:  29
2020-05-14 08:01:18  Anonymous commit - Pair:  29
2020-05-13 14:34:39  Anonymous commit - Pair:  29


In [59]:
# Bundle the parallel lists into one mapping: column name -> list of values.
api_data = dict(
    Pair=pair_numb,
    Author=authors,
    Author_Email=emails,
    Date=date_time,
    Commit_Message=commit_messages,
)

In [60]:
# Show the collected commit data as a table, one column per key of `api_data`.
pd.DataFrame(api_data)

Unnamed: 0,Pair,Author,Author_Email,Date,Commit_Message
0,10,GiuliaFliescher,g.fliescher@student.tue.nl,2020-05-17 13:40:17,changed visual layout
1,10,GiuliaFliescher,g.fliescher@student.tue.nl,2020-05-17 13:39:51,removed color oprions we dont use
2,10,GiuliaFliescher,g.fliescher@student.tue.nl,2020-05-17 13:13:38,unintentional changes
3,10,GiuliaFliescher,g.fliescher@student.tue.nl,2020-05-17 13:13:06,tried to fix SaveImage function
4,10,GiuliaFliescher,g.fliescher@student.tue.nl,2020-05-17 13:02:50,imported numba and jit
...,...,...,...,...,...
1695,90,Loebna,loebnasabbah@hotmail.com,2020-05-05 10:54:56,Added name + date
1696,90,LinTaw,l.tawafra@student.tue.nl,2020-05-05 10:37:21,Contributor 2 name
1697,90,LinTaw,l.tawafra@student.tue.nl,2020-05-05 10:36:19,Merge remote-tracking branch 'origin/master'\n...
1698,90,LinTaw,l.tawafra@student.tue.nl,2020-05-05 10:30:06,Contributor 2 name


In [23]:
# Scratch check: grab the commits of the first repository in the listing.
first_repo = repos[0]
coms = first_repo.get_commits()

In [33]:
coms[0].commit.author.date  # author timestamp of the first commit in `coms`

datetime.datetime(2020, 4, 28, 15, 4, 37)

In [50]:
api_data  # display the raw dict of column lists

2020, 5, 13, 18, 24, 23),
  datetime.datetime(2020, 5, 11, 7, 44, 7),
  datetime.datetime(2020, 5, 11, 7, 30, 53),
  datetime.datetime(2020, 5, 11, 7, 30, 44),
  datetime.datetime(2020, 5, 7, 9, 21, 47),
  datetime.datetime(2020, 5, 7, 9, 21, 7),
  datetime.datetime(2020, 5, 7, 8, 53, 47),
  datetime.datetime(2020, 5, 6, 12, 54, 3),
  datetime.datetime(2020, 5, 5, 15, 18, 35),
  datetime.datetime(2020, 5, 4, 14, 25, 49),
  datetime.datetime(2020, 5, 4, 14, 5, 21),
  datetime.datetime(2020, 5, 4, 9, 47, 1),
  datetime.datetime(2020, 5, 4, 8, 25, 5),
  datetime.datetime(2020, 5, 3, 11, 13, 55),
  datetime.datetime(2020, 5, 1, 9, 42, 22),
  datetime.datetime(2020, 5, 1, 9, 38, 19),
  datetime.datetime(2020, 5, 1, 9, 36, 31),
  datetime.datetime(2020, 5, 1, 8, 10, 1),
  datetime.datetime(2020, 5, 1, 8, 10),
  datetime.datetime(2020, 5, 1, 8, 9, 59),
  datetime.datetime(2020, 5, 16, 16, 47, 35),
  datetime.datetime(2020, 5, 14, 12, 41, 41),
  datetime.datetime(2020, 5, 14, 12, 40, 19),
  da