## 1. Imports

In [1]:
import os
import pandas as pd
from github import Github
from credentials import username, password
from githubAPI_scrape import scrapeAPI

## 2. Storing the API data

The goal is to get the data into a .csv file with the columns: Pair, Date, Member_1, Member_2, Total.

Note that each row represents one day of work: for a date X it shows how many commits each of the two pair members made. If neither member made any commits on a date, no row is included for that date.

In [2]:
# Log in to the GitHub API using the helpers from the local `credentials` module.
# NOTE(review): GitHub may reject plain password logins for API access --
# verify, and prefer a personal access token if so.
g = Github(
    username(),
    password(),
)

# Repositories associated with the authenticated profile; per the original
# author's note this iterable also includes organization repositories.
authenticated_user = g.get_user()
repos = authenticated_user.get_repos()

In [3]:
# Load the scraped records into a frame; the columns 'Pair', 'Date', 'Author'
# and 'Author_Email' are the ones used below.
# presumably one row per commit -- TODO confirm against githubAPI_scrape
df_data = pd.DataFrame(scrapeAPI(repos))
all_unique_pairs = df_data['Pair'].unique()

# Number of records per (Pair, Date, Author), counted via the non-null
# 'Author_Email' entries. Counting that one column directly avoids running
# describe() over every column just to keep a single statistic, and does not
# depend on which column dtypes describe() chooses to summarise.
grouped_data = (
    df_data
    .groupby(by=['Pair', 'Date', 'Author'])['Author_Email']
    .count()
    .reset_index(name='count')
)

In [4]:
# Collect one [pair, date, commits_member_1, commits_member_2] row per pair and
# date. Only pairs with exactly two distinct authors are handled here; every
# other pair is reported on stdout and left for manual treatment.
aggregated_list = []

for pair in all_unique_pairs:
    df_pair = grouped_data.loc[grouped_data['Pair'] == pair]
    members = df_pair['Author'].unique()
    n_members = len(members)

    if n_members > 2:
        print('\nIssue! More than 2 pair members with pair', pair)
        print('Members: ', members)
        continue

    if n_members < 2:
        print('\nIssue! Less than 2 pair members with pair', pair)
        print('Members: ', members)
        continue

    # Exactly two authors: Member_1 / Member_2 follow the order of `members`.
    first_member, second_member = members
    for date in df_pair['Date'].unique():
        df_pair_date = df_pair.loc[df_pair['Date'] == date]
        authors_on_date = df_pair_date['Author']
        m_1 = df_pair_date.loc[authors_on_date == first_member, 'count'].sum()
        m_2 = df_pair_date.loc[authors_on_date == second_member, 'count'].sum()
        aggregated_list.append([pair, date, m_1, m_2])


Issue! More than 2 pair members with pair 11
Members:  ['justinalittlefield' 'AER-Stud' 'kajiewang']

Issue! More than 2 pair members with pair 18
Members:  ['Steefkuh' 'diederikpel' 'dtpel']

Issue! Less than 2 pair members with pair 19
Members:  ['Chri5thiano']

Issue! More than 2 pair members with pair 8
Members:  ['invalid-email-address' 'willemsjob' 'RobHoogma']


## 2.1. Addressing the above issues manually

- Pair 18: user 'dtpel' will contain the commits of 'diederikpel'

- Pair 8: user 'RobHoogma' will contain the commits of 'invalid-email-address'

- Pair 19: user 'Chri5thiano' is the only author, so they will contain all commits as Member_1; Member_2 will have 0 commits as a formality

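- Pair 11: the commits of user 'AER-Stud' will be distributed among the other two members (note: the code below currently adds all of them to Member_2, together with the commits of 'kajiewang')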

In [5]:
# Pair 19
#
# Manual fix: 'Chri5thiano' is the only author found for this pair, so all
# commits are recorded under Member_1 and Member_2 is fixed at 0.

pair = 19
df_pair = grouped_data[grouped_data['Pair'] == pair]
members = df_pair['Author'].unique()

# Fail loudly if the scraped data changed. With a second author the pair would
# already have been aggregated by the main loop, and adding these rows as well
# would duplicate it.
assert list(members) == ['Chri5thiano'], members

custom_p19 = []
for date in df_pair['Date'].unique():
    df_pair_date = df_pair[df_pair['Date'] == date]
    m_1 = df_pair_date.loc[df_pair_date['Author'] == 'Chri5thiano', 'count'].sum()
    m_2 = 0  # no second member committed; kept as a placeholder column value
    custom_p19.append([pair, date, m_1, m_2])

# Drop any rows for this pair left over from an earlier run of this cell, so
# re-running it does not add the same rows twice (a no-op on the first run).
aggregated_list[:] = [row for row in aggregated_list if row[0] != pair]
aggregated_list += custom_p19

In [6]:
# Pair 18
#
# Manual fix: commits authored as 'diederikpel' are credited to 'dtpel', so
# both names count towards Member_2; 'Steefkuh' is Member_1.

pair = 18
df_pair = grouped_data[grouped_data['Pair'] == pair]
members = df_pair['Author'].unique()

# Select authors by name rather than by position in `members`: the position
# depends on the order in which authors first appear in the data, so a changed
# order would silently credit commits to the wrong member.
member_1_authors = ['Steefkuh']
member_2_authors = ['diederikpel', 'dtpel']

# Fail loudly if the authors of this pair no longer match the manual mapping.
assert set(members) == set(member_1_authors + member_2_authors), members

custom_p18 = []
for date in df_pair['Date'].unique():
    df_pair_date = df_pair[df_pair['Date'] == date]
    # A member without commits on this date selects no rows and sums to 0.
    m_1 = df_pair_date.loc[df_pair_date['Author'].isin(member_1_authors), 'count'].sum()
    m_2 = df_pair_date.loc[df_pair_date['Author'].isin(member_2_authors), 'count'].sum()
    custom_p18.append([pair, date, m_1, m_2])

# Drop any rows for this pair left over from an earlier run of this cell, so
# re-running it does not add the same rows twice (a no-op on the first run).
aggregated_list[:] = [row for row in aggregated_list if row[0] != pair]
aggregated_list += custom_p18

In [7]:
# Pair 8
#
# Manual fix: commits authored as 'invalid-email-address' are credited to
# 'RobHoogma', so both names count towards Member_1; 'willemsjob' is Member_2.

pair = 8
df_pair = grouped_data[grouped_data['Pair'] == pair]
members = df_pair['Author'].unique()

# Select authors by name rather than by position in `members`: the position
# depends on the order in which authors first appear in the data, so a changed
# order would silently credit commits to the wrong member.
member_1_authors = ['invalid-email-address', 'RobHoogma']
member_2_authors = ['willemsjob']

# Fail loudly if the authors of this pair no longer match the manual mapping.
assert set(members) == set(member_1_authors + member_2_authors), members

custom_p8 = []
for date in df_pair['Date'].unique():
    df_pair_date = df_pair[df_pair['Date'] == date]
    # A member without commits on this date selects no rows and sums to 0.
    m_1 = df_pair_date.loc[df_pair_date['Author'].isin(member_1_authors), 'count'].sum()
    m_2 = df_pair_date.loc[df_pair_date['Author'].isin(member_2_authors), 'count'].sum()
    custom_p8.append([pair, date, m_1, m_2])

# Drop any rows for this pair left over from an earlier run of this cell, so
# re-running it does not add the same rows twice (a no-op on the first run).
aggregated_list[:] = [row for row in aggregated_list if row[0] != pair]
aggregated_list += custom_p8

In [8]:
# Pair 11
#
# Manual fix: 'justinalittlefield' is Member_1; the commits of 'AER-Stud' are
# added to those of 'kajiewang' under Member_2.
# NOTE(review): the notebook text says the 'AER-Stud' commits are "distributed
# among the other two members", but they are all credited to Member_2 here
# (unchanged from the previous behaviour) -- confirm which one is intended.

pair = 11
df_pair = grouped_data[grouped_data['Pair'] == pair]
members = df_pair['Author'].unique()

# Select authors by name rather than by position in `members`: the position
# depends on the order in which authors first appear in the data, so a changed
# order would silently credit commits to the wrong member.
member_1_authors = ['justinalittlefield']
member_2_authors = ['AER-Stud', 'kajiewang']

# Fail loudly if the authors of this pair no longer match the manual mapping.
assert set(members) == set(member_1_authors + member_2_authors), members

custom_p11 = []
for date in df_pair['Date'].unique():
    df_pair_date = df_pair[df_pair['Date'] == date]
    # A member without commits on this date selects no rows and sums to 0.
    m_1 = df_pair_date.loc[df_pair_date['Author'].isin(member_1_authors), 'count'].sum()
    m_2 = df_pair_date.loc[df_pair_date['Author'].isin(member_2_authors), 'count'].sum()
    custom_p11.append([pair, date, m_1, m_2])

# Drop any rows for this pair left over from an earlier run of this cell, so
# re-running it does not add the same rows twice (a no-op on the first run).
aggregated_list[:] = [row for row in aggregated_list if row[0] != pair]
aggregated_list += custom_p11

In [9]:
# Assemble the collected [pair, date, member_1, member_2] rows into one table.
cols = ['Pair', 'Date', 'Member_1', 'Member_2']

# A stable sort keeps the rows of each pair in the order they were collected;
# the default quicksort is not stable and may reorder rows sharing a Pair.
df_aggregated_member = (
    pd.DataFrame(aggregated_list, columns=cols)
    .sort_values(by='Pair', kind='mergesort')
)

# Combined commits of both members for that pair and date.
df_aggregated_member['Total'] = df_aggregated_member['Member_1'] + df_aggregated_member['Member_2']

In [10]:
# Write the table to ../data/ (relative to the current working directory).
# The target path is passed to to_csv directly instead of calling os.chdir():
# changing the working directory is a lasting side effect, and re-running the
# cell would then resolve '../data/' from the already-changed directory.
output_path = os.path.join('..', 'data', 'ha_1_pairs_aggregated.csv')
df_aggregated_member.to_csv(output_path, index=False)