In [12]:
import json
import os.path
import time
import requests
from io import StringIO
import pandas as pd

# This notebook creates `repo.csv` from scraped GitHub data.

In [11]:
# Read the two profile spreadsheets (both carry a `github_urls` column that is
# used later).  NOTE(review): "de"/"ds" presumably stand for data engineers and
# data scientists — confirm.
de_github, ds_github = map(
    pd.read_excel,
    ('de_with_github.xlsx', 'ds_with_github.xlsx'),
)

In [13]:
# Profile columns retained for the rest of the notebook; everything else in
# the spreadsheets is dropped.
to_keep = [
    'fullName',
    'description',
    'company',
    'jobTitle',
    'location',
    'school',
    'schoolDegree',
    'allSkills',
    'github_urls',
]

In [14]:
# Stack both profile tables (ds first, then de) under a fresh 0..n-1 index and
# keep only the columns listed in `to_keep`.
df = pd.concat(
    [ds_github, de_github],
    ignore_index=True,
)[to_keep]

In [15]:
# Replace missing GitHub URLs with a sentinel string.  The filled column is
# assigned back explicitly: calling fillna(inplace=True) on the `df[...]`
# selection is chained assignment, which warns in pandas 2.x and leaves `df`
# untouched once Copy-on-Write is enabled.
df['github_urls'] = df['github_urls'].fillna("not available")

In [16]:
class GithubScarper():
    """Collect a GitHub user's display name and public repositories.

    Data is read from the GitHub REST API endpoints ``/users/{username}`` and
    ``/users/{username}/repos``.  GitHub rate-limits unauthenticated clients,
    so scraping many users in a row benefits from an access token.

    Parameters
    ----------
    username : str
        GitHub login whose profile and repositories are fetched.
    auth_token : str, optional
        Personal access token sent as a ``Bearer`` credential.  Read it from
        the environment instead of hard-coding it in the notebook.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 10).
    session : object, optional
        Any object exposing ``get(url, params=..., headers=..., timeout=...)``
        and returning a response with ``raise_for_status()`` and ``json()``
        (for example a ``requests.Session``).  Defaults to the ``requests``
        module.

    Attributes
    ----------
    userURL, repoURL : str
        API endpoints derived from ``username``.
    useRepos : dict
        Set by ``_get_repo_stats``; maps repository name to its kept fields.
    userData : dict
        Set by ``get_user_stats``; the kept profile fields plus ``'repos'``.
    """

    # Keys copied from the API payloads; everything else is discarded.
    USER_FIELDS = ('name',)
    REPO_FIELDS = ('name', 'description', 'language')
    # Page size requested from the repository listing endpoint.
    PER_PAGE = 100

    def __init__(self, username, auth_token=None, timeout=10, session=None):
        self.username = username
        self.userURL = f'https://api.github.com/users/{self.username}'
        self.repoURL = f'https://api.github.com/users/{self.username}/repos'
        self.timeout = timeout
        self._headers = {'Accept': 'application/vnd.github+json'}
        if auth_token:
            self._headers['Authorization'] = f'Bearer {auth_token}'
        # Fall back to the module-level `requests` API when no session is given.
        self._http = session if session is not None else requests

    def _get_json(self, url, params=None):
        """GET ``url`` and return the decoded JSON body.

        Raises whatever ``raise_for_status()`` raises for a 4xx/5xx response
        (``requests.HTTPError`` with the default client), so a rate-limit or
        not-found reply fails here instead of being parsed as data.
        """
        response = self._http.get(
            url, params=params, headers=self._headers, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _save_json(self, filename, json_data):
        """Write ``json_data`` to ``{filename}.json`` in the working directory."""
        with open(f'{filename}.json', 'w', encoding='utf-8') as fp:
            json.dump(json_data, fp, indent=True)

    def _get_repo_stats(self):
        """Return ``{repo_name: {name, description, language}}`` for the user.

        Walks every page of the repository listing, not just the first one.
        The result is also stored on ``self.useRepos``.

        Raises
        ------
        ValueError
            If the API answers with something other than a JSON list.
        """
        self.useRepos = {}
        page = 1
        while True:
            batch = self._get_json(
                self.repoURL, params={'per_page': self.PER_PAGE, 'page': page}
            )
            if not isinstance(batch, list):
                detail = batch.get('message') if isinstance(batch, dict) else batch
                raise ValueError(
                    f'Unexpected repository payload for {self.username!r}: {detail!r}'
                )
            for raw_repo in batch:
                self.useRepos[raw_repo['name']] = {
                    key: value
                    for key, value in raw_repo.items()
                    if key in self.REPO_FIELDS
                }
            # A short (or empty) page means there is nothing left to fetch.
            if len(batch) < self.PER_PAGE:
                break
            page += 1

        return self.useRepos

    def get_user_stats(self):
        """Fetch the profile and repositories and return them as a JSON string.

        Side effects: sets ``self.userData`` (and ``self.useRepos``) and
        overwrites ``output_of_User.json`` in the working directory.
        """
        # The API returns far more profile fields than are needed here.
        profile = self._get_json(self.userURL)
        self.userData = {
            key: value for key, value in profile.items() if key in self.USER_FIELDS
        }
        self.userData['repos'] = self._get_repo_stats()

        self._save_json('output_of_User', self.userData)

        return json.dumps(self.userData, indent=True)

In [19]:
# Preview the first 20 real GitHub URLs, i.e. rows whose value is not the
# "not available" placeholder.
df.loc[df['github_urls'] != 'not available', 'github_urls'].values[:20]

array(['https://github.com/tkhaxton', 'https://github.com/NatalieNunez',
       'https://github.com/BrooksDiwu', 'https://github.com/sanjsvk',
       'https://github.com/larissalong', 'https://github.com/anil10iitr',
       'https://github.com/viveksnh', 'https://github.com/asy1113',
       'https://github.com/prasadpagade', 'https://github.com/bkhurley',
       'https://github.com/shwetashambhavi',
       'https://github.com/victorhunguyen',
       'https://github.com/setuparekh', 'https://github.com/meghaarora42',
       'https://github.com/ARiccGitHub', 'https://github.com/mitosborn',
       'https://github.com/lucas-reynolds',
       'https://github.com/Kiranbrar', 'https://github.com/YeonjooSmith',
       'https://github.com/julianeagu'], dtype=object)

# Note:  
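- `GithubScarper` crashes when it is fed too many GitHub usernames (the `token` values) at one time — presumably because the GitHub API rate limit is exceeded. For this project, GitHub data was therefore collected manually for roughly 50 GitHub users chosen from the LinkedIn profile data.
- The output of `GithubScarper.get_user_stats()` is a JSON string.
- `repo.csv` is saved so that it can be imported into the graph database.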

In [22]:
# Scrape a single user as a worked example.  get_user_stats() performs the HTTP
# requests, overwrites output_of_User.json, and returns the result as a JSON
# string, which is decoded back into a dict named after the user.
git_scraper = GithubScarper('NatalieNunez')
git_data = git_scraper.get_user_stats()
NatalieNunez = json.loads(git_data)
# Display the decoded dict: {'name': ..., 'repos': {repo_name: {...}}}.
NatalieNunez

{'name': 'Natalie Nunez',
 'repos': {'c0321-code-solutions': {'name': 'c0321-code-solutions',
   'description': None,
   'language': 'JavaScript'},
  'code-journal': {'name': 'code-journal',
   'description': 'A dynamic HTML, CSS, and JavaScript journal application for coders who want to capture their notes.',
   'language': 'JavaScript'},
  'CodeSignal': {'name': 'CodeSignal',
   'description': None,
   'language': 'TypeScript'},
  'crackin-jokes': {'name': 'crackin-jokes',
   'description': 'A dynamic HTML, CSS, and JavaScript solo project',
   'language': 'JavaScript'},
  'edabit': {'name': 'edabit', 'description': None, 'language': 'JavaScript'},
  'JSPractice': {'name': 'JSPractice', 'description': None, 'language': None},
  'meme-gallery': {'name': 'meme-gallery',
   'description': 'An interactive HTML, CSS, and JavaScript app',
   'language': 'JavaScript'},
  'modern-javascript-2020': {'name': 'modern-javascript-2020',
   'description': None,
   'language': 'HTML'},
  'NatalieNu

In [233]:
# NOTE(review): `git_content` is read by the repo-flattening loop further down,
# but this commented-out assignment is its only definition in the notebook, so
# that loop raises NameError on a fresh kernel.  Each name in the list is
# presumably a dict built the same way as `NatalieNunez` above (json.loads of
# GithubScarper(...).get_user_stats()) — TODO confirm and restore.  The order
# has to match `token_names`, which the loop indexes by position.
# git_content = [NatalieNunez,tkhaxton,zhengweifz,kewalkothari,nickweimer,elainew96,
#      tongxinguo,WesCodes,jeremyrchow,christyono,hackcoderr,akan72,agorina91,
#      harshitagirase,shreyasgokhale,mmkvdev,kavitha89,zhengweifz,rabujamra,FalconiNicasio,
#     shbhamdbey,nithish08,davidjayjackson,alexwcheng,nestorghh,yuecchen,Sanil15,shbhamdbey,
#     colesobel,Avinash998,hackcoderr,sudheera96,codingkohli,rohithreddy024,xizhang,alexseong,venkateshnanduri,
#     chandlerphelps,adwaitsathe,Nehan95,davidjayjackson,adrianavesa,rahuldkjain,Williamovero,jinsongwei,raghuncstate,
#     raj1603chdry,dattatele,HarshaliWagh,ivangoldov]

In [213]:
# GitHub usernames of the manually selected users.  Later cells read
# `token_names`, so the list is kept as live code instead of a comment;
# otherwise those cells fail with NameError on a fresh kernel.
# Order matters: entry k labels the k-th element of `git_content`.
# NOTE(review): 'zhengweifz', 'shbhamdbey', 'hackcoderr' and 'davidjayjackson'
# each appear twice, so their repositories are emitted twice downstream —
# confirm whether that is intended before de-duplicating.
token_names = [
    'NatalieNunez', 'tkhaxton', 'zhengweifz', 'kewalkothari', 'nickweimer', 'elainew96',
    'tongxinguo', 'WesCodes', 'jeremyrchow', 'christyono', 'hackcoderr', 'akan72', 'agorina91',
    'harshitagirase', 'shreyasgokhale', 'mmkvdev', 'kavitha89', 'zhengweifz', 'rabujamra', 'FalconiNicasio',
    'shbhamdbey', 'nithish08', 'davidjayjackson', 'alexwcheng', 'nestorghh', 'yuecchen', 'Sanil15', 'shbhamdbey',
    'colesobel', 'Avinash998', 'hackcoderr', 'sudheera96', 'codingkohli', 'rohithreddy024', 'xizhang', 'alexseong', 'venkateshnanduri',
    'chandlerphelps', 'adwaitsathe', 'Nehan95', 'davidjayjackson', 'adrianavesa', 'rahuldkjain', 'Williamovero', 'jinsongwei', 'raghuncstate',
    'raj1603chdry', 'dattatele', 'HarshaliWagh', 'ivangoldov',
]

In [231]:
# Two-column view pairing each person with their GitHub username.
# Requires the `token` column, which is added to `df` by the cell that parses
# `github_urls` (it appears later in this notebook).
fullname_with_token = df.loc[:, ['fullName', 'token']]
fullname_with_token

Unnamed: 0,fullName,token
0,Tom Haxton,tkhaxton
1,Navata M.,not available
2,Austin Lin Gibbons,not available
3,Mike Woodward,not available
4,Tanuja Addanki,not available
...,...,...
447,Rakshitha J,not available
448,Karthik Inuganti,not available
449,Seho Kim,SeHo-Kim
450,Datta Tele,dattatele


In [None]:
# Pair each entry of `token_names` with the matching entry of `tokens`, one
# row per pair.  Fixes two errors in the original call: the class is
# `pd.DataFrame` (not `pd.Dataframe`), and an empty `columns` list cannot label
# two-column data.
# NOTE(review): `tokens` is not defined in any visible cell, and the column
# names below simply mirror the two zipped variables — confirm both.
token_df = pd.DataFrame(
    list(zip(token_names, tokens)),
    columns=['token_names', 'tokens'],
)

In [228]:
# Derive the GitHub username ("token") from each profile URL: drop any trailing
# slash, then keep the text after the last '/'.  Without the strip, a URL such
# as 'https://github.com/user/' would yield an empty token.  The
# "not available" placeholder contains no '/', so it passes through unchanged.
df['token'] = (
    df['github_urls']
    .str.rstrip('/')
    .str.rsplit('/', n=1)
    .str[-1]
)

In [133]:
# Outer-join the profile table with `git_df` on the shared `token` column, so
# rows present on only one side are kept.
# NOTE(review): `git_df` is not created in any visible cell — confirm where it
# comes from.
df_with_git_token = pd.merge(df, git_df, how='outer', on='token')

In [253]:
# Parallel column accumulators: one entry per repository across all users.
repoName, description, language, repoOwnerToken = [], [], [], []

# `git_content` and `token_names` are aligned by position: the payload at
# index k belongs to the username at index k.
for owner_idx, user_payload in enumerate(git_content):
    for repo_info in user_payload['repos'].values():
        repoName.append(repo_info['name'])
        description.append(repo_info['description'])
        language.append(repo_info['language'])
        repoOwnerToken.append(token_names[owner_idx])

In [255]:
# Assemble the flattened repository table, one row per repository, and label
# the default integer index `repoID`.
repo_rows = zip(repoOwnerToken, repoName, description, language)
repo_df = pd.DataFrame(
    list(repo_rows),
    columns=['token', 'repoName', 'description', 'language'],
).rename_axis('repoID')
#repo_df.to_csv('repo.csv')

In [256]:
# Sanity check: (number of repositories, number of columns).
repo_df.shape

(834, 4)

In [257]:
# Preview the first five rows of the flattened repository table.
repo_df.head()

Unnamed: 0_level_0,token,repoName,description,language
repoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,NatalieNunez,c0321-code-solutions,,JavaScript
1,NatalieNunez,code-journal,"A dynamic HTML, CSS, and JavaScript journal ap...",JavaScript
2,NatalieNunez,CodeSignal,,TypeScript
3,NatalieNunez,crackin-jokes,"A dynamic HTML, CSS, and JavaScript solo project",JavaScript
4,NatalieNunez,edabit,,JavaScript
