# Data Scraping

In [5]:
# This script requires github3.py version 0.9.6
# pip install github3.py==0.9.6

import os
import pandas as pd
from github3 import login

# Read the API token from the environment so it is never hard-coded in the notebook.
GITHUB_API_TOKEN = os.environ.get('GITHUB_API_TOKEN')
if not GITHUB_API_TOKEN:
    # Fail fast with an actionable message instead of an opaque error on the
    # first API call made through `gh` further down.
    raise RuntimeError('GITHUB_API_TOKEN environment variable is not set')

# Authenticated client used by the cells below.
gh = login(token=GITHUB_API_TOKEN)

# Value of the `location:` search qualifier; default argument of queries_list().
LOCATION = 'Recife'

# Single user lookup; the trailing ';' suppresses the cell output.
# NOTE(review): presumably a smoke test of the token -- confirm it is still needed.
gh.user('karlafalcao');

In [13]:

def queries_list(location=LOCATION):
    """Build the GitHub user-search queries for one location.

    The search is split into three follower ranges: more than one,
    exactly one, and none.

    Args:
        location: Value for the ``location:`` search qualifier.

    Returns:
        A tuple of three query strings, in the order listed above.
    """
    follower_filters = ('>1', '1', '0')
    return tuple(
        'location:{} followers:{}'.format(location, follower_filter)
        for follower_filter in follower_filters
    )


# Note: gh.search_users() does not return every attribute of the object.
def search_users(query):
    """Run one user search and return the matched users.

    Args:
        query: Search query string passed to ``gh.search_users``.

    Returns:
        A list holding the ``.user`` attribute of every search result; the
        search is requested with ``sort='followers'``.
    """
    matched = []
    for result in gh.search_users(query, sort='followers'):
        matched.append(result.user)
    return matched


def fetch_gh_users():
    """Search users for every query produced by ``queries_list()``.

    Returns:
        A list of batches: one list of users per query, in query order.
    """
    batches = []
    for query in queries_list():
        batches.append(search_users(query))
    return batches



In [9]:
## Search for the users of interest.
## However, gh.search_users() does not return every attribute of the object.
ghusers_batches = fetch_gh_users()   

##################################################################
## Test

# print('object: ', ghusers_batches[0][0])
# print('name: ', ghusers_batches[0][0].name)
# print('id: ', ghusers_batches[0][0].id)
# print('number of followers: ', ghusers_batches[0][0].followers)
# print('created at: ', ghusers_batches[0][0].created_at)

# >>> object:  tarruda
# >>> name:  
# >>> id:  842846
# >>> number of followers:  0
# >>> created at:  None

# NOTE(review): this prints every batch in full, which makes for a very long output.
print(ghusers_batches)


[[<User [tarruda:]>, <User [mairatma:]>, <User [joselitojunior1:]>, <User [marcelcaraciolo:]>, <User [luanfonceca:]>, <User [deividazevedo2:]>, <User [simoneas02:]>, <User [gileno:]>, <User [filipeximenes:]>, <User [renatooliveira:]>, <User [luiztiago:]>, <User [ac-pm:]>, <User [fernandocastor:]>, <User [henvic:]>, <User [clovisdasilvaneto:]>, <User [fjsj:]>, <User [zimmerle:]>, <User [adrielcafe:]>, <User [talitaoliveira:]>, <User [arthuralvim:]>, <User [paulorec:]>, <User [dmesquita:]>, <User [ktquez:]>, <User [guilhermefarias:]>, <User [brunofarache:]>, <User [lailsonbm:]>, <User [lmmenge:]>, <User [brunobasto:]>, <User [guiocavalcanti:]>, <User [pcstl:]>, <User [roselmamendes:]>, <User [horaciojcfilho:]>, <User [dakerfp:]>, <User [caiorss:]>, <User [interaminense:]>, <User [victorlaerte:]>, <User [diegocarloslima:]>, <User [miguelarauj1o:]>, <User [diegonvs:]>, <User [eduardocruz:]>, <User [henriquemenezes:]>, <User [chocoelho:]>, <User [karlafalcao:]>, <User [thiagodiniz:]>, <User

In [10]:

##################################################################

## Call gh.user() for each user individually,
## because that call returns every attribute of the object.
## All search batches are flattened into one single batch.
full_ghusers_batches = [[]]

for batch in ghusers_batches:
    for found in batch:
        full_user = gh.user(found)
        # Skip accounts that could not be resolved: a None entry would break
        # the later cells that call methods such as iter_followers() on every user.
        if full_user is not None:
            full_ghusers_batches[0].append(full_user)

##################################################################
## Test

# print('\nobject: ', full_ghusers_batches[0][0])
# print('name: ', full_ghusers_batches[0][0].name)
# print('id: ', full_ghusers_batches[0][0].id)
# print('number of followers: ', full_ghusers_batches[0][0].followers)
# print('created at: ', full_ghusers_batches[0][0].created_at)

# >>> object:  tarruda
# >>> name:  Thiago de Arruda
# >>> id:  842846
# >>> number of followers:  568
# >>> created at:  2011-06-10 19:33:28+00:00
##################################################################

full_ghusers_batches

[[<User [tarruda:Thiago de Arruda]>,
  <User [mairatma:Maira Bello]>,
  <User [joselitojunior1:Joselito]>,
  <User [marcelcaraciolo:Marcel]>,
  <User [luanfonceca:Luan Fonseca]>,
  <User [deividazevedo2:Deivid Azevedo]>,
  <User [simoneas02:Simone Amorim]>,
  <User [gileno:Gileno Filho]>,
  <User [filipeximenes:Filipe A Ximenes]>,
  <User [renatooliveira:Renato Oliveira]>,
  <User [luiztiago:Luiz Tiago Oliveira]>,
  <User [ac-pm:acpm]>,
  <User [fernandocastor:Fernando Castor]>,
  <User [henvic:Henrique Vicente]>,
  <User [clovisdasilvaneto:Clóvis Neto]>,
  <User [fjsj:Flávio Juvenal da Silva Junior]>,
  <User [zimmerle:Felipe Zimmerle]>,
  <User [adrielcafe:Adriel Café]>,
  <User [talitaoliveira:Talita Oliveira]>,
  <User [arthuralvim:Arthur Alvim]>,
  <User [paulorec:Paulo Amorim]>,
  <User [dmesquita:Déborah Mesquita]>,
  <User [ktquez:Alan Ktquez]>,
  <User [guilhermefarias:Guilherme Farias]>,
  <User [brunofarache:Bruno Farache]>,
  <User [lailsonbm:Lailson Bandeira]>,
  <User [lm

## ghusers_batches (gh.search_users()) / full_ghusers_batches (gh.user())
![image.png](attachment:image.png)

In [25]:
# Display the single flattened batch of fully populated users.
full_ghusers_batches[0]

[<User [tarruda:Thiago de Arruda]>,
 <User [mairatma:Maira Bello]>,
 <User [joselitojunior1:Joselito]>,
 <User [marcelcaraciolo:Marcel]>,
 <User [luanfonceca:Luan Fonseca]>,
 <User [deividazevedo2:Deivid Azevedo]>,
 <User [simoneas02:Simone Amorim]>,
 <User [gileno:Gileno Filho]>,
 <User [filipeximenes:Filipe A Ximenes]>,
 <User [renatooliveira:Renato Oliveira]>,
 <User [luiztiago:Luiz Tiago Oliveira]>,
 <User [ac-pm:acpm]>,
 <User [fernandocastor:Fernando Castor]>,
 <User [henvic:Henrique Vicente]>,
 <User [clovisdasilvaneto:Clóvis Neto]>,
 <User [fjsj:Flávio Juvenal da Silva Junior]>,
 <User [zimmerle:Felipe Zimmerle]>,
 <User [adrielcafe:Adriel Café]>,
 <User [talitaoliveira:Talita Oliveira]>,
 <User [arthuralvim:Arthur Alvim]>,
 <User [paulorec:Paulo Amorim]>,
 <User [dmesquita:Déborah Mesquita]>,
 <User [ktquez:Alan Ktquez]>,
 <User [guilhermefarias:Guilherme Farias]>,
 <User [brunofarache:Bruno Farache]>,
 <User [lailsonbm:Lailson Bandeira]>,
 <User [lmmenge:Lucas Menge]>,
 <User

In [None]:

# NOTE(review): index 12 looks like an arbitrary sample user -- confirm the choice.
ghuser = full_ghusers_batches[0][12]

# Materialise the followers before printing; wrapping the iterator in a list
# literal would only print the iterator object itself.
print(list(ghuser.iter_followers()))

# Users

In [None]:

def create_users_df(batches):
    """Stack the user batches into one single-column DataFrame.

    Args:
        batches: Iterable of batches; each batch supplies the values of a
            ``User`` column.

    Returns:
        A DataFrame with one ``User`` column holding the batches in order,
        with a fresh 0-based index.
    """
    frames = []
    for batch in batches:
        frames.append(pd.DataFrame({'User': batch}))
    return pd.concat(frames, ignore_index=True)

In [14]:
# Build the users DataFrame from the fully populated user batches.
gh_users = create_users_df(full_ghusers_batches)


In [None]:
# Preview the first rows of the users frame.
gh_users.head()

In [None]:
# Summary statistics of the users frame.
gh_users.describe()

# Followers

In [92]:

def followers_map(gh_user):
    """Serialise a user's followers as one comma-separated string.

    Args:
        gh_user: Object exposing ``iter_followers()``.

    Returns:
        ``str()`` of every follower joined with ``','``; an empty string
        when there are no followers.
    """
    follower_names = map(str, gh_user.iter_followers())
    return ','.join(follower_names)


def map_f_batch(map_fn, users_iterator):
    """Apply ``map_fn`` to each user and collect the results in a DataFrame.

    Args:
        map_fn: Callable taking one user.
        users_iterator: Iterable of users.

    Returns:
        A DataFrame with a single ``Followers`` column, one row per user.
    """
    mapped = list(map(map_fn, users_iterator))
    return pd.DataFrame({'Followers': mapped})


def create_followers_df(batches):
    """Build the followers DataFrame for every batch of users.

    Args:
        batches: Iterable of user batches.

    Returns:
        The per-batch frames produced by ``map_f_batch(followers_map, batch)``
        concatenated in order, with a fresh 0-based index.
    """
    frames = []
    for batch in batches:
        frames.append(map_f_batch(followers_map, batch))
    return pd.concat(frames, ignore_index=True)

In [93]:
# Collect the followers of every user (iterates each user's followers through the API client).
gh_followers = create_followers_df(full_ghusers_batches)

In [94]:
# Preview the first rows of the followers frame.
gh_followers.head()

Unnamed: 0,Followers
0,"Sannis,danielmahon,csjaba,FergusRedican,Victor..."
1,"brunocoelho,henvic,eduardolundgren,aperrelli,a..."
2,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J..."
3,"thiagoarrais,brunojm,henriquebastos,macndesign..."
4,"brunohenrique,luizvarela,gladson,lucasbibiano,..."


In [66]:
# Summary statistics of the followers frame.
gh_followers.describe()

Unnamed: 0,Followers
count,2015
unique,2015
top,Empty DataFrame Columns: [User] Index: []
freq,1


In [76]:
# Number of followers per user. 'Followers' holds one comma-separated string
# per user (see followers_map), so count the names rather than the characters;
# an empty string means the user has no followers.
gh_followers['nF'] = gh_followers['Followers'].apply(
    lambda names: len(names.split(',')) if names else 0)
gh_followers.head()

Unnamed: 0,Followers,nF
0,User 0 S...,567
1,User 0 brunocoelh...,371
2,User 0 renatooliveira ...,352
3,User 0 thiagoarrais 1 ...,335
4,User 0 brunohenrique 1 ...,305


In [87]:
# pd.DataFrame({'User':gh_users['User']}).merge()
# Index-aligned join: the row at a given index in gh_followers is attached to
# the row with the same index in gh_users.
gh_users_followers = gh_users.join(gh_followers)
gh_users_followers.head()

Unnamed: 0,User,Followers,nF
0,tarruda,User 0 S...,567
1,mairatma,User 0 brunocoelh...,371
2,joselitojunior1,User 0 renatooliveira ...,352
3,marcelcaraciolo,User 0 thiagoarrais 1 ...,335
4,luanfonceca,User 0 brunohenrique 1 ...,305


In [46]:
# Summary statistics of the joined users/followers frame.
gh_users_followers.describe()

Unnamed: 0,nF
count,2015.0
mean,6.079901
std,25.578395
min,0.0
25%,0.0
50%,1.0
75%,4.0
max,567.0


In [47]:
# gh_users_followers.to_csv('../data/tmp_users__followers.csv', index=False) 

# Repos

In [48]:
# Wait for the API rate limit (around 1h)

def repositories_map(gh_user, repo_type):
    """List a user's repositories of one type.

    Args:
        gh_user: User passed to ``gh.iter_user_repos``.
        repo_type: Forwarded as the ``type`` keyword of ``gh.iter_user_repos``.

    Returns:
        A list with every repository yielded by ``gh.iter_user_repos``.
    """
    return list(gh.iter_user_repos(gh_user, type=repo_type))

def map_r_batch(map_fn, users_iterator):
    """Collect the owned and member repositories of every user.

    Args:
        map_fn: Callable ``(user, repo_type)`` returning that user's
            repositories of the given type.
        users_iterator: Iterable of users. It is materialised once so that
            one-shot iterators (e.g. generators) can feed both columns.

    Returns:
        A DataFrame with one row per user and the columns ``Repo_Owner``
        (``map_fn(user, 'owner')``) and ``Repo_Member``
        (``map_fn(user, 'member')``).
    """
    # The users are traversed twice below; a bare iterator would already be
    # exhausted on the second pass and leave the columns with unequal lengths.
    users = list(users_iterator)
    return pd.DataFrame({'Repo_Owner': [map_fn(u, 'owner') for u in users],
                         'Repo_Member': [map_fn(u, 'member') for u in users]})

def create_repositories_df(batches):
    """Build the repositories DataFrame for every batch of users.

    Args:
        batches: Iterable of user batches.

    Returns:
        The per-batch frames produced by
        ``map_r_batch(repositories_map, batch)`` concatenated in order, with
        a fresh 0-based index.
    """
    frames = []
    for batch in batches:
        frames.append(map_r_batch(repositories_map, batch))
    return pd.concat(frames, ignore_index=True)


# Fetch the 'owner' and 'member' repositories of every user.
gh_repositories = create_repositories_df(full_ghusers_batches)
# NOTE(review): displays the whole frame; the next cell shows .head() of the same data.
gh_repositories

Unnamed: 0,Repo_Owner,Repo_Member
0,"[tarruda/Algoritmos, tarruda/archdb, tarruda/b...",[libmpack/libmpack]
1,"[mairatma/alloy-ui, mairatma/alloyui.com, mair...","[deprecate/steel-avatar, deprecate/steel-cell-..."
2,"[joselitojunior1/abigobaldo-nunes-adventures, ...","[acmh/maecoruja, Cisneiros/projeto-anfa, demia..."
3,"[marcelcaraciolo/apontador-api-libs, marcelcar...","[irgmedeiros/TCCRecommender, jg1141/Open-Allur..."
4,"[luanfonceca/168horas, luanfonceca/4stoq, luan...","[andrezap/analise_expressao_genica, andrezap/a..."
5,"[deividazevedo2/AprendendoPHP, deividazevedo2/...","[DarthHugh/Scream, IndyPaula/HumQueCaro, IndyP..."
6,"[simoneas02/2017-goals, simoneas02/2018-goals,...","[afonsopacifer/css-grid-layout-manual, CodeMig..."
7,"[gileno/analise-dados-imobiliarios, gileno/API...",[demacdolincoln/Analise_de_Dados_de_Redes_Soci...
8,"[filipeximenes/agendacultural, filipeximenes/b...","[tapanpandita/pocket, vintasoftware/django-app..."
9,"[renatooliveira/authomatic, renatooliveira/cel...","[alessandroHenrique/coinpricemonitor, filipecm..."


In [50]:
gh_repositories.head()

Unnamed: 0,Repo_Owner,Repo_Member
0,"[tarruda/Algoritmos, tarruda/archdb, tarruda/b...",[libmpack/libmpack]
1,"[mairatma/alloy-ui, mairatma/alloyui.com, mair...","[deprecate/steel-avatar, deprecate/steel-cell-..."
2,"[joselitojunior1/abigobaldo-nunes-adventures, ...","[acmh/maecoruja, Cisneiros/projeto-anfa, demia..."
3,"[marcelcaraciolo/apontador-api-libs, marcelcar...","[irgmedeiros/TCCRecommender, jg1141/Open-Allur..."
4,"[luanfonceca/168horas, luanfonceca/4stoq, luan...","[andrezap/analise_expressao_genica, andrezap/a..."


## Repo Languages

In [52]:
# Repository languages: one comma-separated string per user, taken from the
# `.language` attribute of each repository (None becomes the text 'None').
# These two lines must run before the normalisation below, while the columns
# still hold repository objects.
gh_repositories['mRepo_Language'] = gh_repositories['Repo_Member'].apply(lambda x: ','.join([str(y.language) for y in x]))
gh_repositories['oRepo_Language'] = gh_repositories['Repo_Owner'].apply(lambda x: ','.join([str(y.language) for y in x]))

# Normalising: replace each list of repositories by a comma-separated string.
# NOTE(review): this overwrites the repository lists in place, so the cell does
# not look re-runnable without first rebuilding gh_repositories -- confirm.
gh_repositories['Repo_Member'] = gh_repositories['Repo_Member'].apply(lambda x: ','.join([str(y) for y in x]))
gh_repositories['Repo_Owner']  =  gh_repositories['Repo_Owner'].apply(lambda x: ','.join([str(y) for y in x]))

# Put each repository column next to its language column.
gh_repositories = gh_repositories.reindex(['Repo_Member', 'mRepo_Language', 'Repo_Owner','oRepo_Language'], axis=1)

# gh_repositories.head()
# Index-aligned join of the users with their repository columns.
gh_users_repositories = gh_users.join(gh_repositories)

gh_users_repositories.head()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
0,tarruda,libmpack/libmpack,C,"tarruda/Algoritmos,tarruda/archdb,tarruda/back...","JavaScript,JavaScript,JavaScript,JavaScript,Py..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS...","mairatma/alloy-ui,mairatma/alloyui.com,mairatm...","JavaScript,JavaScript,TypeScript,JavaScript,Ja..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None","joselitojunior1/abigobaldo-nunes-adventures,jo...","JavaScript,HTML,CSS,None,None,None,None,GCC Ma..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-DS","Python,Python","marcelcaraciolo/apontador-api-libs,marcelcarac...","PHP,Python,Python,None,Python,Python,Python,No..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript...","luanfonceca/168horas,luanfonceca/4stoq,luanfon...","CSS,Ruby,Python,JavaScript,Python,Python,None,..."


In [None]:
# export
# gh_users_repositories.to_csv('../data/tmp_users__repositories.csv', index=False)

## Backup dos dados coletados no momento do scraping
**Note que o usuário pode não existir mais ou ter mudado o seu username, etc.**

In [55]:
# # Users trabalhados no projeto
# backup_batch_df = pd.read_csv('../data/users__followers.csv')
# b_ghusers_batch = backup_batch_df['User'].tolist()

# # trata users nao encontrados
# full_ghusers_batches2 = [[gh.user(x) for x in b_ghusers_batch if gh.user(x) is not None]]

# gh_users2 = create_users_df(full_ghusers_batches2)

# # atualiza os seguidores dos users
# gh_followers2 = create_followers_df(full_ghusers_batches2)

In [None]:
# aguardar API rate 
# atualiza os repositorios dos users
# gh_repositories2 = create_repositories_df(full_ghusers_batches2)