In [3]:
import requests
import json

In [4]:
# Accumulator for the raw repository records gathered by the search loop.
results = list()

In [5]:
# This is the first query that we are going to use. It restricts the
# search to repositories whose creation date falls in the range
# 2019-01-01..2019-12-31, i.e. repositories created during 2019.

q = "created:2019-01-01..2019-12-31"

In [6]:
# Query the GitHub repository search endpoint for `q` and walk through
# every result page, appending each page's records to `results`.
#
# NOTE(review): the recorded run further down ends with exactly 1000 rows,
# presumably because the search API caps how many results one query can
# return -- confirm against the GitHub documentation.

url = 'https://api.github.com/search/repositories'
params = {'q': q, 'sort': 'forks', 'order': 'desc', 'per_page': 100}

while True:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    res = requests.get(url, params=params, timeout=30)

    # Fail loudly on rate limiting / invalid queries instead of hitting an
    # unrelated KeyError on 'items' a few lines later.
    res.raise_for_status()

    result = res.json()
    results.extend(result['items'])     # accumulate this page's repositories

    # The "next" URL already carries the full query string, so the original
    # parameters must not be sent again.
    params = {}

    # `res.links` is parsed from the Link response header; when there is no
    # "next" entry we have consumed the last page.
    next_link = res.links.get('next')
    if next_link is None:
        break
    url = next_link['url']
    

In [7]:
from pandas.io.json import json_normalize
import json
import pandas as pd
import bson.json_util as json_util

In [8]:
# Serialise the collected records with bson's JSON helper and parse the
# text back with the standard json module, leaving plain dicts, lists and
# scalars.
sanitized = json.loads(
    json_util.dumps(results)
)

In [9]:
# Flatten the nested repository records into a table; nested objects are
# expanded into dotted column names.

normalized = pd.json_normalize(data=sanitized)

In [10]:
# Working DataFrame used by the rest of the notebook.
df = pd.DataFrame(data=normalized)

In [11]:
import csv

# Persist the flattened search results. The row index is only a running
# counter, so it is left out of the file; writing it would add an extra
# "Unnamed: 0" column whenever the CSV is read back.
df.to_csv('Github_repo_data_2019.csv', index=False)



In [12]:
# Display the column labels of the flattened table.
df.columns

Index(['id', 'node_id', 'name', 'full_name', 'private', 'html_url',
       'description', 'fork', 'url', 'forks_url', 'keys_url',
       'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url',
       'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url',
       'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url',
       'languages_url', 'stargazers_url', 'contributors_url',
       'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url',
       'comments_url', 'issue_comment_url', 'contents_url', 'compare_url',
       'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url',
       'milestones_url', 'notifications_url', 'labels_url', 'releases_url',
       'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url',
       'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size',
       'stargazers_count', 'watchers_count', 'language', 'has_issues',
       'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 

In [13]:
# Print a per-column summary (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 96 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1000 non-null   int64  
 1   node_id                    1000 non-null   object 
 2   name                       1000 non-null   object 
 3   full_name                  1000 non-null   object 
 4   private                    1000 non-null   bool   
 5   html_url                   1000 non-null   object 
 6   description                767 non-null    object 
 7   fork                       1000 non-null   bool   
 8   url                        1000 non-null   object 
 9   forks_url                  1000 non-null   object 
 10  keys_url                   1000 non-null   object 
 11  collaborators_url          1000 non-null   object 
 12  teams_url                  1000 non-null   object 
 13  hooks_url                  1000 non-null   object

# Cleaning and Analyzing description of repos

In [14]:
# Remove repositories without a usable description: first the missing
# values (NaN/None), then empty or whitespace-only strings, which contain
# no text for the language detection step to work on.
df = df.dropna(subset = ['description'])
df = df[df['description'].str.strip() != '']

In [15]:
# Tag every description with its detected language using the langdetect
# library. This cell only adds the 'lang' column; no rows are removed here.
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic unless the factory is seeded before the
# first detection; fix the seed so re-running the notebook gives the same
# labels.
DetectorFactory.seed = 0


def _detect_lang(text):
    """Return the language code detected for `text`, or None when
    langdetect cannot find any usable features in it."""
    try:
        return detect(text)
    except LangDetectException:
        return None


# Map over the single column instead of applying row-wise over the frame,
# and build a new frame rather than assigning into a filtered one.
df = df.assign(lang=df['description'].map(_detect_lang))

In [16]:
# Preview the first rows, including the new 'lang' column.
# NOTE(review): the recorded output contains an 'Unnamed: 0' column, which
# suggests df was reloaded from a CSV that had been written together with
# its index -- confirm the notebook still reproduces on a fresh kernel.
df.head()

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,owner.received_events_url,owner.type,owner.site_admin,license.key,license.name,license.spdx_id,license.url,license.node_id,license,lang
0,177736533,MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=,996.ICU,996icu/996.ICU,False,https://github.com/996icu/996.ICU,Repo for counting stars and contributing. Pres...,False,https://api.github.com/repos/996icu/996.ICU,https://api.github.com/repos/996icu/996.ICU/forks,...,https://api.github.com/users/996icu/received_e...,User,False,other,Other,NOASSERTION,,MDc6TGljZW5zZTA=,,en
1,189621607,MDEwOlJlcG9zaXRvcnkxODk2MjE2MDc=,docs,github/docs,False,https://github.com/github/docs,The open-source repo for docs.github.com,False,https://api.github.com/repos/github/docs,https://api.github.com/repos/github/docs/forks,...,https://api.github.com/users/github/received_e...,Organization,False,other,Other,NOASSERTION,,MDc6TGljZW5zZTA=,,en
2,228683419,MDEwOlJlcG9zaXRvcnkyMjg2ODM0MTk=,it-cert-automation-practice,google/it-cert-automation-practice,False,https://github.com/google/it-cert-automation-p...,Google IT Automation with Python Professional ...,False,https://api.github.com/repos/google/it-cert-au...,https://api.github.com/repos/google/it-cert-au...,...,https://api.github.com/users/google/received_e...,Organization,False,apache-2.0,Apache License 2.0,Apache-2.0,https://api.github.com/licenses/apache-2.0,MDc6TGljZW5zZTI=,,en
3,212094940,MDEwOlJlcG9zaXRvcnkyMTIwOTQ5NDA=,DO180-apps,RedHatTraining/DO180-apps,False,https://github.com/RedHatTraining/DO180-apps,DO180 Repository for Sample Applications,False,https://api.github.com/repos/RedHatTraining/DO...,https://api.github.com/repos/RedHatTraining/DO...,...,https://api.github.com/users/RedHatTraining/re...,Organization,False,,,,,,,en
4,177216275,MDEwOlJlcG9zaXRvcnkxNzcyMTYyNzU=,mslearn-tailspin-spacegame-web,MicrosoftDocs/mslearn-tailspin-spacegame-web,False,https://github.com/MicrosoftDocs/mslearn-tails...,Code used in Microsoft Learn modules to suppor...,False,https://api.github.com/repos/MicrosoftDocs/msl...,https://api.github.com/repos/MicrosoftDocs/msl...,...,https://api.github.com/users/MicrosoftDocs/rec...,Organization,False,other,Other,NOASSERTION,,MDc6TGljZW5zZTA=,,en


In [17]:
# Count the rows that have a non-null detected language.
df['lang'].count()

767

In [18]:
# Persist the description-filtered data together with the 'lang' column.
# The row index is not written, so reading the file back does not produce
# an extra "Unnamed: 0" column.
df.to_csv('Github_repo_data_2019_with_lang_column.csv', index=False)