In [9]:
import utils
# Enable INFO-level logging so the progress messages from this notebook and the
# libraries it drives show up in the cell output.
# NOTE(review): `logging` is reached through the `utils` module here; presumably
# `utils` imports the standard `logging` module itself -- confirm.
utils.logging.basicConfig(level=utils.logging.INFO)

In [10]:
from perceval.backends.core.git import Git

In [11]:
# Load the notebook configuration, then connect to the Elasticsearch host
# listed under its 'es_host' key.
settings = utils.read_config_file('settings.yml')
es_host = settings['es_host']
es = utils.establish_connection(es_host)

INFO:root:settings.yml settings file readed and parsed
INFO:urllib3.connectionpool:Starting new HTTP connection (1): localhost
INFO:elasticsearch:HEAD http://localhost:9200/ [status:200 request:0.010s]
INFO:root:Connection established with http://localhost:9200


In [12]:
# Elasticsearch mapping for the 'item' documents written by this notebook.
# 'date' is a date field parsed with the textual pattern given below; every
# other field is listed as a (name, type) pair and expanded into
# {"type": <type>}, in the order shown.
MAPPING_GIT = {
    "mappings": {
        "item": {
            "properties": dict(
                [
                    (
                        "date",
                        {
                            "type": "date",
                            "format": "E MMM d HH:mm:ss yyyy Z",
                            "locale": "US",
                        },
                    ),
                ]
                + [
                    (field_name, {"type": field_type})
                    for field_name, field_type in (
                        ("commit", "keyword"),
                        ("author", "keyword"),
                        ("domain", "keyword"),
                        ("file", "keyword"),
                        ("added", "integer"),
                        ("removed", "integer"),
                        ("repository", "keyword"),
                    )
                ]
            )
        }
    }
}

In [13]:
# Name of the Elasticsearch index that receives the git documents; the upload
# loop further down reads this variable.
index_name = 'git-vscode'
# Create the index using the MAPPING_GIT mapping.
# NOTE(review): the logged output of this cell shows a DELETE on the index
# followed by a PUT, so this helper appears to drop any existing index with the
# same name before recreating it -- confirm in `utils` before running against
# an index whose data must be kept.
utils.create_ES_index(es, index_name, MAPPING_GIT)

INFO:elasticsearch:DELETE http://localhost:9200/git-vscode [status:404 request:0.006s]
INFO:elasticsearch:PUT http://localhost:9200/git-vscode [status:200 request:0.262s]
INFO:root:git-vscode index created


In [14]:
# Index every (commit, file) pair of each repository listed under
# settings['git'] into Elasticsearch, one document per touched file, uploading
# the documents in batches through the bulk helper.

# Number of documents sent per bulk request (loop-invariant, so set once).
bulk_size = 10000

for repo_url in settings['git']:

    # Last path component of the URL (e.g. 'vscode.git'). A trailing '/' is
    # ignored so the name -- and therefore the clone path below -- is never
    # empty.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    repo = Git(uri=repo_url, gitpath='/tmp/' + repo_name)

    utils.logging.info('Go for {}'.format(repo_name))

    # Documents waiting to be uploaded for the current repository.
    items = []

    for commit in repo.fetch():

        # Author is expected to look like 'Name <user@domain>'.
        author = commit['data']['Author']
        # Text before '<' without its trailing whitespace. Unlike a blind
        # [:-1] slice this does not eat a real character when '<' is absent.
        author_name = author.split('<')[0].rstrip()
        # Text after the last '@' without the closing '>'. An author string
        # with no e-mail address gets an empty domain instead of a mangled
        # copy of the name.
        if '@' in author:
            author_domain = author.split('@')[-1].rstrip('>')
        else:
            author_domain = ''

        for file_entry in commit['data']['files']:
            # A missing counter or a '-' placeholder is stored as 0 (presumably
            # '-' marks files without line statistics, e.g. binaries -- confirm
            # against Perceval). Other values are passed through unchanged.
            # Local variables are used so the dicts handed out by Perceval are
            # not modified.
            added = file_entry.get('added', 0)
            if added == '-':
                added = 0
            removed = file_entry.get('removed', 0)
            if removed == '-':
                removed = 0

            summary = {
                'date': commit['data']['AuthorDate'],
                'commit': commit['data']['commit'],
                'author': author_name,
                'domain': author_domain,
                'file': file_entry['file'],
                'added': added,
                'removed': removed,
                'repository': repo_name
            }

            items.append({'_index': index_name, '_type': 'item', '_source': summary})

            # Flush as soon as the batch holds exactly bulk_size documents.
            # The previous '>' test sent bulk_size + 1 documents while logging
            # bulk_size.
            if len(items) >= bulk_size:
                utils.helpers.bulk(es, items)
                utils.logging.info('{} items uploaded'.format(len(items)))
                items = []

    # Upload whatever is left over after the last full batch.
    if items:
        utils.helpers.bulk(es, items)
        utils.logging.info('Remaining {} items uploaded'.format(len(items)))

INFO:root:Go for vscode.git
INFO:perceval.backends.core.git:Fetching commits: 'https://github.com/Microsoft/vscode.git' git repository from 1970-01-01 00:00:00+00:00; all branches
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.074s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.092s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.086s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.074s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.070s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.069s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.069s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.124s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.074s]
INFO:elasticsearch:POST http://localhost:9200/_bulk [status:200 request:0.073s]
INFO:elasticsearch:P