In [6]:
import re
import sys
import csv
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pymongo
import pytz
from dateutil.parser import parse as dateParser
from datetime import datetime

import datautil

from tqdm.auto import tqdm   # progress bar keeps human being happy

In [7]:
# save raw data to cache/
if not os.path.exists('cache/maven-data.csv.tar.xz'):
    !wget -P cache/ https://zenodo.org/record/1489120/files/maven-data.csv.tar.xz
if not os.path.exists('cache/next_all.csv'):
    !tar -xvf cache/maven-data.csv.tar.xz
    !mv maven-data.csv/* cache/
    !rmdir maven-data.csv

In [8]:
df_release = pd.read_csv('cache/release_all.csv', low_memory=False)
df_next = pd.read_csv('cache/next_all.csv', low_memory=False)
df_links = pd.read_csv('cache/links_all.csv', low_memory=False)
df_release, df_next, df_links

(                                                  artifact packaging  \
 0        org.apache.directory.shared:shared-ldap-client...       Jar   
 1        org.apache.directory.shared:shared-ldap-schema...       Jar   
 2         org.apache.directory.shared:shared-i18n:1.0.0-M7       Jar   
 3        org.apache.directory.shared:shared-ldap-extras...       Jar   
 4                                        antlr:antlr:2.7.7       Jar   
 ...                                                    ...       ...   
 2407330        ai.h2o:sparkling-water-examples_2.11:2.2.24       Jar   
 2407331             ai.h2o:sparkling-water-doc_2.11:2.1.38       Jar   
 2407332         ai.h2o:sparkling-water-package_2.11:2.1.38       Jar   
 2407333        ai.h2o:sparkling-water-examples_2.11:2.1.38       Jar   
 2407334            ai.h2o:sparkling-water-repl_2.11:2.1.38       Jar   
 
                            release  
 0        2011-08-15T06:26:46Z[GMT]  
 1        2011-08-15T06:26:08Z[GMT]  
 2      

In [4]:
# save dependency graph as nx format
if not os.path.exists('cache/dep_graph.gz'):
    import networkx as nx
    G = nx.from_pandas_edgelist(df_links, 'source', 'target')
    nx.write_gpickle(G, path='cache/dep_graph.gz')
    f'{G.number_of_nodes()} Nodes, {G.number_of_edges()} Edges'

In [9]:
# checkpoint here
G = nx.read_gpickle(path='cache/dep_graph.gz')
f'{G.number_of_nodes()} Nodes, {G.number_of_edges()} Edges'

'1965374 Nodes, 9700176 Edges'

In [12]:
def parse_version(version: str) -> str:
    # ref: https://stackoverflow.com/questions/6618868
    if match := re.search('(\d+(?:\.\d+)+[-.]?[a-zA-Z\d]*)', version):
        return match.group(1)
    else:
        return ''

# remove appendix in version str
def parse_version_strict(version: str) -> str:
    # ref: https://stackoverflow.com/questions/6618868
    if match := re.search('(\d+(?:\.\d+)+)', version):
        return match.group(1)
    else:
        return ''

parse_version("[2.1.RC1,)"), parse_version_strict("[1.3.2-alpha,)")

('2.1.RC1', '1.3.2')

In [13]:
from packaging import version
# split lib/version
df_temp = df_release['artifact'].str.rsplit(':', n=1, expand=True)
df_release['lib'] = df_temp[0]
df_release['version'] = df_temp[1]
df_release.fillna('', inplace=True)

# version contains '~v1.1.15-beta', should be parsed as well
tqdm.pandas()
df_release['version'] = df_release.progress_apply(lambda x: parse_version(str(x['version'])), axis=1)
# [WARNING] you should parse version as well before saving to graph!

# 'release' is unreliable, use version as main key
df_release.sort_values(by='release', ascending=False, inplace=True)  # pandas sort should keep things in order
df_release.sort_values(by='version', ascending=False, inplace=True, key=lambda x: parse_version_strict(x))
# version like 1.33-SNAPSHOT would be parsed as 'LegacyVersion', which has no len()
# df_release.sort_values(by='version', ascending=False, inplace=True, key=lambda x: version.parse(str(x)))

df_release

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2407335.0), HTML(value='')))




Unnamed: 0,artifact,packaging,release,lib,version
1751901,org.epics:ca:999.999.999,Jar,2016-08-10T15:08:35Z[GMT],org.epics:ca,999.999.999
1880648,org.apache.zookeeper:zookeeper:99.99,Jar,2016-08-10T15:08:35Z[GMT],org.apache.zookeeper:zookeeper,99.99
1341611,com.sun:tools:99.9.9,Jar,2016-08-10T15:08:35Z[GMT],com.sun:tools,99.9.9
1867714,com.ontology2:centipede-parser:99.9,Jar,2014-11-27T01:05:29Z[GMT],com.ontology2:centipede-parser,99.9
1923786,com.ontology2:centipede-parent:99.8,Jar,2016-08-10T15:08:35Z[GMT],com.ontology2:centipede-parent,99.8
...,...,...,...,...,...
2161575,net.nemerosa.ontrack:ontrack-extension-github:...,Jar,2017-06-02T14:36:50Z[GMT],net.nemerosa.ontrack:ontrack-extension-github,
2161548,net.nemerosa.ontrack:ontrack-ui:feature-512-aq...,Jar,2017-06-02T14:37:06Z[GMT],net.nemerosa.ontrack:ontrack-ui,
2161623,net.nemerosa.ontrack:ontrack-git:feature-512-a...,Jar,2017-06-02T14:37:21Z[GMT],net.nemerosa.ontrack:ontrack-git,
2204580,org.apache.uima:parent-pom:11,Jar,2016-08-10T15:08:35Z[GMT],org.apache.uima:parent-pom,


In [8]:
dep_unique = pd.read_csv('cache/dep_unique.csv', low_memory=False)
dep_unique.fillna('', inplace=True)
dep_unique.sort_values(by='lib', inplace=True)
dep_unique[dep_unique['lib']=='?']
# strange behavior here, handle afterwards

Unnamed: 0,dep_name,lib,version,published_time
99930,?:commons-codec-1.4,?,1.4,
99919,?:jgrapht-jdk1.6-0.8.1,?,1.6-0,
99920,?:guava-13.0,?,13.0,
99922,?:slf4j-api-1.6.1,?,1.6.1,
99923,?:commons-httpclient-3.0.1,?,3.0.1,
...,...,...,...,...
178264,?:org.apache.commons,?,,
67563,?:1.3.174,?,1.3.174,
5,?:?,?,,
349188,?:${project.parent.version},?,,


In [15]:
versions = df_release[df_release['lib']=='ant:ant']
versions[versions['version'].str.match('^1.6.5')]

Unnamed: 0,artifact,packaging,release,lib,version
1661,ant:ant:1.6.5,Jar,2005-11-22T18:06:39Z[GMT],ant:ant,1.6.5


In [16]:
# use dict to accelerate lookup
# ref: https://stackoverflow.com/questions/59554213
versions_dict = {i: group.reset_index(drop=True) for i, group in tqdm(df_release.groupby('lib'))}
versions_dict['ant:ant']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=223478.0), HTML(value='')))




Unnamed: 0,artifact,packaging,release,lib,version
0,ant:ant:1.6.5,Jar,2005-11-22T18:06:39Z[GMT],ant:ant,1.6.5
1,ant:ant:1.6.4,Jar,2005-11-22T18:06:37Z[GMT],ant:ant,1.6.4
2,ant:ant:1.6.3,Jar,2005-11-22T18:06:36Z[GMT],ant:ant,1.6.3
3,ant:ant:1.6.2,Jar,2005-11-22T18:06:36Z[GMT],ant:ant,1.6.2
4,ant:ant:1.6.1,Jar,2005-11-22T18:06:35Z[GMT],ant:ant,1.6.1
5,ant:ant:1.6,Jar,2005-11-22T18:06:39Z[GMT],ant:ant,1.6
6,ant:ant:1.5.4,Jar,2005-11-22T18:06:35Z[GMT],ant:ant,1.5.4
7,ant:ant:1.5.3-1,Jar,2005-11-22T18:06:35Z[GMT],ant:ant,1.5.3-1
8,ant:ant:1.5.2,Jar,2005-11-22T18:06:34Z[GMT],ant:ant,1.5.2
9,ant:ant:1.5.1,Jar,2005-11-22T18:06:33Z[GMT],ant:ant,1.5.1


In [17]:
# use leveldb to cache version lookups
# k: str, v: str
import json
import plyvel

try:
    if db_ver_dep.closed:
        db_ver_dep = plyvel.DB('cache/ver_dep_graph', create_if_missing=True)
except NameError:
    db_ver_dep = plyvel.DB('cache/ver_dep_graph', create_if_missing=True)

def cache_put(k: str, v: str, db=db_ver_dep):
    key = k.encode('utf-8')
    value = v.encode('utf-8')
    db.put(key, value)

def cache_get(k: str, db=db_ver_dep):
    key = k.encode('utf-8')
    res = db.get(key)
    return res.decode('utf-8') if res else None

def cache_delete(k: str, db=db_ver_dep):
    key = k.encode('utf-8')
    db.delete(key)

cache_put('shuwarin', 'dreaming'), cache_get('shuwarin'), cache_delete('shuwarin')

(None, 'dreaming', None)

In [18]:
from functools import wraps

lookups = 0
hits = 0

# cache version lookups
def cache_ver(f, db=db_ver_dep):
    @wraps(f)
    def wrapper(lib: str, version="", commitTime=None):
        global lookups, hits
        lookups += 1
        # if no version found, cache doesn't make any sence
        cache = cache_get(lib+':'+version, db=db) if version else None
        if cache:
            hits += 1
            return cache
        else:
            res = f(lib, version, commitTime)
            if version:
                cache_put(lib+':'+version, res, db=db)  # argument
            cache_put(lib+':'+res, res, db=db)  # real version
            return res
    return wrapper

In [19]:
versions_dict['xstream:xstream']

Unnamed: 0,artifact,packaging,release,lib,version
0,xstream:xstream:1.1.3,Jar,2006-01-15T17:40:57Z[GMT],xstream:xstream,1.1.3
1,xstream:xstream:1.1.2,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.1.2
2,xstream:xstream:1.1.1,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.1.1
3,xstream:xstream:1.1,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.1
4,xstream:xstream:1.0.3.2-dev,Jar,2016-08-10T15:08:35Z[GMT],xstream:xstream,1.0.3.2-dev
5,xstream:xstream:1.0.2,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.0.2
6,xstream:xstream:1.0.1,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.0.1
7,xstream:xstream:1.0-rc1,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.0-rc1
8,xstream:xstream:1.0-SNAPSHOT,Jar,2016-08-10T15:08:35Z[GMT],xstream:xstream,1.0-SNAPSHOT
9,xstream:xstream:1.0,Jar,2005-11-22T18:29:17Z[GMT],xstream:xstream,1.0


In [39]:
# lookup direct dependencies in libraryVersionToDependency
# return format: { lib: 'lib', version: '1.1', dep: {'a': '1.0 } }

# # simple caching
# from functools import lru_cache

# @timethis
# @cache_ver
def get_version_dep_graph(lib: str, version="", commitTime=None) -> str:
    res = ''
    version_ = None  # candidate version
    # versions = df_release[df_release['lib']==lib]  # slow,,,
    try:
        versions = versions_dict[lib]
    except KeyError:
        return ''  # Not Found  
    # skip 1-4 if version is empty
    if version:
        # 1. exact matching
        res = versions[versions['version'].str.match("^{version}$")]
        # 2. version*
        if not len(res):
            version_ = version
            res = versions[versions['version'].str.match("^{version_}")]
        # 3. strict parser
        if not len(res):
            version_ = parse_version_strict(version)
            res = versions[versions['version'].str.match("^{version_}")]
        # 4. remove last dot (3.6.5-alpha -> 3.6.5 -> 3.6)
        if not len(res):
            version_='.'.join(version_.split('.')[:-1])
            # version_ may be an empty string?
            if version_:
                res = versions[versions['version'].str.match("^{version_}")] 
        
        ## skip checking 'release' time
      
        # #  sanity check
        # if version_ and commitTime:
        #     res = res[[dateParser(row['release']) < commitTime for index, row in res.iterrows()]]

    # # 5. find last version before commit;
    # if not len(res) and commitTime:
    #     res = versions[[dateParser(row['release']) < commitTime for index, row in versions.iterrows()]]
    
    # 6. find all results
    if not len(res):
        res = versions
    return res.iloc[0]['version'] if len(res) else ''

res = get_version_dep_graph('xstream:xstream' ,'1.1.3')
res

'1.1.3'

In [40]:
from tqdm.contrib.concurrent import process_map  # basically a ProcessPoolExecutor

lookups = 0
hits = 0

def thread_worker(df: pd.DataFrame):
    tqdm.pandas()
    df['ver_dep_graph'] = df.progress_apply(lambda x: get_version_dep_graph(str(x['lib']), str(x['version'])), axis=1)

thread_worker(dep_unique)

# # multiprocessing with progress bar
# import multiprocessing as mp
# n_workers = 4
# df_parts = np.array_split(dep_unique ,n_workers)
# dep_unique = pd.concat(process_map(thread_worker, df_parts, max_workers=n_workers))

if lookups:
    print(f'hits: {hits}/{lookups} {(hits*100)//lookups}%')
dep_unique.to_csv('cache/dep_unique_1.csv', index=False)
dep_unique

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=657426.0), HTML(value='')))




KeyboardInterrupt: 

In [32]:
dep_unique = pd.read_csv('cache/dep_unique_1.csv', low_memory=False)
if lookups:
    print(f'hits: {hits}/{lookups} {(hits*100)//lookups}%')
dep_unique[dep_unique['ver_dep_graph']=='version']

Unnamed: 0,dep_name,lib,version,published_time,ver_dep_graph


In [31]:
from packaging import version
version.parse("")

<LegacyVersion('')>