In [59]:
import pandas as pd
import numpy as np
import requests
import re
import logging
from bs4 import BeautifulSoup
import sys

In [41]:
# Load the spreadsheet; the package identifiers are read from its
# 'Unnamed: 1' column.
df = pd.read_excel('data/cls.xlsx')
package_names = df['Unnamed: 1'].tolist()
package_names

['ant:ant',
 'org.apache.ant:ant',
 'org.apache.ant:ant-apache-regexp',
 'org.apache.ant:ant-junit',
 'org.apache.ant:ant-nodeps',
 'org.apache.ant:ant-trax',
 'commons-io:commons-io',
 'org.apache.commons:commons-io',
 'org.codehaus.plexus:plexus-utils',
 'antlr:antlr',
 'org.antlr:antlr-runtime',
 'org.antlr:antlr4-runtime',
 'com.tunnelvisionlabs:antlr4-runtime',
 'org.codehaus.mojo:javacc-maven-plugin',
 'org.parboiled:parboiled-java',
 'antlr:stringtemplate',
 'org.antlr:stringtemplate',
 'aopalliance:aopalliance',
 'com.google.inject.extensions:guice-jmx',
 'com.google.inject:guice',
 'org.springframework:spring-beans',
 'org.springframework:spring-context',
 'org.springframework:spring-core',
 'com.google.inject.extensions:guice-assistedinject',
 'com.google.gwt.inject:gin',
 'com.squareup.dagger:dagger',
 'com.squareup.dagger:dagger-compiler',
 'com.google.inject.extensions:guice-multibindings',
 'org.sonatype.sisu:sisu-guice',
 'com.google.inject.extensions:guice-servlet',
 'c

In [61]:
from fake_useragent import UserAgent
import time

class QueryException(Exception):
    """Raised when a page could not be fetched successfully."""

def get(url: str, headers, tries: int = 5, sleep_time: float = 2, timeout: float = 30):
    """Fetch ``url`` with ``requests``, retrying with exponential back-off.

    One initial request is made, followed by up to ``tries`` retries while
    the request raises or the response status code is not 200.  The function
    sleeps ``sleep_time`` seconds after every request (including a
    successful one, which throttles the caller) and doubles the delay after
    each retry.

    Args:
        url: Address to fetch.
        headers: Mapping of HTTP request headers to send.
        tries: Maximum number of retries after the initial request.
        sleep_time: Initial delay in seconds after each request.
        timeout: Per-request timeout in seconds handed to ``requests.get``.

    Returns:
        The ``requests.Response`` whose status code is 200.

    Raises:
        QueryException: If no request produced a 200 response.
    """
    def attempt():
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f'Error: {url}: {e}', file=sys.stderr)
            return None

    res = attempt()
    time.sleep(sleep_time)

    retries = 0
    # Test against None / the status code explicitly: a Response is falsy for
    # 4xx/5xx statuses, so a bare truthiness check would never retry them.
    while (res is None or res.status_code != 200) and retries < tries:
        status = 'no response' if res is None else res.status_code
        print(f'[{url}] Error: {status} sleep_time={sleep_time} tries={retries}', file=sys.stderr)
        res = attempt()
        time.sleep(sleep_time)
        sleep_time *= 2
        retries += 1

    if res is None or res.status_code != 200:
        print(res, file=sys.stderr)
        raise QueryException(f'request failed for {url}')

    return res
    

def query_package(package_name: str):
    """Download the mvnrepository.com artifact page of a package.

    Args:
        package_name: Identifier whose ':'-separated parts become the path
            segments of the artifact URL.

    Returns:
        The body of the response returned by ``get``, as text.
    """
    artifact_path = package_name.replace(':', '/')
    url = 'http://mvnrepository.com/artifact/' + artifact_path
    # A random User-Agent string is drawn for every call.
    request_headers = {'User-Agent': UserAgent().random}
    return get(url, request_headers).text

In [62]:
# Download the page of the second listed package; the result is kept in the
# notebook-global `text`, which the following cells read.
text = query_package(package_names[1])

In [63]:
def parse_package(html: str):
    """Extract the categories and tags from an artifact page.

    The function looks for a ``<table class="grid">`` and, inside it, for the
    rows whose ``<th>`` label is ``Categories`` or ``Tags``; the stripped text
    of each child of the row's ``<td>`` becomes one entry.

    Args:
        html: HTML source of the artifact page.

    Returns:
        A dict ``{"category": [...], "tag": [...]}``.  A list is empty when
        the table or the corresponding row is missing.
    """
    res = {"category": [], "tag": []}
    # Parse the argument that was passed in, not a notebook-global variable.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'grid'})
    if table is None:
        return res

    row_keys = {'Categories': 'category', 'Tags': 'tag'}
    for tr in table.find_all('tr'):
        # Rows without a header or a value cell carry nothing to extract.
        if tr.th is None or tr.td is None:
            continue
        key = row_keys.get(tr.th.text.strip())
        if key is None:
            continue
        values = (ele.text.strip() for ele in tr.td)
        # Whitespace-only children (e.g. text between links) yield '' – drop them.
        res[key] = [value for value in values if value]
    return res

In [64]:
# Parse the page stored in `text` and display the resulting dict.
parse_package(text)

{'category': ['Build Tools'], 'tag': ['apache', 'ant', 'build', 'tools']}

In [None]:
import time
from tqdm.auto import tqdm

# Scrape every package sequentially.
#   res_all:         package name -> parsed {'category': [...], 'tag': [...]}
#   failed_packages: package name -> repr of the error that prevented scraping
res_all = {}
failed_packages = {}

for package_name in tqdm(package_names):
    try:
        text = query_package(package_name)
        res = parse_package(text)
    except Exception as e:
        # Record and report the failure instead of skipping it silently, and
        # keep going so one bad page does not abort the whole run.
        failed_packages[package_name] = repr(e)
        print(f'[{package_name}] failed: {e!r}', file=sys.stderr)
        continue
    print(f'[{package_name}]', res)
    res_all[package_name] = res

HBox(children=(FloatProgress(value=0.0, max=904.0), HTML(value='')))

[ant:ant] {'category': ['Build Tools'], 'tag': ['ant', 'build', 'tools']}
[org.apache.ant:ant] {'category': ['Build Tools'], 'tag': ['apache', 'ant', 'build', 'tools']}
[org.apache.ant:ant-apache-regexp] {'category': [], 'tag': ['regexp', 'ant', 'apache']}
[org.apache.ant:ant-junit] {'category': [], 'tag': ['testing', 'junit', 'ant', 'apache']}
[org.apache.ant:ant-nodeps] {'category': [], 'tag': ['ant', 'apache']}
[org.apache.ant:ant-trax] {'category': [], 'tag': ['ant', 'apache']}
[commons-io:commons-io] {'category': ['I/O Utilities'], 'tag': ['io']}
[org.apache.commons:commons-io] {'category': ['I/O Utilities'], 'tag': ['apache', 'io', 'commons']}
[org.codehaus.plexus:plexus-utils] {'category': ['Core Utilities'], 'tag': ['codehaus']}
[antlr:antlr] {'category': ['Parser Generators'], 'tag': ['parser', 'compiler', 'generator']}
[org.antlr:antlr-runtime] {'category': [], 'tag': ['parser', 'compiler', 'runtime']}
[org.antlr:antlr4-runtime] {'category': [], 'tag': ['parser', 'compiler', 

In [None]:
# save py obj with pickle
import pickle
# pickle.dump writes bytes, so the file has to be opened in binary mode;
# text mode ('w') makes the dump raise TypeError.
with open('data/res_all.pkl', 'wb') as f:
    pickle.dump(res_all, f)

In [66]:
# save changes to excel
# Re-read the spreadsheet, give the columns readable names, index the rows by
# package and add empty 'category' / 'tag' columns to be filled in later.
df = pd.read_excel('data/cls.xlsx').rename(
    columns={'Unnamed: 1': 'package', 0: 'class'}
)
df2 = df.set_index('package').assign(category="", tag="")
df2

Unnamed: 0_level_0,class,category,tag
package,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ant:ant,0,,
org.apache.ant:ant,0,,
org.apache.ant:ant-apache-regexp,0,,
org.apache.ant:ant-junit,0,,
org.apache.ant:ant-nodeps,0,,
...,...,...,...
org.ocpsoft.rewrite:rewrite-servlet,113,,
org.webjars:bootstrap,114,,
de.agilecoders.wicket:wicket-bootstrap-core,114,,
org.xerial.snappy:snappy-java,115,,


In [72]:
# Copy the scraped data into the frame: the first category (if any) and the
# comma-joined tags (if any) of every scraped package, then export to Excel.
for package, scraped in res_all.items():
    categories = scraped['category']
    tags = scraped['tag']
    if categories:
        df2.loc[package, 'category'] = categories[0]
    if tags:
        df2.loc[package, 'tag'] = ','.join(tags)

df2.to_excel('data/cls2.xlsx')
df2

Unnamed: 0_level_0,class,category,tag
package,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ant:ant,0,Build Tools,"ant,build,tools"
org.apache.ant:ant,0,Build Tools,"apache,ant,build,tools"
org.apache.ant:ant-apache-regexp,0,,"regexp,ant,apache"
org.apache.ant:ant-junit,0,,"testing,junit,ant,apache"
org.apache.ant:ant-nodeps,0,,"ant,apache"
...,...,...,...
org.ocpsoft.rewrite:rewrite-servlet,113,,servlet
org.webjars:bootstrap,114,Web Assets,"web,bootstrap,assets"
de.agilecoders.wicket:wicket-bootstrap-core,114,,"wicket,bootstrap"
org.xerial.snappy:snappy-java,115,Compression Libraries,compression


#### The concurrent scraper is unusable because of the rate limit :(

In [None]:
from fake_useragent import UserAgent
import time
import asyncio
import aiohttp

class QueryException(Exception):
    """Raised when a package page could not be fetched."""

async def async_query_package(package_name: str, tries: int = 10, sleep_time: float = 0.5):
    """Asynchronously download the mvnrepository.com artifact page of a package.

    One initial request is made, followed by up to ``tries`` retries while
    the request raises or the response status is not 200.  Between attempts
    the coroutine awaits ``asyncio.sleep`` (so other tasks keep running) and
    doubles the delay each time.

    Args:
        package_name: Identifier whose ':'-separated parts become the path
            segments of the artifact URL.
        tries: Maximum number of retries after the initial request.
        sleep_time: Initial delay in seconds between attempts.

    Returns:
        The decoded response body (``str``) of the first 200 response.

    Raises:
        QueryException: If no attempt produced a 200 response.
    """
    altered_name = '/'.join(package_name.split(':'))
    url = 'https://mvnrepository.com/artifact/' + altered_name
    ua = UserAgent()

    # One session is shared by all attempts for this package.
    async with aiohttp.ClientSession() as session:
        for attempt in range(tries + 1):
            # Draw a fresh random User-Agent per attempt and actually send it.
            headers = {'User-Agent': ua.random}
            try:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        # text() decodes the body; read() would return bytes.
                        return await response.text()
                    failure = f'HTTP {response.status}'
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                failure = repr(e)

            print(f'[{altered_name}] Error: {failure} sleep_time={sleep_time} tries={attempt}',
                  file=sys.stderr)
            if attempt < tries:
                # Non-blocking sleep: time.sleep would stall the event loop.
                await asyncio.sleep(sleep_time)
                sleep_time *= 2

    raise QueryException(f'request failed for {url}')

In [80]:
async def handle_package(package_name: str):
    """Fetch and parse one package page.

    Args:
        package_name: Identifier passed on to ``async_query_package``.

    Returns:
        The dict produced by ``parse_package`` with an additional
        ``'package'`` key holding ``package_name``.
    """
    page = await async_query_package(package_name)
    parsed = parse_package(page)
    parsed['package'] = package_name
    print(parsed)
    return parsed

In [78]:
# Run handle_package for all packages concurrently and collect the results.
# NOTE(review): this rebinds `res_all` to the list returned by asyncio.gather,
# whereas the sequential scraping cell builds a dict under the same name —
# confirm which shape downstream cells expect.
res_all = await asyncio.gather(*(handle_package(pack) for pack in package_names))

TypeError: get() takes 2 positional arguments but 3 were given