In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt

## Papers with code taxonomy

In [2]:
with open('papers_with_code_taxonomy.json', 'r') as f:
    papers_with_code = json.load(f)
#papers_with_code

In [3]:
papers_with_code[0]

{'url': 'https://paperswithcode.com/method/deep-ensembles',
 'name': 'Deep Ensembles',
 'full_name': 'Deep Ensembles',
 'description': '',
 'paper': {'title': 'Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles',
  'url': 'https://paperswithcode.com/paper/simple-and-scalable-predictive-uncertainty'},
 'introduced_year': 2000,
 'source_url': 'http://arxiv.org/abs/1612.01474v3',
 'source_title': 'Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles',
 'code_snippet_url': None,
 'num_papers': 103,
 'collections': [{'collection': 'Stochastic Optimization',
   'area_id': 'general',
   'area': 'General'}]}

In [4]:
terms = set([])


In [5]:
term_count = {}
def add_to_count(term_count, term, taxonomy):
    if term in term_count.keys():
        term_count[term]['#papers'] += 1
        #print(term_count[term])
        if taxonomy not in term_count[term]['taxonomies']:
            term_count[term]['taxonomies'].append(taxonomy)
        
    else:
        term_count[term] = {'taxonomies': [taxonomy], '#papers': 1}
        #term_count[term]['taxonomies'].add()

In [6]:
for paper in papers_with_code:
    if 'collections' in paper.keys():
        for collection in paper['collections']:
            if 'area' in collection.keys() and 'collection' in collection.keys():
                mainclass = collection['area'].lower()
                subclass = collection['collection'].lower()
                terms.add(mainclass)
                terms.add(subclass)
                add_to_count(term_count, mainclass, 'papers_with_code')
                add_to_count(term_count, subclass, 'papers_with_code')

## Wikidata taxonomy

### AI

In [7]:
with open('query_ai2.json', 'r') as f:
    wikidata_ml = json.load(f)
for ml in wikidata_ml:
    mainclass = ml['aiclassLabel'].lower()
    add_to_count(term_count, mainclass, 'wikidata_ai')
    terms.add(mainclass)

### ML

In [8]:
with open('query_ml1.json', 'r') as f:
    wikidata_ml = json.load(f)
for ml in wikidata_ml:
    mainclass = ml['aiclassLabel'].lower()
    add_to_count(term_count, mainclass, 'wikidata_ml')
    terms.add(mainclass)

## CSO

In [25]:
df_cso = pd.read_csv('CSO.3.3_taxonomy.csv', header=None)
important_stuff = []
for ind, x in enumerate(df_cso.values):
    print(x)
    break
    if 'artificial_intelligence' in x[0]:
        subclass = x[2].split('/')[-1][:-1].replace('_', ' ').lower()
        add_to_count(term_count, subclass, 'cso')
        important_stuff.append(subclass)

['<https://cso.kmi.open.ac.uk/topics/computer_science>'
 '<http://cso.kmi.open.ac.uk/schema/cso#superTopicOf>'
 '<https://cso.kmi.open.ac.uk/topics/artificial_intelligence>']


In [10]:
important_stuff

['decision theory',
 'intelligent control',
 'natural language processing systems',
 'formal logic',
 'cognitive systems',
 'soft computing',
 'inference engines',
 'medical computing',
 'system theory',
 'decision support systems',
 'decision support system %28dss%29',
 'bayesian networks',
 'bayesian network',
 'machine learning',
 'machine-learning',
 'cellular automata',
 'cellular automata %28ca%29',
 'knowledge based systems',
 'knowledge-based systems',
 'expert systems',
 'expert system',
 'multi agent system %28mas%29',
 'multi-agent system',
 'multi-agent systems',
 'multi agent systems',
 'multiagent systems',
 'intelligent robots',
 'ambient intelligence',
 'heuristic programming',
 'intelligent tutoring',
 'intelligent tutoring system',
 'intelligent tutoring systems',
 'intelligence analysis',
 'ai planning',
 'planning algorithms',
 'constraint satisfaction problems %28csp%29',
 'software engineering',
 'data mining',
 'image interpretation%2c computer-assisted',
 'compu

In [11]:
for k in term_count.keys():
    print(f"term: {k} - #papers: {term_count[k]['#papers']} - taxonomies: {term_count[k]['taxonomies']}")

term: general - #papers: 990 - taxonomies: ['papers_with_code']
term: stochastic optimization - #papers: 57 - taxonomies: ['papers_with_code']
term: computer vision - #papers: 974 - taxonomies: ['papers_with_code']
term: image generation models - #papers: 8 - taxonomies: ['papers_with_code']
term: one-stage object detection models - #papers: 22 - taxonomies: ['papers_with_code']
term: object detection models - #papers: 61 - taxonomies: ['papers_with_code']
term: graphs - #papers: 108 - taxonomies: ['papers_with_code']
term: graph representation learning - #papers: 10 - taxonomies: ['papers_with_code']
term: face recognition models - #papers: 6 - taxonomies: ['papers_with_code']
term: image retrieval models - #papers: 3 - taxonomies: ['papers_with_code']
term: skip connection blocks - #papers: 46 - taxonomies: ['papers_with_code']
term: audio - #papers: 57 - taxonomies: ['papers_with_code']
term: audio model blocks - #papers: 13 - taxonomies: ['papers_with_code']
term: generative advers

### Paper support for wikidata

In [12]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import json

In [13]:
endpoint_url = "https://query.wikidata.org/sparql"

In [14]:
def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

In [15]:
with open('query_ai2.json', 'r') as f:
    wikidata_ml = json.load(f)
for ml in wikidata_ml:
    mainclass = ml['aiclass'].split('/')[-1]
    mainclassLabel = ml['aiclassLabel'].lower()

    query_ml_related = query_ml = """select ?tool ?toolLabel ?paper ?paperLabel where {
      ?paper wdt:P921 wd:%s .
      ?tool wdt:P31 wd:Q188860 .
      ?tool wdt:P1343 ?paper .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }}
    """ % (mainclass)
    results_ml = get_results(endpoint_url, query_ml_related)
    
    for result_ml in results_ml["results"]["bindings"]:
        term_count[mainclassLabel]['#papers'] += 1
    
    query_ml_related = query_ml = """select ?tool ?toolLabel where {
      ?tool wdt:P366 wd:%s .
      ?tool wdt:P31 wd:Q188860 .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }}
    """ % (mainclass)
    results_ml1 = get_results(endpoint_url, query_ml_related)
    for ind, result_ml1 in enumerate(results_ml1["results"]["bindings"]):
        term_count[mainclassLabel]['#papers'] += 1
    count = len(results_ml1["results"]["bindings"]) + len(results_ml1["results"]["bindings"])
    print(f'{mainclassLabel} - {count}')

machine learning - 14
natural language processing - 2
expert system - 0
decision support system - 0
computational intelligence - 0
affective computing - 0
evolutionary computation - 0
knowledge engineering - 0
heuristic algorithm - 0
automated planning and scheduling - 0
distributed artificial intelligence - 0
knowledge representation and reasoning - 0
self-management - 0
applications of artificial intelligence - 0
symbolic artificial intelligence - 0
neurorobotics - 0
intelligent robotics - 0
fictional robot - 0
artificial intelligence in fiction - 0
artificial empathy - 0
artificial intelligence in healthcare - 0
explainable ai - 0
federated learning - 0
multimodal sentiment analysis - 0
artificial intelligence in wikimedia projects - 0
quantum artificial intelligence - 0
artificial chemist - 0
associatron - 0
automated music production - 0
autonomous navigation - 0
green ai - 0
red ai - 0
ethical artificial intelligence - 0
responsible ai - 0
prompt engineering - 0
vision and langua

In [19]:
with open('query_ml1.json', 'r') as f:
    wikidata_ml = json.load(f)
for ml in wikidata_ml:
    mainclass = ml['aiclass'].split('/')[-1]
    mainclassLabel = ml['aiclassLabel'].lower()
    #print(mainclassLabel + '\n')

    query_ml_related = query_ml = """select ?tool ?toolLabel ?paper ?paperLabel where {
      ?paper wdt:P921 wd:%s .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }}
    """ % (mainclass)
    results_ml = get_results(endpoint_url, query_ml_related)
    for result_ml in results_ml["results"]["bindings"]:
        term_count[mainclassLabel]['#papers'] += 1
    break
    query_ml_related = query_ml = """select ?tool ?toolLabel where {
      ?tool wdt:P366 wd:%s .
      ?tool wdt:P31 wd:Q188860 .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }}
    """ % (mainclass)
    results_ml1 = get_results(endpoint_url, query_ml_related)
    for ind, result_ml1 in enumerate(results_ml1["results"]["bindings"]):
        term_count[mainclassLabel]['#papers'] += 1
    count = len(results_ml1["results"]["bindings"]) + len(results_ml1["results"]["bindings"])
    print(f'{mainclassLabel} - {count}')

explanation-based learning - 0
pattern recognition - 0
hierarchical temporal memory - 0
reinforcement learning - 0
chemometrics - 0
unsupervised learning - 0
statistical classification - 0
transfer learning - 0
meta-learning - 0
multi-task learning - 0
nonlinear dimensionality reduction - 0
offline learning - 0
one-shot learning - 0
online machine learning - 0
preference learning - 0
robot learning - 0
statistical relational learning - 0
structured prediction - 0
feature learning - 0
quantum machine learning - 0
adversarial machine learning - 0
multiple kernel learning - 0
multimodal learning - 0
m-theory - 0
rule-based machine learning - 0
incremental learning - 0
machine learning in bioinformatics - 0
sequence-to-sequence learning - 0
automated machine learning - 0
image-to-image translation - 0
federated learning - 0
interactive machine learning - 0
end-to-end learning - 0
knowledge distillation - 0
self-supervised learning - 2
machine learning in physics - 0
zero-shot learning - 0


In [22]:
for k in term_count.keys():
    if term_count[k]['#papers'] > 20:
        print(f"term: {k} - #papers: {term_count[k]['#papers']} - taxonomies: {term_count[k]['taxonomies']}")

term: general - #papers: 990 - taxonomies: ['papers_with_code']
term: stochastic optimization - #papers: 57 - taxonomies: ['papers_with_code']
term: computer vision - #papers: 974 - taxonomies: ['papers_with_code']
term: one-stage object detection models - #papers: 22 - taxonomies: ['papers_with_code']
term: object detection models - #papers: 61 - taxonomies: ['papers_with_code']
term: graphs - #papers: 108 - taxonomies: ['papers_with_code']
term: skip connection blocks - #papers: 46 - taxonomies: ['papers_with_code']
term: audio - #papers: 57 - taxonomies: ['papers_with_code']
term: generative adversarial networks - #papers: 43 - taxonomies: ['papers_with_code']
term: graph models - #papers: 61 - taxonomies: ['papers_with_code']
term: feature extractors - #papers: 26 - taxonomies: ['papers_with_code']
term: attention mechanisms - #papers: 72 - taxonomies: ['papers_with_code']
term: sequential - #papers: 73 - taxonomies: ['papers_with_code']
term: activation functions - #papers: 62 - t

In [23]:
for k in term_count.keys():
    if len(term_count[k]['taxonomies']) > 1:
        print(f"term: {k} - #papers: {term_count[k]['#papers']} - taxonomies: {term_count[k]['taxonomies']}")

term: natural language processing - #papers: 286 - taxonomies: ['papers_with_code', 'wikidata_ai', 'cso']
term: reinforcement learning - #papers: 101 - taxonomies: ['papers_with_code', 'wikidata_ai', 'wikidata_ml']
term: heuristic search algorithms - #papers: 7 - taxonomies: ['papers_with_code', 'cso']
term: self-supervised learning - #papers: 51 - taxonomies: ['papers_with_code', 'wikidata_ai', 'wikidata_ml']
term: active learning - #papers: 6 - taxonomies: ['papers_with_code', 'wikidata_ai', 'wikidata_ml']
term: knowledge distillation - #papers: 12 - taxonomies: ['papers_with_code', 'wikidata_ai', 'wikidata_ml']
term: structured prediction - #papers: 5 - taxonomies: ['papers_with_code', 'wikidata_ai', 'wikidata_ml']
term: prompt engineering - #papers: 3 - taxonomies: ['papers_with_code', 'wikidata_ai']
term: speech recognition - #papers: 5 - taxonomies: ['papers_with_code', 'wikidata_ai']
term: inference engines - #papers: 2 - taxonomies: ['papers_with_code', 'cso']
term: machine lea

In [18]:
with open('tax_results.json', 'w') as outfile:
    json.dump(term_count, outfile, indent=4)