In [1]:
# default_exp paperswithcode_tasks

In [2]:
# export
import pandas as pd
import re

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
pd.options.display.max_colwidth = 200

In [5]:
# export

# export


def clean_task_name(task_name):
    task_name = re.sub(r"\d+d", "", task_name)
    task_name = task_name.replace("-", " ")
    return task_name.lower().strip()


def get_paperswithcode_dfs(
    paperswithcode_filename="data/links-between-papers-and-code.json.gz",
    papers_filename="data/papers-with-abstracts.json.gz",
):
    paperswithcode_df = pd.read_json(paperswithcode_filename)
    paperswithcode_df["repo"] = paperswithcode_df["repo_url"].str.replace(
        "https://github.com/", ""
    )

    all_papers_df = pd.read_json(papers_filename)
    return paperswithcode_df, all_papers_df


def get_papers_with_repo_df(all_papers_df, paperswithcode_df, repo_names):
    """
    add repo information to arxiv paper information
    """
    paperswithcode_with_repo_df = paperswithcode_df[
        paperswithcode_df["repo"].isin(repo_names)
    ]
    paperswithcode_diff_columns = list(
        paperswithcode_with_repo_df.columns.difference(all_papers_df.columns)
    ) + ["paper_url"]
    papers_with_repo_df = all_papers_df[
        all_papers_df["paper_url"].isin(paperswithcode_with_repo_df["paper_url"])
    ]

    return papers_with_repo_df.merge(
        paperswithcode_with_repo_df[paperswithcode_diff_columns], on="paper_url"
    )


def get_papers_with_biggest_tasks(papers_with_repo_df, n_biggest_tasks):
    """
    fetch papers which contain at least one task that is in n_biggest_tasks (by number of task occurrences)
    """
    all_tasks = papers_with_repo_df.explode("tasks")["tasks"]
    biggest_tasks = all_tasks.value_counts()[:n_biggest_tasks]

    papers_with_repo_with_biggest_tasks_df = papers_with_repo_df[
        papers_with_repo_df["tasks"].apply(
            lambda tasks: any(task in biggest_tasks.index for task in tasks)
        )
    ]
    papers_with_repo_with_biggest_tasks_df[
        "most_common_task"
    ] = papers_with_repo_with_biggest_tasks_df["tasks"].apply(
        lambda tasks: biggest_tasks[
            [t for t in tasks if t in biggest_tasks.index]
        ].idxmax()
        if len(biggest_tasks[[t for t in tasks if t in biggest_tasks.index]]) > 0
        else None
    )
    return papers_with_repo_with_biggest_tasks_df


def get_papers_with_biggest_tasks_df(n_biggest_tasks=None):
    paperswithcode_df, all_papers_df = get_paperswithcode_dfs()
    n_biggest_tasks = (
        n_biggest_tasks if not n_biggest_tasks is None else len(paperswithcode_df)
    )
    papers_with_repo_df = get_papers_with_repo_df(
        all_papers_df, paperswithcode_df, paperswithcode_df["repo"]
    )
    return get_papers_with_biggest_tasks(
        papers_with_repo_df, n_biggest_tasks=n_biggest_tasks
    )

In [6]:
paperswithcode_df = pd.read_json("data/links-between-papers-and-code.json.gz")
paperswithcode_df["repo"] = paperswithcode_df["repo_url"].str.replace(
    "https://github.com/", ""
)

  paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '')


In [7]:
all_papers_df = pd.read_json("data/papers-with-abstracts.json.gz")

In [8]:
# python_files_df = pd.read_csv('data/python_files.csv')

In [9]:
# python_files_df

In [10]:
# export


def get_task_counts(cleaned_tasks):
    all_cleaned_tasks = cleaned_tasks.explode().dropna().apply(clean_task_name)
    cleaned_tasks = all_cleaned_tasks.drop_duplicates()
    return all_cleaned_tasks.value_counts()


def get_papers_with_valid_tasks(all_papers_df, cleaned_tasks, min_task_occurrences):
    task_counts = get_task_counts(cleaned_tasks)
    valid_tasks = task_counts[task_counts >= min_task_occurrences].index
    filtered_papers_tasks = cleaned_tasks.apply(
        lambda ts: [t for t in ts if t in valid_tasks]
    )
    papers_with_tasks_df = all_papers_df[filtered_papers_tasks.apply(len) > 0]
    papers_with_tasks_df["valid_tasks"] = filtered_papers_tasks[
        filtered_papers_tasks.apply(len) > 0
    ]
    return papers_with_tasks_df


def add_least_common_task(
    paperswithcode_with_tasks_df, cleaned_tasks, min_task_occurrences
):
    task_counts = get_task_counts(cleaned_tasks).sort_values()
    task_counts = task_counts[task_counts > min_task_occurrences]
    least_common_task = cleaned_tasks.apply(
        lambda ts: task_counts.loc[[t for t in ts if t in task_counts.index]].index[0]
        if any([t for t in ts if t in task_counts.index])
        else None
    )
    paperswithcode_with_tasks_df["least_common_task"] = least_common_task

In [12]:
# export


def get_paperswithcode_with_tasks_df(
    paperswithcode_df, all_papers_df, min_task_occurrences=10
):
    all_papers_df["cleaned_tasks"] = all_papers_df["tasks"].apply(
        lambda ts: [clean_task_name(t) for t in ts]
    )
    papers_with_valid_tasks_df = get_papers_with_valid_tasks(
        all_papers_df, all_papers_df["cleaned_tasks"], min_task_occurrences
    )
    paperswithcode_with_tasks_df = paperswithcode_df.merge(
        papers_with_valid_tasks_df[["title", "valid_tasks", "abstract"]],
        left_on="paper_title",
        right_on="title",
    )
    paperswithcode_with_tasks_df["tasks"] = paperswithcode_with_tasks_df["valid_tasks"]
    paperswithcode_with_tasks_df = paperswithcode_with_tasks_df.groupby("repo").apply(
        lambda df: df.loc[df["tasks"].apply(len).idxmax()]
    )
    add_least_common_task(
        paperswithcode_with_tasks_df,
        paperswithcode_with_tasks_df["valid_tasks"],
        min_task_occurrences,
    )
    paperswithcode_with_tasks_df.drop("valid_tasks", axis=1, inplace=True)
    all_valid_tasks = paperswithcode_with_tasks_df["least_common_task"].unique()
    paperswithcode_with_tasks_df["tasks"] = paperswithcode_with_tasks_df["tasks"].apply(
        lambda ts: [t for t in ts if t in all_valid_tasks]
    )
    paperswithcode_with_tasks_df = paperswithcode_with_tasks_df.dropna(
        axis=0, subset=["least_common_task"]
    )
    return paperswithcode_with_tasks_df

In [13]:
# export


def get_area_grouped_tasks(paperswithcode_tasks_path="data/paperswithcode_tasks.csv"):
    area_grouped_tasks = pd.read_csv("data/paperswithcode_tasks.csv").dropna()
    area_grouped_tasks["task"] = area_grouped_tasks["task"].apply(clean_task_name)
    area_counts = area_grouped_tasks["area"].value_counts()
    area_grouped_tasks = area_grouped_tasks[
        area_grouped_tasks["area"].isin(area_counts.index[area_counts > 1])
    ]
    return area_grouped_tasks

In [14]:
all_papers_df["tasks"][all_papers_df["tasks"].apply(len) > 0].shape

(113837,)

In [15]:
paperswithcode_with_tasks_df = get_paperswithcode_with_tasks_df(
    paperswithcode_df, all_papers_df, 10
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers_with_tasks_df['valid_tasks'] = filtered_papers_tasks[filtered_papers_tasks.apply(len) > 0]


In [16]:
paperswithcode_with_tasks_df.reset_index(drop=True)

Unnamed: 0,paper_url,paper_title,paper_arxiv_id,paper_url_abs,paper_url_pdf,repo_url,mentioned_in_paper,mentioned_in_github,framework,repo,title,abstract,tasks,least_common_task
0,https://paperswithcode.com/paper/a-unifying-generative-model-for-graph,"A Unifying Generative Model for Graph Learning Algorithms: Label Propagation, Graph Convolutions, and Combinations",2101.07730,https://arxiv.org/abs/2101.07730v2,https://arxiv.org/pdf/2101.07730v2.pdf,https://github.com/000Justin000/GaussianMRF,True,False,none,000Justin000/GaussianMRF,"A Unifying Generative Model for Graph Learning Algorithms: Label Propagation, Graph Convolutions, and Combinations",Semi-supervised learning on graphs is a widely applicable problem in network science and machine learning. Two standard algorithms -- label propagation and graph neural networks -- both operate by...,[graph learning],graph learning
1,https://paperswithcode.com/paper/graph-based-semi-supervised-active-learning,Graph-based Semi-Supervised & Active Learning for Edge Flows,1905.07451,https://arxiv.org/abs/1905.07451v1,https://arxiv.org/pdf/1905.07451v1.pdf,https://github.com/000Justin000/ssl_edge,True,True,none,000Justin000/ssl_edge,Graph-based Semi-Supervised & Active Learning for Edge Flows,"We present a graph-based semi-supervised learning (SSL) method for learning edge flows defined on a graph. Specifically, given flow measurements on a subset of edges, we want to predict the flows ...",[active learning],active learning
2,https://paperswithcode.com/paper/neural-ordinary-differential-equations,Neural Ordinary Differential Equations,1806.07366,https://arxiv.org/abs/1806.07366v5,https://arxiv.org/pdf/1806.07366v5.pdf,https://github.com/000Justin000/torchdiffeq,False,True,pytorch,000Justin000/torchdiffeq,Neural Ordinary Differential Equations,"We introduce a new family of deep neural network models. Instead of specifying a discrete sequence of hidden layers, we parameterize the derivative of the hidden state using a neural network. The ...","[latent variable models, multivariate time series forecasting, multivariate time series imputation]",latent variable models
3,https://paperswithcode.com/paper/one-shot-segmentation-in-clutter,One-Shot Segmentation in Clutter,1803.09597,http://arxiv.org/abs/1803.09597v2,http://arxiv.org/pdf/1803.09597v2.pdf,https://github.com/000c000l/oneShotLearningForSemanticSegmentation,False,True,tf,000c000l/oneShotLearningForSemanticSegmentation,One-Shot Segmentation in Clutter,"We tackle the problem of one-shot segmentation: finding and segmenting a\npreviously unseen object in a cluttered scene based on a single instruction\nexample. We propose a novel dataset, which we...",[omniglot],omniglot
4,https://paperswithcode.com/paper/speaker-recognition-from-raw-waveform-with,Speaker Recognition from Raw Waveform with SincNet,1808.00158,https://arxiv.org/abs/1808.00158v3,https://arxiv.org/pdf/1808.00158v3.pdf,https://github.com/008karan/SincNet_demo,False,True,pytorch,008karan/SincNet_demo,Speaker Recognition from Raw Waveform with SincNet,Deep learning is progressively gaining popularity as a viable alternative to i-vectors for speaker recognition. Promising results have been recently obtained with Convolutional Neural Networks (CN...,"[speaker identification, speaker recognition, speaker verification]",speaker identification
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43198,https://paperswithcode.com/paper/stargan-v2-diverse-image-synthesis-for,StarGAN v2: Diverse Image Synthesis for Multiple Domains,1912.01865,https://arxiv.org/abs/1912.01865v2,https://arxiv.org/pdf/1912.01865v2.pdf,https://github.com/zzz2010/starganv2_paddle,False,True,pytorch,zzz2010/starganv2_paddle,StarGAN v2: Diverse Image Synthesis for Multiple Domains,A good image-to-image translation model should learn a mapping between different visual domains while satisfying the following properties: 1) diversity of generated images and 2) scalability over ...,"[image generation, image to image translation]",image generation
43199,https://paperswithcode.com/paper/explaining-image-classifiers-by,Explaining Image Classifiers by Counterfactual Generation,1807.08024,http://arxiv.org/abs/1807.08024v3,http://arxiv.org/pdf/1807.08024v3.pdf,https://github.com/zzzace2000/FIDO-saliency,True,True,pytorch,zzzace2000/FIDO-saliency,Explaining Image Classifiers by Counterfactual Generation,"When an image classifier makes a prediction, which parts of the image are\nrelevant and why? We can rephrase this question to ask: which parts of the\nimage, if they were not seen by the classifie...",[image classification],image classification
43200,https://paperswithcode.com/paper/dynamic-measurement-scheduling-for-event,Dynamic Measurement Scheduling for Event Forecasting using Deep RL,1901.09699,https://arxiv.org/abs/1901.09699v3,https://arxiv.org/pdf/1901.09699v3.pdf,https://github.com/zzzace2000/autodiagnosis,True,True,tf,zzzace2000/autodiagnosis,Dynamic Measurement Scheduling for Event Forecasting using Deep RL,"Imagine a patient in critical condition. What and when should be measured to forecast detrimental events, especially under the budget constraints? We answer this question by deep reinforcement lea...",[mortality prediction],mortality prediction
43201,https://paperswithcode.com/paper/dropout-feature-ranking-for-deep-learning,Dropout Feature Ranking for Deep Learning Models,1712.08645,http://arxiv.org/abs/1712.08645v2,http://arxiv.org/pdf/1712.08645v2.pdf,https://github.com/zzzace2000/dropout-feature-ranking,False,True,pytorch,zzzace2000/dropout-feature-ranking,Dropout Feature Ranking for Deep Learning Models,"Deep neural networks (DNNs) achieve state-of-the-art results in a variety of\ndomains. Unfortunately, DNNs are notorious for their non-interpretability, and\nthus limit their applicability in hypo...",[time series],time series


In [None]:
paperswithcode_with_tasks_df["tasks"].explode().value_counts()

In [None]:
paperswithcode_with_tasks_df

In [None]:
least_common_task_counts = paperswithcode_with_tasks_df[
    "least_common_task"
].value_counts()

In [None]:
selected_least_common_tasks = least_common_task_counts[least_common_task_counts > 4]

In [None]:
paperswithcode_with_tasks_df[
    paperswithcode_with_tasks_df["least_common_task"].isin(
        selected_least_common_tasks.index
    )
].shape

In [72]:
paperswithcode_with_tasks_df.reset_index(drop=True).to_csv(
    "data/paperswithcode_with_tasks.csv"
)

In [None]:
paperswithcode_with_tasks_df

In [None]:
paperswithcode_with_tasks_df["least_common_task"].value_counts()

In [None]:
paperswithcode_with_tasks_df["tasks"].apply(len).value_counts()

In [None]:
papers_with_tasks_df = all_papers_df[all_papers_df["tasks"].apply(len) > 0]

In [None]:
papers_with_tasks_df.shape

In [None]:
all_papers_df["tasks"]

In [None]:
paperswithcode_df["repo"] = paperswithcode_df["repo_url"].str.replace(
    "https://github.com/", ""
)

In [None]:
paperswithcode_repos = paperswithcode_df["repo"]

In [None]:
len(set(paperswithcode_repos))

In [None]:
len(repo_names)

In [None]:
len(set(repo_names).intersection(paperswithcode_repos))

In [None]:
paperswithcode_df["repo"]

In [None]:
repo_names

In [None]:
paperswithcode_df.columns

In [None]:
papers_with_repo_df = get_papers_with_repo_df(
    all_papers_df, paperswithcode_df, repo_names
)
papers_with_repo_df["tasks"].apply(len).value_counts()

In [None]:
all_tasks = papers_with_repo_df.explode("tasks")["tasks"]

In [None]:
all_tasks.nunique()

In [None]:
all_tasks.value_counts()[all_tasks.value_counts() > 10]  # [:101].to_dict()

In [None]:
set(all_tasks)

In [None]:
papers_with_repo_with_biggest_tasks_df = get_papers_with_biggest_tasks(
    papers_with_repo_df, None
)

In [None]:
papers_with_repo_with_biggest_tasks_df[
    papers_with_repo_with_biggest_tasks_df["tasks"].apply(
        lambda tasks: "Hierarchical structure" in tasks
    )
]["title"]

In [None]:
papers_with_repo_with_biggest_tasks_df[
    papers_with_repo_with_biggest_tasks_df["tasks"].apply(len) > 1
]

In [None]:
papers_with_repo_with_biggest_tasks_df.shape

## Selecting most common task

In [None]:
papers_with_repo_with_biggest_tasks_df["most_common_task"].value_counts()[:100].sum()

In [None]:
duplicated_classes = {
    "Document Classification": "Text Classification",
    "Abstractive Text Summarization": "Text Summarization",
    "3D Human Pose Estimation": "Pose Estimation",
    "Semantic Similarity": "Semantic Textual Similarity",
    "Trajectory Prediction": "Autonomous Vehicles",
    "Autonomous Driving": "Autonomous Vehicles",
    "Feature Importance": "Feature Selection",
    "Visual Tracking": "Object Tracking",
    "Object Recognition": "Object Detection",
    "Multi-Task Learning": "Transfer Learning",
}

In [None]:
questionable_duplicated_classes = {
    "Adversarial Attack": "Adversarial Machine Learning",
    "Adversarial Defense": "Adversarial Machine Learning",
    "Voice Conversion": "Speech Generation",
    "Lesion Segmentation": "Semantic Segmentation",
}

In [None]:
invalid_classes = ["Text-To-Sql", "Hiearchical structure"]

In [None]:
most_common_task_counts = papers_with_repo_with_biggest_tasks_df[
    "most_common_task"
].value_counts()

In [None]:
most_common_task_counts[
    most_common_task_counts > 10
].sum()  # most_common_task_counts[:150].to_dict()

In [None]:
most_common_task_counts.shape

In [None]:
papers_with_repo_with_biggest_tasks_df["most_common_task"].value_counts()[
    :100
].plot.bar()

## Selecting most matching task

Matching is defined using similarity of embeddings of task name and article title

In [None]:
import numpy as np
import tqdm
from sklearn import metrics

In [None]:
text = papers_with_repo_with_biggest_tasks_df.iloc[2]["title"]
matched_texts = papers_with_repo_with_biggest_tasks_df.iloc[2]["tasks"]

In [None]:
matched_texts

In [None]:
import paperswithcode

client = paperswithcode.PapersWithCodeClient()

In [None]:
len(client.area_task_list("computer-vision", page=2, items_per_page=1000).results)

In [None]:
papers_with_repo_with_biggest_tasks_df.head()

In [None]:
dict(client.task_get("trajectory-prediction"))

In [None]:
client.task_paper_list("trajectory-prediction")

In [None]:
paper_id = papers_with_repo_with_biggest_tasks_df["paper_url"].iloc[1].split("/")[-1]

In [None]:
paper_id.split("/")[-1]

In [None]:
dict(client.paper_get(paper_id))

In [None]:
paper_id

In [None]:
tasks = client.http.get(f"/papers/{paper_id}/tasks/")["results"]

In [None]:
[paperswithcode.models.Task(**task) for task in tasks]

id='adversarial' name='Adversarial'
id='adversarial' name='Adversarial'
adversarial : 15
id='audio' name='Audio'
id='audio' name='Audio'
audio : 41
id='computer-code' name='Computer Code'
id='computer-code' name='Computer Code'
computer-code : 40
id='computer-vision' name='Computer Vision'
id='computer-vision' name='Computer Vision'
id='computer-vision' name='Computer Vision'
computer-vision : 961
id='graphs' name='Graphs'
id='graphs' name='Graphs'
graphs : 65
id='knowledge-base' name='Knowledge Base'
id='knowledge-base' name='Knowledge Base'
knowledge-base : 24
id='medical' name='Medical'
id='medical' name='Medical'
medical : 199
id='methodology' name='Methodology'
id='methodology' name='Methodology'
methodology : 157
id='miscellaneous' name='Miscellaneous'
id='miscellaneous' name='Miscellaneous'
miscellaneous : 143
id='music' name='Music'
id='music' name='Music'
music : 17
id='natural-language-processing' name='Natural Language Processing'
id='natural-language-processing' name='Natur

In [16]:
len(client.area_task_list("computer-vision", page=3, items_per_page=1000).results)

HttpClientError: HttpClientError(404: Not found.)

In [17]:
area_tasks_df = pd.DataFrame(
    {"area": area_grouped_tasks.keys(), "task": area_grouped_tasks.values()}
).explode("task")

In [20]:
papers_with_repo_df["task"] = papers_with_repo_df["tasks"]

NameError: name 'papers_with_repo_df' is not defined

In [22]:
area_tasks_df["area"].value_counts()

computer-vision                961
natural-language-processing    458
medical                        199
methodology                    157
miscellaneous                  143
time-series                     68
graphs                          65
speech                          51
audio                           41
computer-code                   40
playing-games                   40
robots                          31
knowledge-base                  24
reasoning                       21
music                           17
adversarial                     15
Name: area, dtype: int64

In [19]:
papers_task_exploded_df = papers_with_repo_df.explode("task")

NameError: name 'papers_with_repo_df' is not defined

In [None]:
task_api_normalized = papers_task_exploded_df["task"].str.lower().str.replace(" ", "-")

In [None]:
task_api_normalized

In [None]:
tasks_without_area = task_api_normalized[
    ~task_api_normalized.isin(area_tasks_df["task"])
].unique()

In [None]:
other_tasks_df = pd.DataFrame({"area": "miscellaneous", "task": tasks_without_area})

In [None]:
all_area_tasks_df = pd.concat([area_tasks_df, other_tasks_df])

In [None]:
all_area_tasks_df.head()

In [None]:
all_area_tasks_df.to_csv("data/paperswithcode_tasks.csv", index=None)

In [None]:
papers_task_exploded_df["normalized_task"] = task_api_normalized

In [None]:
papers_area_df = papers_task_exploded_df.merge(
    all_area_tasks_df, left_on="normalized_task", right_on="task", suffixes=["", "_"]
).drop(columns=["task_"])

In [None]:
all_area_tasks_df

In [None]:
from sklearn import model_selection

In [None]:
papers_area_df.columns

In [None]:
papers_area_df["area"].value_counts()

In [None]:
papers_area_df.groupby(["area", "task"]).agg("count")["paper_url"]

In [None]:
area_counts = papers_area_df["area"].value_counts()
area_weights = area_counts.copy()
area_weights = area_weights / area_weights.sum()

In [None]:
area_tasks_df[area_tasks_df["area"] == "adversarial"]

In [None]:
papers_area_df[papers_area_df["area"] == "adversarial"]["task"]

In [None]:
train_tasks_df, test_tasks_df = model_selection.train_test_split(
    all_area_tasks_df, test_size=0.2, stratify=all_area_tasks_df["area"]
)

In [None]:
test_tasks_df

In [None]:
papers_train_df, papers_test_df = 

In [None]:
papers_test_df.shape

In [None]:
papers_train_df

In [None]:
with mlutil.maybe_pickler("/tmp/foo.pkl") as writer:
    writer.write_pickle_if_not_exists(lambda: papers_area_df.iloc[1:])

In [None]:
"https://dfkiqyg0xf.execute-api.us-east-2.amazonaws.com/DEV2/storage/humtap-contributions/"
audio_contributions/audio/10_0FCBDDA0-953F-46A9-86B8-0AC8EAC89F03.opus|