In [1]:
#default_exp paperswithcode_tasks

In [2]:
#export
import pandas as pd

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
pd.options.display.max_colwidth = 200

In [5]:
#export


def get_paperswithcode_dfs(paperswithcode_filename='data/links-between-papers-and-code.json.gz', papers_filename='data/papers-with-abstracts.json.gz'):
    """
    load the paper <-> repository links table and the papers table from JSON files

    paperswithcode_filename: path of the JSON file with the paper/code links
        (must contain a 'repo_url' column)
    papers_filename: path of the JSON file with the papers

    returns a tuple (paperswithcode_df, all_papers_df); paperswithcode_df gets
    an extra 'repo' column holding 'repo_url' with the GitHub URL prefix removed
    """
    paperswithcode_df = pd.read_json(paperswithcode_filename)
    # regex=False: the prefix is a literal string. Interpreted as a regular
    # expression its '.' would match any character, and the default of
    # `regex` differs between pandas versions.
    paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '', regex=False)

    all_papers_df = pd.read_json(papers_filename)
    return paperswithcode_df, all_papers_df


def get_papers_with_repo_df(all_papers_df, paperswithcode_df, repo_names):
    """
    join paper rows with the repository links of the selected repositories

    Only links whose 'repo' is in repo_names are used, and only papers that
    have at least one such link are kept. Columns of the links table that the
    papers table does not already have are attached by merging on 'paper_url'
    (a paper linked to several selected repos appears once per link).
    """
    selected_links_df = paperswithcode_df[paperswithcode_df['repo'].isin(repo_names)]

    # columns only the links table has, plus the join key
    link_only_columns = list(selected_links_df.columns.difference(all_papers_df.columns))
    link_only_columns.append('paper_url')

    has_selected_repo = all_papers_df['paper_url'].isin(selected_links_df['paper_url'])
    linked_papers_df = all_papers_df[has_selected_repo]

    return linked_papers_df.merge(selected_links_df[link_only_columns], on='paper_url')


def get_papers_with_biggest_tasks(papers_with_repo_df, n_biggest_tasks):
    """
    fetch papers which contain at least one task that is in n_biggest_tasks (by number of task occurrences)

    papers_with_repo_df: dataframe with a 'tasks' column holding a list of task names per paper
    n_biggest_tasks: how many of the most frequent tasks to keep

    returns a copy of the matching rows with an extra 'most_common_task' column:
    the paper's task with the highest overall occurrence count
    (the input dataframe is not modified)
    """
    task_counts = papers_with_repo_df.explode('tasks')['tasks'].value_counts()
    # value_counts sorts by descending count, so the first n rows are the biggest tasks
    biggest_tasks = task_counts.iloc[:n_biggest_tasks]
    biggest_task_names = set(biggest_tasks.index)

    def _most_common_task(tasks):
        # restrict to the paper's tasks that made the cut, then pick the most frequent one
        candidates = [task for task in tasks if task in biggest_task_names]
        if not candidates:
            return None
        return biggest_tasks[candidates].idxmax()

    has_biggest_task = papers_with_repo_df['tasks'].apply(
        lambda tasks: any(task in biggest_task_names for task in tasks)
    ).astype(bool)
    # explicit copy: adding a column to a filtered slice triggers SettingWithCopyWarning
    papers_with_repo_with_biggest_tasks_df = papers_with_repo_df[has_biggest_task].copy()
    papers_with_repo_with_biggest_tasks_df['most_common_task'] = (
        papers_with_repo_with_biggest_tasks_df['tasks'].apply(_most_common_task)
    )
    return papers_with_repo_with_biggest_tasks_df

In [6]:
paperswithcode_df = pd.read_json('data/links-between-papers-and-code.json.gz')
# regex=False: strip the literal GitHub prefix (as a regex its '.' would match any character)
paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '', regex=False)

In [7]:
all_papers_df = pd.read_json('data/papers-with-abstracts.json.gz')

In [None]:
python_files_df = pd.read_csv('data/python_files.csv')

In [None]:
python_files_df

In [None]:
# build 'owner/repo_name' identifiers for every row of the python files table
python_files_df['repo_name_with_owner'] = python_files_df['owner']  + '/' + python_files_df['repo_name']
# distinct repositories; later compared against paperswithcode_df['repo']
repo_names = python_files_df['repo_name_with_owner'].unique()

In [None]:
all_papers_df.info()

In [None]:
# regex=False: strip the literal GitHub prefix (as a regex its '.' would match any character)
paperswithcode_df['repo'] = paperswithcode_df['repo_url'].str.replace('https://github.com/', '', regex=False)

In [None]:
paperswithcode_repos = paperswithcode_df['repo']

In [None]:
len(set(paperswithcode_repos))

In [None]:
len(repo_names)

In [None]:
len(set(repo_names).intersection(paperswithcode_repos))

In [None]:
paperswithcode_df['repo']

In [None]:
repo_names

In [None]:
paperswithcode_df.columns

In [None]:
papers_with_repo_df = get_papers_with_repo_df(all_papers_df, paperswithcode_df, repo_names)
papers_with_repo_df['tasks'].apply(len).value_counts()

In [None]:
all_tasks = papers_with_repo_df.explode('tasks')['tasks'] 

In [None]:
all_tasks.nunique()

In [None]:
# count once and reuse the result for both the values and the filter mask
task_counts = all_tasks.value_counts()
task_counts[task_counts > 10]

In [None]:
papers_with_repo_with_biggest_tasks_df = get_papers_with_biggest_tasks(papers_with_repo_df, 200)

In [None]:
papers_with_repo_with_biggest_tasks_df[papers_with_repo_with_biggest_tasks_df['tasks'].apply(lambda tasks: 'Hierarchical structure' in tasks)]['title']

In [None]:
papers_with_repo_with_biggest_tasks_df[papers_with_repo_with_biggest_tasks_df['tasks'].apply(len) > 1]

In [None]:
papers_with_repo_with_biggest_tasks_df.shape

## Selecting most common task

In [None]:
papers_with_repo_with_biggest_tasks_df['most_common_task'].value_counts()[:100].sum()

In [None]:
# Task names treated as duplicates: each key is a task name, each value the
# task name it duplicates.
# NOTE(review): presumably keys are meant to be replaced by their values when
# task labels are normalized; this mapping is not applied in this notebook -- confirm.
duplicated_classes = {
    "Document Classification": "Text Classification",
    "Abstractive Text Summarization": "Text Summarization",
    "3D Human Pose Estimation": "Pose Estimation",
    "Semantic Similarity": "Semantic Textual Similarity",
    "Trajectory Prediction": "Autonomous Vehicles",
    "Autonomous Driving": "Autonomous Vehicles",
    "Feature Importance": "Feature Selection",
    "Visual Tracking": "Object Tracking",
    "Object Recognition": "Object Detection",
    "Multi-Task Learning": "Transfer Learning"
}

In [None]:
# Same key -> value layout as a duplicate-task mapping, but for pairs where
# treating the key as a duplicate of the value is debatable.
# NOTE(review): not applied anywhere in this notebook -- confirm intended use.
questionable_duplicated_classes = {
    "Adversarial Attack": "Adversarial Machine Learning",
    "Adversarial Defense": "Adversarial Machine Learning",
    "Voice Conversion": "Speech Generation",
    "Lesion Segmentation": "Semantic Segmentation"
}

In [None]:
# Task names considered invalid as classes.
# "Hierarchical structure" is spelled the same way as in the filter applied to
# the 'tasks' column elsewhere in this notebook, so the name can actually match.
invalid_classes = [
    "Text-To-Sql",
    "Hierarchical structure"
]

In [None]:
most_common_task_counts = papers_with_repo_with_biggest_tasks_df['most_common_task'].value_counts()

In [None]:
most_common_task_counts[most_common_task_counts > 10].sum()#most_common_task_counts[:150].to_dict()

In [None]:
most_common_task_counts.shape

In [None]:
papers_with_repo_with_biggest_tasks_df['most_common_task'].value_counts()[:100].plot.bar()

## Selecting most matching task

For each paper, the best-matching task is the one whose task-name embedding is most similar (cosine similarity by default) to the embedding of the paper title.

In [None]:
import numpy as np
import tqdm
from sklearn import metrics

In [None]:
# take the third paper as a worked example: its title and its list of tasks
example_paper = papers_with_repo_with_biggest_tasks_df.iloc[2]
text = example_paper['title']
matched_texts = example_paper['tasks']

In [None]:
matched_texts

In [None]:
from sentence_transformers import SentenceTransformer
sentence_embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [None]:
def select_best_matches(text, matched_texts, similarity=metrics.pairwise.cosine_similarity):
    """
    return the element of matched_texts that is most similar to text

    Both text and every candidate are encoded with the notebook-level
    sentence_embedder; similarity is called with the single text embedding and
    the candidate embeddings, and the candidate at the position of the highest
    score is returned.
    """
    query_embedding = sentence_embedder.encode([text])
    candidate_embeddings = sentence_embedder.encode(matched_texts)
    scores = similarity(query_embedding, candidate_embeddings)
    best_position = np.argmax(scores)
    return matched_texts[best_position]

In [None]:
matched_texts

In [None]:
select_best_matches(text, matched_texts)

In [None]:
papers_with_repo_with_biggest_tasks_df['title_matched_task'] = ''

In [None]:
papers_with_repo_with_biggest_tasks_df.shape

In [None]:
# Compute the best-matching task for every paper and assign the whole column at once.
# The chained form df['col'].loc[i] = value writes into a temporary Series and is not
# guaranteed to reach the frame (SettingWithCopyWarning; no effect under copy-on-write).
papers_with_repo_with_biggest_tasks_df['title_matched_task'] = [
    select_best_matches(title, tasks)
    for title, tasks in tqdm.tqdm(
        zip(papers_with_repo_with_biggest_tasks_df['title'], papers_with_repo_with_biggest_tasks_df['tasks']),
        total=papers_with_repo_with_biggest_tasks_df.shape[0],
    )
]

In [None]:
matched_task_counts = papers_with_repo_with_biggest_tasks_df['title_matched_task'].value_counts()

In [None]:
matched_task_counts[matched_task_counts > 10].sum()

In [None]:
papers_with_repo_with_biggest_tasks_df[['title', 'most_common_task']].head(20)

In [None]:
papers_with_repo_with_biggest_tasks_df['title_matched_task'].value_counts()[:150].sum()#.to_dict()

In [None]:
papers_with_repo_with_biggest_tasks_df['title_matched_task'].value_counts()[:200].plot.bar()

In [None]:
import paperswithcode

client = paperswithcode.PapersWithCodeClient()

In [None]:
client.area_task_list('computer-vision', items_per_page=1000).results

In [None]:
papers_with_repo_with_biggest_tasks_df.head()

In [None]:
dict(client.task_get('trajectory-prediction'))

In [None]:
client.task_paper_list('trajectory-prediction')

In [None]:
paper_id = papers_with_repo_with_biggest_tasks_df['paper_url'].iloc[1].split('/')[-1]

In [None]:
paper_id.split('/')[-1]

In [None]:
dict(client.paper_get(paper_id))

In [None]:
paper_id

In [None]:
tasks = client.http.get(f"/papers/{paper_id}/tasks/")['results']

In [None]:
tasks

In [None]:
[paperswithcode.models.Task(**task) for task in tasks]

In [None]:
import paperswithcode

client = paperswithcode.PapersWithCodeClient()
areas = client.area_list().results

# area id -> ids of the tasks returned for that area
area_grouped_tasks = {}
n_tasks_total = 0

for area in areas:
    task_ids = [task.id for task in client.area_task_list(area.id, items_per_page=1000).results]
    area_grouped_tasks[area.id] = task_ids
    print(area.id, ':', len(task_ids))
    n_tasks_total += len(task_ids)
print('total tasks:', n_tasks_total)


In [None]:
# one row per (area, task): start from one row per area holding its task list, then explode the lists
area_ids = list(area_grouped_tasks.keys())
task_id_lists = list(area_grouped_tasks.values())
area_tasks_df = pd.DataFrame({'area': area_ids, 'task': task_id_lists}).explode('task')

In [None]:
papers_with_repo_df['task'] = papers_with_repo_df['tasks']

In [None]:
papers_task_exploded_df = papers_with_repo_df.explode('task')

In [None]:
task_api_normalized = papers_task_exploded_df['task'].str.lower().str.replace(' ', '-')

In [None]:
task_api_normalized

In [None]:
tasks_without_area = task_api_normalized[~task_api_normalized.isin(area_tasks_df['task'])].unique()

In [None]:
other_tasks_df = pd.DataFrame({"area": "other", "task": tasks_without_area})

In [None]:
all_area_tasks_df = pd.concat([area_tasks_df, other_tasks_df])

In [None]:
all_area_tasks_df.head()

In [None]:
all_area_tasks_df.to_csv('data/paperswithcode_tasks.csv')