In [1]:
import pandas as pd
import json

In [2]:
import os

# Path to the full arXiv metadata snapshot (read line by line as JSON further down).
# Set the ARXIV_DATASET_FILE environment variable to point at a different copy;
# the default keeps the original local path so existing setups keep working.
dataset_file = os.environ.get(
    "ARXIV_DATASET_FILE",
    "/home/ben/code/arxiv/arxiv-metadata-oai-snapshot.json",
)

Reference (Kaggle notebook, exploratory data analysis of the arXiv dataset): https://www.kaggle.com/code/lucafuligni/exploratory-data-analysis-arxiv-dataset

In [18]:
import json
import re

# Input: the complete arXiv snapshot. Output: a JSON Lines file holding only
# the entries that list at least one Computer Science category.
complete_dataset = dataset_file
partial_dataset = "./arxiv_cs.json"

# A category counts as CS when a "cs." prefix appears either at the start of
# the categories string or right after whitespace (the field can list several
# categories separated by whitespace).
pattern = r"(^|\s)cs\."
cs_matcher = re.compile(pattern)  # compiled once instead of on every line

# Stream the snapshot line by line so it never has to fit in memory.
# The encoding is explicit because open() otherwise falls back to the
# platform default, which is not guaranteed to be UTF-8.
with open(complete_dataset, 'r', encoding='utf-8') as f_in, \
        open(partial_dataset, 'w', encoding='utf-8') as f_out:
    for line in f_in:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        entry = json.loads(line)
        if cs_matcher.search(entry["categories"]):
            json.dump(entry, f_out)
            f_out.write('\n')


In [1]:
# Location of the CS-only subset (JSON Lines) that the next cell loads.
partial_dataset = './arxiv_cs.json'

In [2]:
import pandas as pd
# Load the CS subset (one JSON object per line).
# `id` is pinned to str: read_json infers dtypes by default, and arXiv
# identifiers such as "0704.0001" look numeric, so inference could turn them
# into floats and drop the leading zero.
df = pd.read_json(partial_dataset, lines=True, dtype={"id": str})

In [3]:
# Keep only the columns used by the rest of the notebook.
df = df.loc[:, ['title', 'categories', 'abstract', 'id']]

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://arxiv.org/category_taxonomy"

# Fetch the taxonomy page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors instead of
# letting an error page be parsed as if it were the taxonomy.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Get the page source from the response content
page_source = response.text

# Parse the page source
soup = BeautifulSoup(page_source, 'html.parser')

# Find the category list element; fail with a clear message if the page
# layout changed and the element is no longer there.
category_list = soup.find(id='category_taxonomy_list')
if category_list is None:
    raise RuntimeError(
        "Element with id 'category_taxonomy_list' not found at " + url
    )

# Extract the category information. Each <h4> heading is expected to look like
# "cs.AI (Artificial Intelligence)": the ID comes first, the name in parentheses.
categories = []
main_category = None
for category in category_list.find_all('h4'):
    # The nearest preceding <h2> names the main category (e.g. "Computer Science").
    main_heading = category.find_previous('h2')
    if main_heading:
        main_category = main_heading.text

    heading_text = category.text
    if '(' not in heading_text:
        # Not an "ID (Name)" heading; skip it rather than raise IndexError.
        continue
    category_id = heading_text.split(' (')[0]
    category_name = heading_text.split('(')[1].split(')')[0]

    # The description is the first <p> following the heading, if there is one.
    description_tag = category.find_next('p')
    category_description = description_tag.text if description_tag is not None else ""

    categories.append({
        "ID": category_id,
        "Main Category": main_category,
        "Name": category_name,
        "Description": category_description
    })

# Create a dataframe from the categories list. "ID" stays a regular column
# (not the index) because later cells select and merge on it.
categories_df = pd.DataFrame(categories)
categories_df

Unnamed: 0,ID,Main Category,Name,Description
0,cs.AI,Computer Science,Artificial Intelligence,"Covers all areas of AI except Vision, Robotics..."
1,cs.AR,Computer Science,Hardware Architecture,Covers systems organization and hardware archi...
2,cs.CC,Computer Science,Computational Complexity,"Covers models of computation, complexity class..."
3,cs.CE,Computer Science,"Computational Engineering, Finance, and Science",Covers applications of computer science to the...
4,cs.CG,Computer Science,Computational Geometry,Roughly includes material in ACM Subject Class...
...,...,...,...,...
150,stat.CO,Statistics,Computation,"Algorithms, Simulation, Visualization"
151,stat.ME,Statistics,Methodology,"Design, Surveys, Model Selection, Multiple Tes..."
152,stat.ML,Statistics,Machine Learning,"Covers machine learning papers (supervised, un..."
153,stat.OT,Statistics,Other Statistics,Work in statistics that does not fit into the ...


In [5]:
# Restrict the taxonomy to Computer Science, then drop the long description
# column so only ID, main category and name remain.
cs_categories_df = categories_df.loc[
    categories_df["Main Category"].eq("Computer Science")
]
cs_categories_filtered_df = cs_categories_df.loc[:, ["ID", "Main Category", "Name"]]

In [6]:
import dataframe_image as dfi

# Render the CS category table to an image file.
# NOTE(review): max_rows=-1 is presumably meant to lift the row limit so the
# whole table is rendered; confirm against the dataframe_image docs.
dfi.export(
    cs_categories_filtered_df,
    'cs_categories.png',
    table_conversion='html',
    max_rows=-1,
)

In [7]:
import altair as alt

# A paper's `categories` value can list several whitespace-separated categories
# (e.g. "cs.AI cs.LG"). Joining that whole string against single category IDs
# only matches single-category papers, so split it and emit one row per
# (paper, category) pair before merging.
paper_categories = (
    df.assign(categories=df['categories'].str.split())
      .explode('categories')
)

# Attach the taxonomy (name, main category, description) to every pair.
merged_df = pd.merge(paper_categories, categories_df, left_on='categories', right_on='ID', how='left')

# Cross-listed papers also carry non-CS categories; keep only the Computer
# Science ones so the chart matches its title. A paper listing several CS
# subcategories is counted once in each of them.
cs_rows = merged_df[merged_df['Main Category'] == 'Computer Science']

# Calculate the count of papers for each subcategory
subcategory_counts = cs_rows['Name'].value_counts().reset_index()

# Rename the columns for clarity
subcategory_counts.columns = ['Subcategory', 'Count']

# Sort the subcategories by count in descending order
subcategory_counts = subcategory_counts.sort_values('Count', ascending=False)

# Create the bar chart using Altair
chart = alt.Chart(subcategory_counts).mark_bar().encode(
    x='Count:Q',
    y=alt.Y('Subcategory:N', sort='-x'),
    color=alt.Color('Subcategory:N', legend=alt.Legend(title='Subcategory'))
).properties(
    title='Number of Computer Science Papers by Subcategory',
    width=800,  # Adjust the width of the chart
    height=500  # Adjust the height of the chart
)

# Display the bar chart
chart