# CP421_Project

In [1]:
import json
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:

# File path (adjust if needed)
file_path = "resources/arxiv-metadata-oai-snapshot.json"

# Desired category prefixes: a paper is kept when its primary (first-listed)
# category starts with one of these.
valid_prefixes = ['cs.', 'math.', 'stat.', 'physics.comp-']


def _select_fields(record, prefixes):
    """Return the trimmed row for one parsed metadata record, or None to skip it.

    Parameters
    ----------
    record : object
        Result of ``json.loads`` on one line of the snapshot file.
    prefixes : tuple of str
        Accepted prefixes for the primary (first whitespace-separated) category.

    Returns
    -------
    dict or None
        Dict with keys ``id``, ``title``, ``abstract``, ``categories``,
        ``authors`` and ``date`` when the primary category matches; None when
        the record is not a JSON object, has no categories, or does not match.
    """
    if not isinstance(record, dict):
        return None

    # `or ""` also covers an explicit JSON null, which `.get(key, "")` does not.
    categories = (record.get("categories") or "").split()
    # An empty category string used to raise IndexError on `.split()[0]`,
    # which was not caught and aborted the whole load.
    if not categories or not categories[0].startswith(prefixes):
        return None

    return {
        "id": record.get("id"),
        "title": (record.get("title") or "").strip(),
        "abstract": (record.get("abstract") or "").strip(),
        "categories": record.get("categories"),
        "authors": record.get("authors_parsed"),
        "date": (record.get("update_date") or "")[:10],  # YYYY-MM-DD
    }


# Initialize storage
filtered_data = []
malformed_lines = 0
prefix_tuple = tuple(valid_prefixes)  # str.startswith accepts a tuple, not a list

# Read and filter: the file is read as one JSON document per line
with open(file_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Loading and filtering JSON"):
        if not line.strip():
            continue  # blank line, nothing to parse
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: keep loading, but count the loss so it is visible.
            malformed_lines += 1
            continue
        row = _select_fields(record, prefix_tuple)
        if row is not None:
            filtered_data.append(row)

if malformed_lines:
    print(f"Skipped {malformed_lines} malformed JSON line(s)")

# Convert to DataFrame
df = pd.DataFrame(filtered_data)
print("Filtered Data Shape:", df.shape)
df.head()

Loading and filtering JSON: 0it [00:00, ?it/s]Loading and filtering JSON: 1550it [00:00, 15498.91it/s]Loading and filtering JSON: 4572it [00:00, 24149.67it/s]Loading and filtering JSON: 6987it [00:00, 22795.44it/s]Loading and filtering JSON: 9275it [00:00, 18337.94it/s]Loading and filtering JSON: 11210it [00:00, 17507.02it/s]Loading and filtering JSON: 15018it [00:00, 23468.41it/s]Loading and filtering JSON: 19283it [00:00, 29082.97it/s]Loading and filtering JSON: 22354it [00:00, 29378.07it/s]Loading and filtering JSON: 27166it [00:01, 34907.57it/s]Loading and filtering JSON: 31493it [00:01, 37385.61it/s]Loading and filtering JSON: 35641it [00:01, 38601.60it/s]Loading and filtering JSON: 39835it [00:01, 39593.53it/s]Loading and filtering JSON: 43941it [00:01, 40030.34it/s]Loading and filtering JSON: 48017it [00:01, 40248.10it/s]Loading and filtering JSON: 52163it [00:01, 40608.71it/s]Loading and filtering JSON: 56445it [00:01, 41270.52it/s]Loading and filtering JSON: 

In [2]:
# Disabled alternative to the JSON loading cell above: read the DataFrame from a CSV file.
# NOTE(review): the output displayed under this cell has 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, which looks like a saved index being read back as data; presumably
# the CSV should be written with index=False or read with index_col=0 (confirm).
#df = pd.read_csv("resources/loaded_files.csv")
#df.head()

Unnamed: 0.1,Unnamed: 0,id,title,abstract,categories,authors,date
0,0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,"[['Streinu', 'Ileana', ''], ['Theran', 'Louis'...",2008-12-13
1,1,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle n...,math.CO,"[['Callan', 'David', '']]",2007-05-23
2,2,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\Lam...,math.CA math.FA,"[['Abu-Shammala', 'Wael', ''], ['Torchinsky', ...",2013-10-15
3,3,704.001,"Partial cubes: structures, characterizations, ...",Partial cubes are isometric subgraphs of hyper...,math.CO,"[['Ovchinnikov', 'Sergei', '']]",2007-05-23
4,4,704.0011,Computing genus 2 Hilbert-Siegel modular forms...,In this paper we present an algorithm for comp...,math.NT math.AG,"[['Cunningham', 'Clifton', ''], ['Dembele', 'L...",2008-08-20


In [3]:

# A cleaned abstract must have MORE than this many words to be kept.
MIN_ABSTRACT_WORDS = 20

# Drop null or empty abstracts.
# `.copy()` gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning (assignment on a filtered slice).
has_abstract = df['abstract'].notnull() & df['abstract'].str.strip().ne('')
df = df[has_abstract].copy()

# Vectorized cleaning: lowercase, keep only a-z and whitespace, then collapse
# whitespace runs. Collapsing AFTER stripping characters matters: a stand-alone
# symbol such as " - " otherwise leaves a double space behind.
# NOTE(review): punctuation is deleted rather than replaced by a space, so
# hyphenated / LaTeX terms fuse into one token (e.g. "kellpebble" in the output).
df["clean_abstract"] = (
    df["abstract"]
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Filter very short abstracts (≤ MIN_ABSTRACT_WORDS words)
mask_long_enough = df["clean_abstract"].str.split().str.len() > MIN_ABSTRACT_WORDS
df = df[mask_long_enough]

print("Data after fast cleaning:", df.shape)
df[["title", "clean_abstract"]].head(3)

Data after fast cleaning: (5, 8)


Unnamed: 0,title,clean_abstract
0,Sparsity-certifying Graph Decompositions,we describe a new algorithm the kellpebble gam...
1,A determinant of Stirling cycle numbers counts...,we show that a determinant of stirling cycle n...
2,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,in this paper we show how to compute the lambd...
