### ArXiv Metadata Analysis
#### Capstone Project, DSI-911 cohort, Lisa Paul

**Current Notebook:** 00-json-to-csv
>Run this first to read in the JSON data and perform the preliminary data-cleaning step.

In [5]:
import pandas as pd

In [6]:
# The original data is a single 3-gigabyte JSON file.
# It was split into 48 smaller chunks of 50,000 lines each with this shell command:
#jq -c . < arxiv-metadata-oai-snapshot.json | gsplit -l 50000 --additional-suffix=.json - chunks_json/arxiv_meta_

# Selecting input files for training and validation data
# Using "pseudorandomly" chosen files within the -aa through -bv chunks
# The original dataset is sorted by publication date
# (as described in https://info.arxiv.org/help/arxiv_identifier_for_services.html)
# This approach should partially mitigate any potential effects of date-related biases on the dataset

# Chunk used for training data, followed by the chunk used for validation data.
train_file, valid_file = 'arxiv_meta_aa.json', 'arxiv_meta_bb.json'

In [9]:
# Data directory prefix, and the sub-directory (appended to it) holding the JSON chunks.
# Both keep their trailing slash because paths are built by plain string concatenation.
data_path, subdir = '../data/', 'chunks_json/'

#reads JSON file, outputs dataframe w/o multiple-"categoried" rows
def single_cat(curr_file, src_dir=None):
    """Load one JSON-lines chunk and keep only single-category records.

    A record is dropped when its 'categories' string contains any
    whitespace, i.e. when it lists more than one category. The surviving
    rows keep their original index labels.

    Parameters
    ----------
    curr_file : str
        File name of the JSON-lines chunk to read.
    src_dir : str, optional
        Directory prefix (including the trailing separator) that holds the
        chunk. Defaults to the notebook-level ``data_path + subdir``.

    Returns
    -------
    pandas.DataFrame
        The rows of the chunk whose 'categories' value has no whitespace.
    """
    if src_dir is None:
        src_dir = data_path + subdir

    # NOTE(review): read_json infers column dtypes. The recorded output of
    # this notebook shows id 704.0001, which suggests arXiv ids are being
    # parsed as floats -- consider dtype={'id': str} after confirming what
    # the downstream notebooks expect.
    df = pd.read_json(src_dir + curr_file, lines=True)

    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence, which Python warns about and plans to reject.
    has_multiple = df['categories'].str.contains(r'\s')

    return df[~has_multiple]

#writes out the dataframe to a csv file
def df_to_csv(df, curr_file, out_dir=None):
    """Write ``df`` to a CSV file named after the JSON chunk it came from.

    The output name is ``curr_file`` with its '.json' extension replaced by
    '-single-cat.csv', e.g. 'arxiv_meta_aa.json' becomes
    'arxiv_meta_aa-single-cat.csv'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write. The index is not written.
    curr_file : str
        File name of the source JSON chunk.
    out_dir : str, optional
        Directory prefix (including the trailing separator) for the CSV.
        Defaults to the notebook-level ``data_path``.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    if out_dir is None:
        out_dir = data_path

    # str.rstrip('.json') strips any trailing run of the characters
    # '.', 'j', 's', 'o', 'n' instead of the literal suffix, so a chunk such
    # as 'arxiv_meta_an.json' would lose its final 'n'. Remove the suffix
    # explicitly (slicing keeps this working on Python < 3.9).
    suffix = '.json'
    if curr_file.endswith(suffix):
        stem = curr_file[:-len(suffix)]
    else:
        stem = curr_file

    csv_file = out_dir + stem + '-single-cat' + '.csv'

    df.to_csv(csv_file, index=False)

    return csv_file

In [10]:
# Step 1: load both chunks, keeping only the single-category records.
train_df = single_cat(train_file)
valid_df = single_cat(valid_file)

# Step 2: persist each filtered frame as CSV. The final call's return value
# (the validation CSV path) is what the cell displays.
df_to_csv(train_df, train_file)
df_to_csv(valid_df, valid_file)

'../data/arxiv_meta_bb-single-cat.csv'

In [11]:
# (rows, columns) of each frame after the multi-category records were dropped.
# In the recorded run: 32031 training records and 22154 validation records remain.
train_df.shape, valid_df.shape

((32031, 14), (22154, 14))

In [13]:
# Show the first retained training record to check the available columns.
train_df.head(1)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."


In [14]:

# Show the first retained validation record; its index label is the row's
# position in the original chunk, since filtering does not reset the index.
valid_df.head(1)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
2,2009.08499,Benjamin Kay,"B. P. Kay, J. P. Schiffer, S. J. Freeman, T. L...",Consistency of nucleon-transfer sum rules in w...,"7 pages, 4 figures","Phys. Rev. C 103, 024319 (2021)",10.1103/PhysRevC.103.024319,,nucl-ex,http://arxiv.org/licenses/nonexclusive-distrib...,Nucleon-transfer sum rules have been assesse...,"[{'version': 'v1', 'created': 'Thu, 17 Sep 202...",2021-03-03,"[[Kay, B. P., ], [Schiffer, J. P., ], [Freeman..."


### Next file: Preprocessing 
#### (using the CSVs we now have)