# This notebook groups the dataset's fine-grained arXiv category labels into their top-level groups, reducing the number of classes from 155 to 8 according to the arXiv category taxonomy (https://arxiv.org/category_taxonomy/).

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw arXiv dataset and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was saved together with its row index; it is carried through unchanged here.
raw_csv_path = '../data/arxiv_data.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,ids,titles,abstracts,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,1809.10336,Multi-task Learning for Financial Forecasting,Financial forecasting is challenging and att...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2305.11021,The Wisdom of Strategic Voting,We study the voting game where agents' prefe...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1704.00646,A correlation game for unsupervised learning y...,Much has been learned about plasticity of bi...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2104.03906,Explaining decisions made with AI: A workbook ...,"Over the last two years, The Alan Turing Ins...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1207.6361,Magneto-elastic coupling in the spin-Peierls g...,We report an ultrasonic study of the magneto...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Labels of every original per-category indicator column: everything between
# the first and the last class column, both endpoints included.
first_class, last_class = 'astro-ph.CO', 'stat.TH'
init_classes = data.loc[:, first_class:last_class].columns

In [5]:
# Top-level arXiv groups mapped to the inclusive column-label ranges of `data`
# that belong to them. Physics is spread over several non-adjacent ranges, so
# every group is written as a list of (first_label, last_label) pairs.
# Insertion order determines the order of the new columns.
GROUP_LABEL_RANGES = {
    'Computer Science': [('cs.AI', 'cs.SY')],
    'Economics': [('econ.EM', 'econ.TH')],
    'Electrical Engineering and Systems Science': [('eess.AS', 'eess.SY')],
    'Mathematics': [('math.AC', 'math.ST')],
    'Physics': [
        ('astro-ph.CO', 'cond-mat.supr-con'),
        ('nlin.AO', 'physics.space-ph'),
        ('gr-qc', 'math-ph'),
        ('quant-ph', 'quant-ph'),
    ],
    'Quantitative Biology': [('q-bio.BM', 'q-bio.TO')],
    'Quantitative Finance': [('q-fin.CP', 'q-fin.TR')],
    'Statistics': [('stat.AP', 'stat.TH')],
}


def group_classes(frame: pd.DataFrame, label_ranges: dict, fine_classes) -> pd.DataFrame:
    """Collapse fine-grained indicator columns into one 0/1 column per group.

    A row gets 1 for a group when at least one of the group's member columns
    equals 1 in that row, and 0 otherwise.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding the fine-grained indicator columns.
    label_ranges : dict
        Maps each group name to a list of ``(first_label, last_label)`` pairs;
        each pair is an inclusive ``.loc`` column slice of ``frame``.
    fine_classes : iterable of str
        Labels of the fine-grained columns to remove from the result.

    Returns
    -------
    pd.DataFrame
        A new frame without ``fine_classes`` and with one int64 column per
        group appended in the iteration order of ``label_ranges``.

    Raises
    ------
    AssertionError
        If some label in ``fine_classes`` is not covered by any range, i.e. it
        would be dropped without contributing to a group.
    """
    uncovered = set(fine_classes)
    grouped = {}
    for group_name, ranges in label_ranges.items():
        flagged = pd.Series(False, index=frame.index)
        for first_label, last_label in ranges:
            members = frame.loc[:, first_label:last_label]
            uncovered.difference_update(members.columns)
            # Vectorised "any member column equals 1" for every row at once.
            flagged = flagged | members.eq(1).any(axis=1)
        grouped[group_name] = flagged.astype('int64')

    assert not uncovered, f'classes not assigned to any group: {sorted(uncovered)}'

    # Drop first, then append, so the group columns end up after the kept ones.
    return frame.drop(columns=fine_classes).assign(**grouped)


# Rebinds `data`: re-running this cell without re-running the load cell fails,
# because the fine-grained columns have already been dropped.
data = group_classes(data, GROUP_LABEL_RANGES, init_classes)
data.head()

Unnamed: 0,ids,titles,abstracts,Computer Science,Economics,Electrical Engineering and Systems Science,Mathematics,Physics,Quantitative Biology,Quantitative Finance,Statistics
0,1809.10336,Multi-task Learning for Financial Forecasting,Financial forecasting is challenging and att...,1,0,0,0,0,0,0,1
1,2305.11021,The Wisdom of Strategic Voting,We study the voting game where agents' prefe...,1,0,0,0,0,0,0,0
2,1704.00646,A correlation game for unsupervised learning y...,Much has been learned about plasticity of bi...,1,0,0,0,0,1,0,0
3,2104.03906,Explaining decisions made with AI: A workbook ...,"Over the last two years, The Alan Turing Ins...",1,0,0,0,0,0,0,0
4,1207.6361,Magneto-elastic coupling in the spin-Peierls g...,We report an ultrasonic study of the magneto...,0,0,0,0,1,0,0,0


In [6]:
# Persist the grouped frame; index=False keeps the row index out of the file.
grouped_csv_path = '../data/arxiv_data_grouped.csv'
data.to_csv(grouped_csv_path, index=False)