In [1]:
import json
import pickle

In [2]:
import pandas as pd
import numpy as np

## Load data

Note that the data contains information on STEM and economic research papers and journals, sourced from arXiv.org.

In [3]:
data = []
for line in open('data/arxiv-metadata-oai-2020-03-23.json'):
    data.append(json.loads(line))

In [6]:
df_raw = pd.DataFrame(data)

In [7]:
df_raw.head(3)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,abstract,report-no,categories,versions
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,ANL-HEP-PR-07-12,[hep-ph],"[v1, v2]"
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,"We describe a new algorithm, the $(k,\ell)$-...",,[math.CO cs.CG],"[v1, v2]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,The evolution of Earth-Moon system is descri...,,[physics.gen-ph],"[v1, v2, v3]"


In [8]:
with open('data/authors-parsed.json') as f:
    authors_parsed = json.load(f)

In [None]:
authors_parsed.items()[0]

In [108]:
df_raw.loc[:, ['title', 'authors', 'abstract', 'comments']]

Unnamed: 0,title,authors,abstract,comments
0,Calculation of prompt diphoton production cros...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",A fully differential calculation in perturba...,"37 pages, 15 figures; published version"
1,Sparsity-certifying Graph Decompositions,Ileana Streinu and Louis Theran,"We describe a new algorithm, the $(k,\ell)$-...",To appear in Graphs and Combinatorics
2,The evolution of the Earth-Moon system based o...,Hongjun Pan,The evolution of Earth-Moon system is descri...,"23 pages, 3 figures"
3,A determinant of Stirling cycle numbers counts...,David Callan,We show that a determinant of Stirling cycle...,11 pages
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,Wael Abu-Shammala and Alberto Torchinsky,In this paper we show how to compute the $\L...,
...,...,...,...,...
1133984,Chaotic diffusion of complex trajectory and it...,"Wen-Lei Zhao, Jiaozi Wang, Qian Wang, and Peiq...",We investigate both the quantum and classica...,"10 pages, 7 figures"
1133985,A Decentralized IoT Data Marketplace,"P. Gupta, S. Kanhere, R. Jurdak",This paper proposes an architecture for dyna...,"6 pages, 7 figures"
1133986,Theory of Single Photon Detection by a Photore...,N. J. Harmon and M. E. Flatt\'e,The long spin coherence times in ambient con...,
1133987,CreativeBioMan: Brain and Body Wearable Comput...,"Min Chen, Yingying Jiang, Yong Cao, Albert Y. ...",Current artificial intelligence (AI) technol...,


In [109]:
df_raw.to_parquet('data/papers.parquet')

## Preprocess data

In [10]:
df_raw.sort_values('id')

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,abstract,report-no,categories,versions
0,0704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,ANL-HEP-PR-07-12,[hep-ph],"[v1, v2]"
1,0704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,"We describe a new algorithm, the $(k,\ell)$-...",,[math.CO cs.CG],"[v1, v2]"
2,0704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,The evolution of Earth-Moon system is descri...,,[physics.gen-ph],"[v1, v2, v3]"
3,0704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,We show that a determinant of Stirling cycle...,,[math.CO],[v1]
4,0704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,In this paper we show how to compute the $\L...,,[math.CA math.FA],[v1]
...,...,...,...,...,...,...,...,...,...,...,...
1133984,1906.01798,Qian Wang,"Wen-Lei Zhao, Jiaozi Wang, Qian Wang, and Peiq...",Chaotic diffusion of complex trajectory and it...,"10 pages, 7 figures",,,We investigate both the quantum and classica...,,[quant-ph],"[v1, v2, v3]"
1133985,1906.01799,Pooja Gupta,"P. Gupta, S. Kanhere, R. Jurdak",A Decentralized IoT Data Marketplace,"6 pages, 7 figures",,,This paper proposes an architecture for dyna...,,[cs.NI],[v1]
1133986,1906.01800,Michael E. Flatt\'e,N. J. Harmon and M. E. Flatt\'e,Theory of Single Photon Detection by a Photore...,,,,The long spin coherence times in ambient con...,,[quant-ph cond-mat.mes-hall],[v1]
1133987,1906.01801,Yingying Jiang,"Min Chen, Yingying Jiang, Yong Cao, Albert Y. ...",CreativeBioMan: Brain and Body Wearable Comput...,,,,Current artificial intelligence (AI) technol...,,[cs.HC],[v1]


In [11]:
s_authors = pd.Series(authors_parsed).rename('authors')

In [12]:
s_authors.sort_index()

0704.0001     [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...
0704.0002              [[Streinu, Ileana, ], [Theran, Louis, ]]
0704.0003                                    [[Pan, Hongjun, ]]
0704.0004                                   [[Callan, David, ]]
0704.0005     [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]
                                    ...                        
1906.01798    [[Zhao, Wen-Lei, ], [Wang, Jiaozi, ], [Wang, Q...
1906.01799     [[Gupta, P., ], [Kanhere, S., ], [Jurdak, R., ]]
1906.01800               [[Harmon, N. J., ], [Flatté, M. E., ]]
1906.01801    [[Chen, Min, ], [Jiang, Yingying, ], [Cao, Yon...
1906.01802            [[Murphy, Jason, ], [Nakanishi, Kenji, ]]
Name: authors, Length: 1133989, dtype: object

In [13]:
(s_authors.index.values == df_raw.loc[:, 'id'].values).all()

True

In [14]:
df_raw.drop(columns='authors')

Unnamed: 0,id,submitter,title,comments,journal-ref,doi,abstract,report-no,categories,versions
0,0704.0001,Pavel Nadolsky,Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,ANL-HEP-PR-07-12,[hep-ph],"[v1, v2]"
1,0704.0002,Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,"We describe a new algorithm, the $(k,\ell)$-...",,[math.CO cs.CG],"[v1, v2]"
2,0704.0003,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,The evolution of Earth-Moon system is descri...,,[physics.gen-ph],"[v1, v2, v3]"
3,0704.0004,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,We show that a determinant of Stirling cycle...,,[math.CO],[v1]
4,0704.0005,Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,In this paper we show how to compute the $\L...,,[math.CA math.FA],[v1]
...,...,...,...,...,...,...,...,...,...,...
1133984,1906.01798,Qian Wang,Chaotic diffusion of complex trajectory and it...,"10 pages, 7 figures",,,We investigate both the quantum and classica...,,[quant-ph],"[v1, v2, v3]"
1133985,1906.01799,Pooja Gupta,A Decentralized IoT Data Marketplace,"6 pages, 7 figures",,,This paper proposes an architecture for dyna...,,[cs.NI],[v1]
1133986,1906.01800,Michael E. Flatt\'e,Theory of Single Photon Detection by a Photore...,,,,The long spin coherence times in ambient con...,,[quant-ph cond-mat.mes-hall],[v1]
1133987,1906.01801,Yingying Jiang,CreativeBioMan: Brain and Body Wearable Comput...,,,,Current artificial intelligence (AI) technol...,,[cs.HC],[v1]


In [15]:
df = pd.concat([df_raw.drop(columns='authors').set_index('id'), s_authors], axis=1)

In [16]:
df.head(3)

Unnamed: 0_level_0,submitter,title,comments,journal-ref,doi,abstract,report-no,categories,versions,authors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
704.0001,Pavel Nadolsky,Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,ANL-HEP-PR-07-12,[hep-ph],"[v1, v2]","[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
704.0002,Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,"We describe a new algorithm, the $(k,\ell)$-...",,[math.CO cs.CG],"[v1, v2]","[[Streinu, Ileana, ], [Theran, Louis, ]]"
704.0003,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,The evolution of Earth-Moon system is descri...,,[physics.gen-ph],"[v1, v2, v3]","[[Pan, Hongjun, ]]"


See a sample of what the 'comments' column contains

In [115]:
for i, comment in enumerate(df.loc[:, 'comments'].head(20)):
    print('{}: {}'.format(i, comment))

0: 37 pages, 15 figures; published version
1: To appear in Graphs and Combinatorics
2: 23 pages, 3 figures
3: 11 pages
4: None
5: 6 pages, 4 figures, accepted by PRA
6: 16 pages, no figures. Typos corrected to match published version
7: Minor corrections
8: None
9: 36 pages, 17 figures
10: 14 pages; title changed; to appear in Experimental Mathematics
11: None
12: None
13: 18 pages, 1 figure
14: 22 pages; signs and coefficients adjusted for anticommuting
  superfields, section 4.3 changed accordingly, reference added
15: 17 pages, 3 figures and 1 table
16: 10 pages, 11 figures (figures 3, 4, 7 and 8 at reduced resolution,
  originals available on request). Accepted for publication in Monthly Notices
  of the Royal Astronomical Society
17: 20 pages, v2: an overall sign and typos corrected
18: 6 pages, Journal-ref added
19: 21 pages, 13 postscript figures, submitted to Phys. Rev. D,
  contributed to 42nd Rencontres de Moriond: QCD and Hadronic Interactions


See a sample of what the 'journal-ref' column contains

In [116]:
for i, data in enumerate(df.loc[:, 'journal-ref'].head(20)):
    print('{}: {}'.format(i, data))

0: Phys.Rev.D76:013009,2007
1: None
2: None
3: None
4: Illinois J. Math. 52 (2008) no.2, 681-689
5: None
6: Phys.Rev.D76:044016,2007
7: Journal of Applied Physics, vol 104, 073536 (2008)
8: Astrophys.J.663:1149-1173,2007
9: None
10: None
11: None
12: None
13: None
14: JHEP 0705:034,2007
15: Commun.Theor.Phys.49:993-1000,2008
16: Mon.Not.Roy.Astron.Soc.378:211-220,2007
17: None
18: RIMS Kokyuroku, No.1551, pp.57-62 (2007)
19: Phys.Rev.D76:052005,2007


### Explore ambiguous columns

#### journal-ref

Explore number of null vs non-null values

In [19]:
# Null values
df.loc[:, 'journal-ref'].isna().sum()

721031

In [20]:
# Non-null values
(~df.loc[:, 'journal-ref'].isna()).sum()

412958

In [21]:
df.loc[:, 'journal-ref'].value_counts().head(10)

Dans Design, Automation and Test in Europe - DATE'05, Munich :\n  Allemagne (2005)                                                        128
Dans Symposium on Design, Test, Integration and Packaging of\n  MEMS/MOEMS - DTIP 2008, Nice : France (2008)                               77
Dans Symposium on Design, Test, Integration and Packaging of\n  MEMS/MOEMS - DTIP 2007, Stresa, lago Maggiore : Italie (2007)              66
Prog Theor Exp Phys (2018)                                                                                                                 57
Prog Theor Exp Phys (2017)                                                                                                                 53
Prog Theor Exp Phys (2019)                                                                                                                 52
Dans Symposium on Design, Test, Integration and Packaging of\n  MEMS/MOEMS - DTIP 2006, Stresa, Lago Maggiore : Italie (2006)              49
Dans D

Do the same fo report-no

#### report-no

Explore number of null vs non-null values

In [22]:
# Null values
df.loc[:, 'report-no'].isna().sum()

1067617

In [23]:
# Non-null values
(~df.loc[:, 'report-no'].isna()).sum()

66372

In [24]:
df.loc[:, 'report-no'].value_counts()

CPH-SYM-DNRF92                              317
CPH-SYM-00                                  138
Roma01.Math                                 134
ISSN 1947 5500                              131
Mittag-Leffler-2011spring                    58
                                           ... 
LHCb-PAPER-2012-024; CERN-PH-EP-2012-263      1
CERN-LHCC-2018-027 LHCB-PUB-2018-009          1
IMS-AOAS-AOAS466                              1
DESY 15-124                                   1
Argonne report ANL-HEP-CP-09-92               1
Name: report-no, Length: 64476, dtype: int64

#### comments

In [25]:
# Null values
df.loc[:, 'comments'].isna().sum()

262263

In [26]:
# Non-null values
(~df.loc[:, 'comments'].isna()).sum()

871726

Conclusion: journal-ref and commentts are potentially useful, but an overwhelming number of report-no values are null and can thus be excluded

#### Extract the useful columns only

In [27]:
df.columns

Index(['submitter', 'title', 'comments', 'journal-ref', 'doi', 'abstract',
       'report-no', 'categories', 'versions', 'authors'],
      dtype='object')

In [28]:
columns = ['title', 'authors', 'submitter', 'abstract', 'categories', 'journal-ref', 'comments']

In [29]:
papers = df.loc[:, columns]

In [30]:
papers.head()

Unnamed: 0_level_0,title,authors,submitter,abstract,categories,journal-ref,comments
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
704.0001,Calculation of prompt diphoton production cros...,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...",Pavel Nadolsky,A fully differential calculation in perturba...,[hep-ph],"Phys.Rev.D76:013009,2007","37 pages, 15 figures; published version"
704.0002,Sparsity-certifying Graph Decompositions,"[[Streinu, Ileana, ], [Theran, Louis, ]]",Louis Theran,"We describe a new algorithm, the $(k,\ell)$-...",[math.CO cs.CG],,To appear in Graphs and Combinatorics
704.0003,The evolution of the Earth-Moon system based o...,"[[Pan, Hongjun, ]]",Hongjun Pan,The evolution of Earth-Moon system is descri...,[physics.gen-ph],,"23 pages, 3 figures"
704.0004,A determinant of Stirling cycle numbers counts...,"[[Callan, David, ]]",David Callan,We show that a determinant of Stirling cycle...,[math.CO],,11 pages
704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]",Alberto Torchinsky,In this paper we show how to compute the $\L...,[math.CA math.FA],"Illinois J. Math. 52 (2008) no.2, 681-689",


## Manipulate categories to make the information more useful

In [31]:
papers.iloc[1]['categories']

['math.CO cs.CG']

In [32]:
import typing

Define a function to process the categories columns and convert them from a list of a single string to a list/set of distinct strings

In [33]:
def process_categories(cat: typing.List[str]) -> typing.List[str]:
    s = cat[0]
    cat_list = s.split()
    return set(sorted(cat_list))

In [34]:
all_categories = set()
for c in papers.loc[:, 'categories']:
    all_categories |= process_categories(c)

In [35]:
all_categories

{'astro-ph',
 'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.IT',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math-ph',
 'math.AC',
 'math.AG',
 'math.AP',
 'math.AT',
 'math.CA',
 'math.CO',
 'math.CT',
 'math.CV',
 'math.DG',
 'math.DS',
 'math.FA',
 'math.GM',
 'math.

### processed_categories

Create a column 'processed_categories' with the categories now listed as separate strings

In [36]:
papers['processed_categories'] = papers['categories'].apply(process_categories)

In [37]:
papers.head()

Unnamed: 0_level_0,title,authors,submitter,abstract,categories,journal-ref,comments,processed_categories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
704.0001,Calculation of prompt diphoton production cros...,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...",Pavel Nadolsky,A fully differential calculation in perturba...,[hep-ph],"Phys.Rev.D76:013009,2007","37 pages, 15 figures; published version",{hep-ph}
704.0002,Sparsity-certifying Graph Decompositions,"[[Streinu, Ileana, ], [Theran, Louis, ]]",Louis Theran,"We describe a new algorithm, the $(k,\ell)$-...",[math.CO cs.CG],,To appear in Graphs and Combinatorics,"{cs.CG, math.CO}"
704.0003,The evolution of the Earth-Moon system based o...,"[[Pan, Hongjun, ]]",Hongjun Pan,The evolution of Earth-Moon system is descri...,[physics.gen-ph],,"23 pages, 3 figures",{physics.gen-ph}
704.0004,A determinant of Stirling cycle numbers counts...,"[[Callan, David, ]]",David Callan,We show that a determinant of Stirling cycle...,[math.CO],,11 pages,{math.CO}
704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]",Alberto Torchinsky,In this paper we show how to compute the $\L...,[math.CA math.FA],"Illinois J. Math. 52 (2008) no.2, 681-689",,"{math.FA, math.CA}"


In [38]:
num_of_categories = papers.loc[:, 'processed_categories'].apply(len)

In [39]:
(num_of_categories > 1).sum()

498186

To make the categories more useful, generalize them into their prefixes (i.e. math, physics, cs, etc.)

In [40]:
def generalize_categories(cat: typing.Set[str]) -> typing.Union[typing.List[str], str]:
    gen_cats = []
    for c in cat:
        ss = c.split('.')
        prefix = ss[0]
        gen_cats.append(prefix)
    
    if len(set(gen_cats)) == 1:
        categories = gen_cats[0]
    else:
        categories = ', '.join(sorted(set(gen_cats)))
    
    return categories

In [41]:
papers['general_categories'] = papers.loc[:, 'processed_categories'].apply(generalize_categories)

In [42]:
papers.head(3)

Unnamed: 0_level_0,title,authors,submitter,abstract,categories,journal-ref,comments,processed_categories,general_categories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
704.0001,Calculation of prompt diphoton production cros...,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...",Pavel Nadolsky,A fully differential calculation in perturba...,[hep-ph],"Phys.Rev.D76:013009,2007","37 pages, 15 figures; published version",{hep-ph},hep-ph
704.0002,Sparsity-certifying Graph Decompositions,"[[Streinu, Ileana, ], [Theran, Louis, ]]",Louis Theran,"We describe a new algorithm, the $(k,\ell)$-...",[math.CO cs.CG],,To appear in Graphs and Combinatorics,"{cs.CG, math.CO}","cs, math"
704.0003,The evolution of the Earth-Moon system based o...,"[[Pan, Hongjun, ]]",Hongjun Pan,The evolution of Earth-Moon system is descri...,[physics.gen-ph],,"23 pages, 3 figures",{physics.gen-ph},physics


Check how many papers have more than one category

In [43]:
papers.loc[:, 'general_categories'].value_counts()

math                                   232992
astro-ph                               131434
cond-mat                               126090
cs                                     124306
physics                                 59257
                                        ...  
math, math-ph, nlin, physics, stat          1
math, math-ph, nlin, physics, q-bio         1
hep-th, math, physics, q-bio, stat          1
gr-qc, hep-ph, nucl-th, quant-ph            1
cond-mat, cs, q-bio, quant-ph, stat         1
Name: general_categories, Length: 1357, dtype: int64

In [44]:
all_general_categories = set()
for c in papers.loc[:, 'general_categories']:
    all_general_categories |= (c if type(c) == set else {c})

In [45]:
len(all_general_categories)

1357

Get the number of category groupings with multiple categories

In [46]:
len([x for x in all_general_categories if ',' in x])

1338

Get the list of single categories

In [47]:
sorted([x for x in all_general_categories if ',' not in x])

['astro-ph',
 'cond-mat',
 'cs',
 'econ',
 'eess',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math',
 'nlin',
 'nucl-ex',
 'nucl-th',
 'physics',
 'q-bio',
 'q-fin',
 'quant-ph',
 'stat']

In [None]:
# Save useful information from model


## Start using scikit-learn

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

In [49]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [50]:
tfidf = tfidf_vectorizer.fit_transform(papers.loc[:, 'abstract'])

In [86]:
vectorize_text = (papers.loc[:, 'title'] + ' ') * 3 + papers.loc[:, 'abstract']

In [87]:
vectorize_text.iloc[0]

'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies   A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermi

In [88]:
tfidf_wt = tfidf_vectorizer.fit_transform(vectorize_text)

In [87]:
kmeans_250 = MiniBatchKMeans(250)

In [88]:
model_250 = kmeans.fit(tfidf)

### Play around with some examples

In [54]:
abs_1 = """
In this paper, we draw on control theory to understand the conditions under which the use of agile practices is most 
effective in improving software project quality. Although agile development methodologies offer the potential of 
improving software development outcomes, limited research has examined how project managers can structure the 
software development environment to maximize the benefits of agile methodology use during a project. As a result, 
project managers have little guidance on how to manage teams who are using agile methodologies. Arguing that the most 
effective control modes are those that provide teams with autonomy in determining the methods for achieving project 
objectives, we propose hypotheses related to the interaction between control modes, agile methodology use, and 
requirements change. We test the model in a field study of 862 software developers in 110 teams. The model explains 
substantial variance in four objective measures of project quality—bug severity, component complexity, coordinative 
complexity, and dynamic complexity. Results largely support our hypotheses, highlighting the interplay between 
project control, agile methodology use, and requirements change. The findings contribute to extant literature by 
integrating control theory into the growing literature on agile methodology use and by identifying specific 
contingencies affecting the efficacy of different control modes. We discuss the theoretical and practical 
implications of our results.
"""

In [55]:
abs_2 = """
Simultaneously achieving efficiency and flexibility in enterprise software production has been a considerable 
challenge for firms. Newer software development paradigms such as component-based and model-driven development 
attempt to overcome this challenge by emphasizing modular design of complex systems. However, there is a paucity of 
rigorous empirical research on the use of such software methodologies and the associated extent to which trade-offs 
between efficiency and flexibility can be influenced. Addressing this gap, we investigate the performance outcomes of 
a model-driven, component-based software development methodology using data collected from an enterprise software 
development firm that deployed such a methodology for its product development processes. Examining the design, 
development, and implementation of 92 business software components of the firm's enterprise resource planning product,
we discuss how the design of software components, specifically component granularity, affects development efficiency 
(development effort and defects) and flexibility (customization effort). Our results suggest that (a) components that 
are coarse grained are associated with higher flexibility (lower customization effort) but are also associated with 
lower development efficiency (more development effort and defects), and (b) defect proneness of a component plays a 
mediating role on the relationship between component granularity and flexibility. These findings present strong 
evidence for the existence of trade-offs between efficiency and flexibility in mass-customized software product life 
cycles. They establish component granularity as a key design dimension that needs to be managed judiciously to enable
potential trade-off shifting mechanisms through the use of software methodologies that emphasize modular design 
approaches.
"""

In [56]:
cb_v = """
Humans possess the remarkable ability to perceive color, shape, and motion, and to differentiate between light
intensities varied by over nine orders of magnitude. Phototransduction—the process in which absorbed photons are 
converted into electrical responses—is the first stage of visual processing, and occurs in the outer segment, the 
light-sensing organelle of the photoreceptor cell. Studies of genes linked to human inherited blindness have been 
crucial to understanding the biogenesis of the outer segment and membrane-trafficking of photoreceptors.
"""

In [57]:
cb_t = """
Taste buds are aggregates of 50–100 polarized neuroepithelial cells that detect nutrients and other compounds. 
Combined analyses of gene expression and cellular function reveal an elegant cellular organization within the taste 
bud. This review discusses the functional classes of taste cells, their cell biology, and current thinking on how 
taste information is transmitted to the brain.
"""

In [77]:
abs_3 = """
This article will discuss the various windowing systems which are currently (or will be shortly) available. The 
difficulties of developing under these new environments will lead into a discussion of object-oriented programming 
techniques and how they can be used to facilitate the development of complex windows-based programs.
"""

In [78]:
abs_4 = """
Software contracting is a multi-faceted issue that involves legal, economic, managerial and technological 
considerations. To better understand the economic aspect of software contracting, this paper provides a summary review
of software development contracts, followed by a game-theoretic model developed to incorporate incentive and 
information issues associated with software contracting. In the model an outside contractor is hired to develop a 
software system over multiple periods. Due to the uncertainties about costs or technology, the developer faces the 
risk of having to abandon the project at an intermediate phase. The user is better informed of the benefit of the 
system, while the developer privately discovers the development costs as the project advances. Given the limited 
information, the contracting parties make decisions in their own interest, leaving each party vulnerable to the 
other's opportunistic behavior. In this setting, we construct a viable contract that aligns the incentives of the 
contracting parties and produces the same equilibrium outcome as in in-house development. We also relate the 
implications of the model to the actual contract cases.
"""

In [93]:
cb_v_vector = tfidf_vectorizer.transform([cb_v])

In [98]:
cb_t_vector = tfidf_vectorizer.transform([cb_t])

In [102]:
# kmeans_250.labels_

array([ 85, 230,  27, ...,  31,  36, 246], dtype=int32)

In [97]:
# model_250.predict(cb_v_vector)

array([14], dtype=int32)

In [99]:
# model_250.predict(cb_t_vector)

array([14], dtype=int32)

In [100]:
# model_250.predict(tfidf_vectorizer.transform([abs_1, abs_2, cb_v, cb_t]))

array([14, 14, 14, 14], dtype=int32)

All 4 of those abstracts probably should not be the same category. Try more clusters.

### n_clusters = 1000

In [75]:
kmeans_1000 = MiniBatchKMeans(n_clusters=1000, batch_size=1500)

In [52]:
model_1000 = kmeans_1000.fit(tfidf)

In [58]:
model_1000.predict(tfidf_vectorizer.transform([abs_1, abs_2, cb_v, cb_t]))

array([697, 697, 916, 221], dtype=int32)

In [59]:
model_1000.predict(tfidf_vectorizer.transform([]))

array([794], dtype=int32)

In [60]:
model_1000.predict(tfidf_vectorizer.transform([]))

array([697], dtype=int32)

### n_clusters = 750 (currently in use)

In [89]:
kmeans_750 = MiniBatchKMeans(n_clusters=750, batch_size=1000)

In [90]:
model_750 = kmeans_750.fit(tfidf_wt)

In [91]:
model_750.predict(tfidf_vectorizer.transform([abs_1, abs_2, cb_t, cb_v, abs_3, abs_4]))

array([547, 163, 163,  72, 622, 163], dtype=int32)

In [92]:
model_750.labels_

array([720,  97,  41, ..., 311, 397, 433], dtype=int32)

In [95]:
model_750.predict(tfidf_vectorizer.transform([3 * papers.iloc[0]['title'] + papers.iloc[0]['abstract']]))

array([720], dtype=int32)

In [96]:
papers.iloc[0]['title']

'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies'

In [101]:
print(papers.iloc[0]['abstract'])

  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obtained with judicious
selection of events.



In [105]:
cluster_720 = np.where(model_750.labels_ == 720)[0]

In [107]:
papers.iloc[cluster_720]

Unnamed: 0_level_0,title,authors,submitter,abstract,categories,journal-ref,comments,processed_categories,general_categories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0704.0001,Calculation of prompt diphoton production cros...,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...",Pavel Nadolsky,A fully differential calculation in perturba...,[hep-ph],"Phys.Rev.D76:013009,2007","37 pages, 15 figures; published version",{hep-ph},hep-ph
0704.0031,Crystal channeling of LHC forward protons with...,"[[Biryukov, V. M., , Serpukhov, IHEP]]",Valery M. Biryukov,"We show that crystal can trap a broad (x, x'...",[hep-ph],"Phys.Lett.B658:7-12,2007","11 pages, 3 figures",{hep-ph},hep-ph
0704.0235,The Determination of the Helicity of $W'$ Boso...,"[[Rizzo, Thomas G., ]]",Thomas G. Rizzo,"Apart from its mass and width, the most impo...",[hep-ph hep-ex],"JHEP 0705:037,2007","29 pages, 11 figures; discussion and reference...","{hep-ph, hep-ex}","hep-ex, hep-ph"
0704.0254,Unravelling the sbottom spin at the CERN LHC,"[[Alves, Alexandre, ], [Eboli, Oscar, ]]",Alexandre Alves,Establishing that a signal of new physics is...,[hep-ph],"Phys.Rev.D75:115013,2007","9 pages, 11 figures",{hep-ph},hep-ph
0704.0294,QED x QCD Resummation and Shower/ME Matching f...,"[[Ward, B. F. L., ], [Yost, S. A., ]]",Scott A. Yost,We present the theory of QED x QCD resummati...,[hep-ph],"ActaPhys.Polon.B38:2395-2403,2007","12 pages, LaTex with packages amsmath, amssymb",{hep-ph},hep-ph
...,...,...,...,...,...,...,...,...,...
1905.10106,LHC Constraints on a $(B-L)_3$ Gauge Boson,"[[Elahi, Fatemeh, ], [Martin, Adam, ]]",Fatemeh Elahi,"In this paper, we explore the constraints th...",[hep-ph],"Phys. Rev. D 100, 035016 (2019)","28 pages, 12 figures",{hep-ph},hep-ph
1905.10419,Rescattering effects in antiproton-induced exc...,"[[Larionov, A. B., ], [Gillitzer, A., ], [Stri...",Alexei Larionov,On the basis of the generalized eikonal appr...,[nucl-th hep-ex hep-ph nucl-ex],,"28 pages, 8 figures, modified sec. 1 and 2, ne...","{nucl-ex, hep-ph, hep-ex, nucl-th}","hep-ex, hep-ph, nucl-ex, nucl-th"
1905.12953,Non-Resonant Searches for Axion-Like Particles...,"[[Gavela, M. B., ], [No, J. M., ], [Sanz, V., ...",Veronica Sanz,We propose a new collider probe for axion-li...,[hep-ph hep-ex],"Phys. Rev. Lett. 124, 051802 (2020)","10 pages, 7 figures. New version includes inte...","{hep-ph, hep-ex}","hep-ex, hep-ph"
1906.00894,Near threshold $J/\psi$ and $\Upsilon$ photo-p...,"[[Hatta, Yoshitaka, ], [Rajan, Abha, ], [Yang,...",Yoshitaka Hatta,We update our previous calculation of $J/\ps...,[hep-ph],"Phys. Rev. D 100, 014032 (2019)","10 pages, 9 figures",{hep-ph},hep-ph


### Save whichever model is most suitable

In [61]:
with open('model/kmeans.pkl', 'wb') as f:
    pickle.dump(model_1000, f)

In [62]:
with open('model/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [63]:
with open('model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

## Try using model from loaded .pkl file

In [64]:
with open('model/kmeans.pkl', 'rb') as f:
    km = pickle.load(f)

In [65]:
km

MiniBatchKMeans(batch_size=1500, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=1000, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [66]:
with open('model/tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vec = pickle.load(f)

In [67]:
tfidf_vec

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [68]:
km.predict(tfidf_vec.transform(["""
This article puts Marxist and queer theories in conversation with one another to advance thinking about gender and 
sexuality. I argue that Marxist concepts such as class, mode of production, and struggle need to be "borrowed" to 
sharpen queer theory. My arguments are situated in the context of the Philippine literature that has examined the 
local queer experience. Such approach productively re-imagines contemporary discourses and performances of sexuality 
in Philippine society.
"""]))

array([916], dtype=int32)

In [69]:
km.predict(tfidf_vec.transform([abs_1, abs_2, cb_t, cb_v]))

array([697, 697, 221, 916], dtype=int32)

In [70]:
km.predict(tfidf_vec.transform(["""
A galvanic or voltaic cell uses a redox reaction to produce an electric current. It consists of two half cells, with 
each half cell containing an electrode (a metal) in a solution of its own ions (Owen, Ahmed, 2015). One type of 
galvanic cell that is frequently used in labs to gather information and analyze data in electrochemistry is the 
Daniell cell (Lauren, 1965). The Daniell cell consists of a zinc electrode in a solution of zinc sulfate, or 
ZnSO4(aq) (which contains Zn2+ ions) and a copper electrode in a solution of copper (II) sulfate, or CuSO4(aq) 
(which contains Cu2+ ions). The two electrodes are connected via a wire, and the solutions are connected via a salt 
bridge containing a solution of sodium sulfate, or Na2SO4(aq), to allow the flow of ions to complete the circuit 
(Queen’s University, 2014). This would cause the following chemical reactions (1) and (2), resulting in the net ionic 
equation (3)
"""]))

array([221], dtype=int32)

In [71]:
km.predict(tfidf_vec.transform(["""
For guest editor George Henson, it's been a long journey from reading The Front Runner in 1977 in Sapulpa, Oklahoma, to writing about queer lit for World Literature Today. But just as he has found his place here, the writers featured in this issue have taken their place alongside a long list of notable world authors.
"""]))

array([916], dtype=int32)