# Advanced Text Extraction Challenge #

In [39]:
import pandas as pd
import numpy as np
import arxiv
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm the blanket filter is intended.
warnings.filterwarnings('ignore')

# Load the `en_core_web_md` spaCy pipeline; `nlp` is reused by the parsing
# and vector cells further down.
nlp = spacy.load("en_core_web_md")

In [53]:
# Request up to 3000 arXiv records for the search string "cs" and load them
# into a DataFrame.
# NOTE(review): the combined category counts printed later show only 732 of
# 9000 rows with primary category `cs`, so "cs" appears to be matched as free
# text rather than as a category filter — confirm this is intended.
result = arxiv.query("cs",max_results=3000)

cs = pd.DataFrame(result)

In [54]:
# Request up to 3000 arXiv records for the search string "math" and load them
# into a DataFrame. `result` is overwritten by each fetch cell.
# NOTE(review): the variable name `math` would shadow the standard-library
# `math` module if that were ever imported in this notebook.
result = arxiv.query("math",max_results=3000)

math = pd.DataFrame(result)

In [55]:
# Request up to 3000 arXiv records for the search string "physics" and load
# them into a DataFrame. `result` is overwritten by each fetch cell.
result = arxiv.query("physics",max_results=3000)

physics = pd.DataFrame(result)

In [56]:
# Stack the three query results into one corpus. `ignore_index=True` gives the
# combined frame a fresh 0..n-1 index; without it every row label would occur
# three times (once per source frame), which makes label-based selection and
# index-aligned assignment ambiguous.
df = pd.concat([cs, math, physics], ignore_index=True)

In [57]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(9000, 22)

In [58]:
# List the metadata fields returned by the API before choosing which to keep.
df.columns

Index(['affiliation', 'arxiv_comment', 'arxiv_primary_category', 'arxiv_url',
       'author', 'author_detail', 'authors', 'doi', 'guidislink', 'id',
       'journal_reference', 'links', 'pdf_url', 'published',
       'published_parsed', 'summary', 'summary_detail', 'tags', 'title',
       'title_detail', 'updated', 'updated_parsed'],
      dtype='object')

In [59]:
# Metadata fields that the rest of the analysis never reads.
unused_columns = [
    'affiliation', 'arxiv_comment', 'arxiv_url', 'author_detail', 'doi',
    'guidislink', 'id', 'pdf_url', 'links', 'published', 'summary_detail',
    'title_detail', 'updated', 'updated_parsed', 'journal_reference',
    'tags', 'author', 'authors', 'published_parsed', 'summary', 'title',
]

# Join abstract and title (abstract first, single space between) into one
# free-text field, then discard the unused metadata in the same step.
df = (
    df
    .assign(text=df['summary'] + " " + df['title'])
    .drop(columns=unused_columns)
)

In [60]:
# Character count of each document. The vectorised `.str.len()` accessor
# replaces a Python-level `apply(len)` and yields NaN for a missing text
# instead of raising a TypeError.
df['length'] = df['text'].str.len()

In [61]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,arxiv_primary_category,text,length
0,"{'term': 'astro-ph', 'scheme': 'http://arxiv.o...","Aims. Our goals are to compare the CS, N2H+ an...",1723
1,"{'term': 'astro-ph', 'scheme': 'http://arxiv.o...",Using data from IRAM's Plateau de Bure Interfe...,397
2,"{'term': 'cs.IT', 'scheme': 'http://arxiv.org/...",We consider the problem of recursively reconst...,587
3,"{'term': 'cs.IT', 'scheme': 'http://arxiv.org/...",Compressed sensing (CS) is an important theory...,1637
4,"{'term': 'astro-ph.GA', 'scheme': 'http://arxi...",We present a theoretical study of CS line prof...,787


In [62]:
def _top_level_category(raw):
    """Reduce an arXiv primary-category entry to its top-level archive name.

    Accepts the dict delivered by the API (``{'term': 'cs.IT', ...}``), the
    string form of such a dict, or an already-cleaned string, so the cell can
    be re-run safely. Everything from the first '.' onward is dropped
    (``'astro-ph.GA'`` -> ``'astro-ph'``, ``'cs.IT'`` -> ``'cs'``).
    """
    if isinstance(raw, dict):
        term = str(raw.get('term', ''))
    else:
        term = str(raw)
        marker = "'term': '"
        if marker in term:
            # Stringified dict: keep only the quoted value that follows 'term'.
            term = term.split(marker, 1)[1].split("'", 1)[0]
    return term.split('.', 1)[0]


# Plain string operations are used instead of chained `str.replace` calls:
# those depended on pandas treating the patterns as regular expressions by
# default, which newer pandas versions no longer do.
df['arxiv_primary_category'] = df['arxiv_primary_category'].map(_top_level_category)

In [63]:
# Distinct top-level categories remaining after the cleanup.
df['arxiv_primary_category'].unique()

array(['astro-ph', 'cs', 'quant-ph', 'cond-mat', 'math', 'physics',
       'nucl-ex', 'eess', 'nucl-th', 'hep-th', 'hep-ph', 'math-ph',
       'gr-qc', 'stat', 'nlin', 'q-bio', 'q-alg', 'q-fin', 'hep-lat',
       'chao-dyn', 'funct-an', 'hep-ex', 'econ', 'cmp-lg', 'solv-int',
       'adap-org', 'alg-geom', 'dg-ga', 'atom-ph', 'acc-phys', 'chem-ph',
       'plasm-ph'], dtype=object)

In [64]:
# Number of papers per top-level category, most frequent first.
df['arxiv_primary_category'].value_counts()

math        2610
physics     1092
math-ph      762
hep-ph       747
cs           732
hep-th       612
astro-ph     579
cond-mat     527
quant-ph     500
hep-ex       245
gr-qc        152
hep-lat      105
nucl-th       71
nlin          60
stat          38
eess          36
nucl-ex       29
q-bio         22
cmp-lg        16
q-alg         16
solv-int      11
dg-ga         10
funct-an       8
alg-geom       6
chao-dyn       3
chem-ph        3
q-fin          2
econ           2
atom-ph        1
adap-org       1
plasm-ph       1
acc-phys       1
Name: arxiv_primary_category, dtype: int64

In [67]:
def _has_dependency_parse(doc):
    """Return True when `doc` carries a dependency parse.

    Newer spaCy releases expose this as `Doc.has_annotation("DEP")` and no
    longer provide `Doc.is_parsed`; use the new API when it exists and fall
    back to the old attribute otherwise.
    """
    if hasattr(doc, 'has_annotation'):
        return doc.has_annotation('DEP')
    return doc.is_parsed


tokens = []
lemma = []
pos = []

# Stream the corpus through the pipeline in batches of 50 documents.
# `n_threads` is no longer passed: later spaCy versions ignore it and then
# reject it as an unknown keyword.
for doc in nlp.pipe(df['text'].astype(str).values, batch_size=50):
    if _has_dependency_parse(doc):
        tokens.append([tok.text for tok in doc])
        lemma.append([tok.lemma_ for tok in doc])
        pos.append([tok.pos_ for tok in doc])
    else:
        # Keep row alignment with `df` even when a document was not parsed.
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

# One list (or None) per row, in the same order as `df`.
df['tokens'] = tokens
df['lemma'] = lemma
df['pos'] = pos

In [68]:
# Document vector for every text. Batching through `nlp.pipe` avoids invoking
# the full pipeline once per row via `apply`. Wrapping the result in a Series
# that shares `df.index` keeps one vector object per row.
df['text_parsed'] = pd.Series(
    [doc.vector for doc in nlp.pipe(df['text'], batch_size=50)],
    index=df.index,
)

In [69]:
# Missing values per column; a None appended for an unparsed document in the
# tokenization step would show up here.
df.isnull().sum()

arxiv_primary_category    0
text                      0
length                    0
tokens                    0
lemma                     0
pos                       0
text_parsed               0
dtype: int64

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [78]:
# Bag-of-words counts with English stop words removed. The result is kept
# sparse: densifying a documents x vocabulary matrix allocates a cell for
# every (document, term) pair, and only `.shape` is read here.
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(df['text'])
vectors.shape

(9000, 27740)

In [79]:
# Vocabulary terms in column order, used to turn topic weights into words.
# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn; use whichever the installed version provides.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
vocab = np.array(list(_get_names()))

In [80]:
# TF-IDF weights; terms appearing in more than half of the documents are
# dropped (max_df=0.5), nothing is dropped for rarity (min_df=1).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=1)
tfidf = tfidf_vectorizer.fit_transform(df['text'])

# Topic words are looked up in `vocab`, which was built from the plain
# CountVectorizer. If `max_df` ever prunes a term, the TF-IDF columns would no
# longer line up with `vocab` and every topic would be mislabelled — fail
# loudly instead.
assert tfidf_vectorizer.vocabulary_ == vectorizer.vocabulary_, (
    "TF-IDF vocabulary differs from the CountVectorizer vocabulary used for `vocab`"
)

tfidf.shape

(9000, 27740)

In [81]:
num_top_words=8

def show_topics(a, feature_names=None, top_n=None):
    """Describe each topic by its highest-weighted vocabulary terms.

    Parameters
    ----------
    a : 2-D array-like
        One row per topic, one column per vocabulary term (for example the
        `components_` of a fitted decomposition).
    feature_names : sequence of str, optional
        Term for each column of `a`. Defaults to the notebook-level `vocab`.
    top_n : int, optional
        Number of terms to show per topic. Defaults to the notebook-level
        `num_top_words`.

    Returns
    -------
    list of str
        One space-separated string per topic, terms ordered from the largest
        weight to the smallest.
    """
    names = vocab if feature_names is None else np.asarray(feature_names)
    limit = num_top_words if top_n is None else top_n

    topics = []
    for weights in a:
        # argsort is ascending; reverse it and keep the first `limit` indices.
        best = np.argsort(weights)[::-1][:limit]
        topics.append(' '.join(names[i] for i in best))
    return topics

### Latent Semantic Analysis ###

In [82]:
# LSA with 3 components. TruncatedSVD's default solver is randomized, so the
# seed is fixed to make the topics reproducible, as the LDA and NMF cells
# already do.
lsa = TruncatedSVD(n_components=3, random_state=0)
W1 = lsa.fit_transform(tfidf)   # document-by-component scores
H1 = lsa.components_            # component-by-term loadings

In [83]:
# Top terms of each LSA component from the 3-component fit.
show_topics(H1)

['physics quantum cs theory new model physical field',
 'physics lhc standard new review collider particle neutrino',
 'quantum theory physical mechanics group algebras field groups']

In [85]:
# LSA with 5 components. TruncatedSVD's default solver is randomized, so the
# seed is fixed to make the topics reproducible, as the LDA and NMF cells
# already do.
lsa = TruncatedSVD(n_components=5, random_state=0)
W1 = lsa.fit_transform(tfidf)   # document-by-component scores
H1 = lsa.components_            # component-by-term loadings

In [86]:
# Top terms of each LSA component from the 5-component fit.
show_topics(H1)

['physics quantum cs theory new model physical field',
 'physics lhc standard new review collider particle neutrino',
 'quantum theory physical mechanics group algebras space groups',
 'sensing sparse signal reconstruction compressive compressed algorithm recovery',
 'quantum physical mechanics sensing theory signal reconstruction sparse']

In [87]:
# LSA with 10 components. TruncatedSVD's default solver is randomized, so the
# seed is fixed to make the topics reproducible, as the LDA and NMF cells
# already do.
lsa = TruncatedSVD(n_components=10, random_state=0)
W1 = lsa.fit_transform(tfidf)   # document-by-component scores
H1 = lsa.components_            # component-by-term loadings

In [88]:
# Top terms of each LSA component from the 10-component fit.
show_topics(H1)

['physics quantum cs theory new model physical field',
 'physics lhc standard new review collider particle neutrino',
 'quantum theory physical mechanics group algebras field groups',
 'sensing sparse signal reconstruction compressive compressed algorithm recovery',
 'quantum physical mechanics sensing theory signal reconstruction sparse',
 'gauge theory field chern simons theories lattice gravity',
 'physical molecular equations gas star emission theory theories',
 'theory group groups gauge chern field simons cs',
 'physical quark lattice qcd flavor heavy algebras groups',
 'model standard new quantum decays cp violation quark']

### Latent Dirichlet Allocation ###

In [89]:
# Fit a 3-topic LDA model (seeded) on the TF-IDF matrix, then read off the
# per-document topic mixture and the per-topic term weights.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights — confirm that TF-IDF input is intended here.
lda = LatentDirichletAllocation(n_components=3, random_state=0).fit(tfidf)

W1 = lda.transform(tfidf)
H1 = lda.components_

In [90]:
# Top terms of each topic from the 3-topic LDA fit.
show_topics(H1)

['physics new standard review quark model particle qcd',
 'quantum theory physical cs space paper group field',
 'cs molecular magnetic atoms gas _2 atomic state']

In [92]:
# Fit a 5-topic LDA model (seeded) on the TF-IDF matrix, then read off the
# per-document topic mixture and the per-topic term weights.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights — confirm that TF-IDF input is intended here.
lda = LatentDirichletAllocation(n_components=5, random_state=0).fit(tfidf)

W1 = lda.transform(tfidf)
H1 = lda.components_

In [93]:
# Top terms of each topic from the 5-topic LDA fit.
show_topics(H1)

['physics new standard model review particle lhc quark',
 'cs sensing sparse reconstruction based signal algorithm compressive',
 'cs molecular magnetic _2 atoms state gas spin',
 'walk withdrawn holm camassa balls park ph berwald',
 'quantum theory space group physical equations functions dimensional']

In [94]:
# Fit a 10-topic LDA model (seeded) on the TF-IDF matrix, then read off the
# per-document topic mixture and the per-topic term weights.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights — confirm that TF-IDF input is intended here.
lda = LatentDirichletAllocation(n_components=10, random_state=0).fit(tfidf)

W1 = lda.transform(tfidf)
H1 = lda.components_

In [95]:
# Top terms of each topic from the 10-topic LDA fit.
show_topics(H1)

['physics new lhc standard neutrino collider model review',
 'group groups prove manifolds spaces surfaces theorem algebras',
 'hera spinor zeus h1 lep2 emphasizing h2s ep',
 'hodge balls morita bernoulli momenta snowmass symmetrized symposium',
 'adic jordan car adelic riesz dyson isomorphisms brs',
 'tetraquark lefschetz clic topos dinger tesla camassa holm',
 'toda superalgebras majorana hierarchies presentations lefschetz lie yangian',
 '_4 _2 fe triangular cucl antiferromagnet _3 t_',
 'cs quantum physics theory physical field model systems',
 'calogero encyclopedia mc dglap ia quarkonium jack elsevier']

### Non-negative Matrix Factorization ###

In [96]:
# 3-component NMF on the TF-IDF matrix, seeded for reproducibility.
# `clf` holds the fitted NMF model; W1 and H1 are overwritten by every
# model-fitting cell in this notebook.
clf = NMF(n_components=3, random_state=1)

# W1: value returned by fit_transform; H1: the model's components_, which is
# what show_topics is called with in the next cell.
W1 = clf.fit_transform(tfidf)
H1 = clf.components_

In [97]:
# Top terms of each component from the 3-component NMF fit.
show_topics(H1)

['cs sensing signal sparse reconstruction based data compressive',
 'physics new standard model lhc particle review quark',
 'quantum theory physical field space group dimensional gauge']

In [98]:
# 5-component NMF on the TF-IDF matrix, seeded for reproducibility.
# `clf` holds the fitted NMF model; W1 and H1 are overwritten by every
# model-fitting cell in this notebook.
clf = NMF(n_components=5, random_state=1)

# W1: value returned by fit_transform; H1: the model's components_, which is
# what show_topics is called with in the next cell.
W1 = clf.fit_transform(tfidf)
H1 = clf.components_

In [99]:
# Top terms of each component from the 5-component NMF fit.
show_topics(H1)

['sensing cs signal sparse reconstruction compressive compressed based',
 'physics new standard model lhc particle review collider',
 'group groups space functions equations algebras dimensional algebra',
 'cs molecular spin magnetic gas _2 atoms states',
 'quantum physical theory mechanics field systems states classical']

In [100]:
# 10-component NMF on the TF-IDF matrix, seeded for reproducibility.
# `clf` holds the fitted NMF model; W1 and H1 are overwritten by every
# model-fitting cell in this notebook.
clf = NMF(n_components=10, random_state=1)

# W1: value returned by fit_transform; H1: the model's components_, which is
# what show_topics is called with in the next cell.
W1 = clf.fit_transform(tfidf)
H1 = clf.components_

In [101]:
# Top terms of each component from the 10-component NMF fit.
show_topics(H1)

['sensing cs signal sparse reconstruction compressive compressed based',
 'physics particle lhc neutrino collider future energy students',
 'group groups algebras algebra space finite functions lie',
 'cs molecular gas _2 atoms magnetic line molecules',
 'quantum mechanics theory classical states systems relativistic state',
 'theory field gauge chern simons theories gravity cs',
 'physical systems mathematical space time information laws models',
 'equations equation solutions differential nonlinear solution wave systems',
 'quark lattice qcd flavor heavy recent results spin',
 'model new standard decays cp violation physics electroweak']