In [3]:
# Modules
import xml.etree.ElementTree as ET
import random
import os
import glob
import pandas as pd
import nltk
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

  from numpy.core.umath_tests import inner1d


In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Local
import parsing_xml as px

In [13]:
exml = ET.parse('tests/latexmled_files/math.0407523.xml')
ns = {'latexml': 'http://dlmf.nist.gov/LaTeXML' }
para_lst = exml.findall('.//latexml:para', ns)
para_text = [px.recutext1(p) for p in para_lst]

In [16]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in para_text] 

In [36]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [39]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

In [40]:
for l in ldamodel.print_topics(num_topics=3, num_words=3):
    print(l)

(0, '0.023*"extension" + 0.023*"system" + 0.023*"coherent"')
(1, '0.022*"coherent" + 0.022*"system" + 0.018*"follows"')
(2, '0.034*"value" + 0.024*"suppose" + 0.023*"critical"')


### Title:
#### ON THE GEOMETRY OF MODULI SPACES OF COHERENT SYSTEMS ON ALGEBRAIC CURVES

In [12]:
#prepare the dataset
allData = pd.DataFrame()
with open('data/out_defs.txt','r') as f1:
    all_data_texts = f1.readlines()
all_data_labels = len(all_data_texts)*[1.0]
with open('data/out_rand.txt', 'r') as f2:
    all_data_texts_rand = f2.readlines()
all_data_texts += all_data_texts_rand
all_data_labels += len(all_data_texts_rand)*[0.0]

# 1.0 will represent definitions is true 0.0 means it is false (not a definition)
allData['labels'] = all_data_labels
allData['texts'] = all_data_texts

In [18]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(allData['texts'], allData['labels'])

In [63]:
train_x.head(25)

168    there is a homomorphism           which takes ...
183    laboratoire j.–a. dieudonné,   université de n...
0             une orbifolde pure est un espace analyt...
16     the functor             is the covariant funct...
75     the functor                                   ...
28              [ in the special case that ] has a ge...
82            let  be a subscheme of . then a resolut...
101    given a vector bundle         on , a sub-bundl...
150                                                   \n
237           soit  une surface  lisse, et  un divise...
208    in the next paragraph, we just need that      ...
177                                                   \n
203    we keep the same notation as in the proof of t...
40     fix         and a -graded -module . the euler–...
130    by morita equivalence,         is isomorphic t...
94     a semi-stable curve           is called stable...
115    let           be a semisimple conjugacy class ...
163    this work is devoted to 

In [40]:
count_vect = CountVectorizer(analyzer='word', tokenizer=nltk.word_tokenize, ngram_range=(1,2))
count_vect.fit(allData['texts'])
xtrain = count_vect.transform(train_x)
xtest = count_vect.transform(test_x)

In [41]:
clf = naive_bayes.MultinomialNB().fit(xtrain, train_y)
predictions = clf.predict(xtest)
print(metrics.classification_report(predictions,test_y))

             precision    recall  f1-score   support

        0.0       0.61      0.87      0.71        23
        1.0       0.90      0.67      0.76        39

avg / total       0.79      0.74      0.75        62



In [42]:
print(metrics.confusion_matrix(predictions,test_y))

[[20  3]
 [13 26]]


In [54]:
out_file = open('../out_rand.txt','a')
ns = {'latexml': 'http://dlmf.nist.gov/LaTeXML' }
for f in glob.glob('../2004_ends_with_3/*/*.xml'):
    try:
        exml = ET.parse(f)
        para_lst_nonrand = exml.findall('.//latexml:para',ns)
        para_lst = random.sample(para_lst_nonrand, 2)
        for p in para_lst:
            out_file.write(px.recutext1(p) + "\n")
    except ET.ParseError:
        pass
    except ValueError:
        print('article %s has few paragraphs'%f)
out_file.close()

article ../2004_ends_with_3/math.0412073/psfrag.xml has few paragraphs
article ../2004_ends_with_3/math.0412073/bibliography.xml has few paragraphs
article ../2004_ends_with_3/math.0412073/preamble.xml has few paragraphs
article ../2004_ends_with_3/math.0412533/gtoutput.xml has few paragraphs
