In [1]:
%matplotlib inline

# import dependencies
import tika
tika.initVM()
from tika import parser

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import matplotlib
from matplotlib import pyplot as plt


# web resources used
# https://www.kaggle.com/itratrahman/nlp-tutorial-using-python
    # ideally parse pdf data into dataframe format for feature engineering?
# https://stackoverflow.com/questions/50985619/how-to-read-pdf-files-which-are-in-asian-languages-chinese-japanese-thai-etc - not that useful but oh well
# https://stackoverflow.com/questions/46389254/how-to-parse-text-extracted-from-pdf-file-with-delimiter-using-python - answer by Grijesh Chauhan
# various other general programming google search results

In [2]:
# regex for dealing with strings of text
import re

In [3]:
# Extract the accepted-papers PDF with Tika and keep the two parts of the
# result under their own names: the document metadata and the extracted text.
parsed = parser.from_file('AAAI-19_Accepted_Papers.pdf')
metadata, content = parsed["metadata"], parsed["content"]

In [4]:
# `content` is a single string: it reads fine when printed, but its repr is
# full of layout characters.
# Cut it wherever a run of whitespace is immediately followed by a digit or
# immediately preceded by one.
# Known weakness: every digit triggers a cut, so text such as "3D" gets split
# as well.
digit_boundary = re.compile(r'\s+(?=\d)|(?<=\d)\s+')
split_at_nums = digit_boundary.split(content)

In [5]:
# Strip non-breaking spaces, newlines and soft hyphens out of every piece,
# then display how many pieces there are.
layout_chars = re.compile('\xa0|\n|\xad')
entries = [layout_chars.sub('', line) for line in split_at_nums]
len(entries)

1179

In [6]:
# https://blog.manash.me/configuring-stanford-parser-and-stanford-ner-tagger-with-nltk-in-python-on-windows-f685483c374a

import nltk
import os

# core nlp needs java 8 to run
# Raw string: the backslashes of this Windows path must be taken literally.
# Without the r-prefix, sequences such as "\P" or "\j" are invalid escapes,
# which Python only tolerates with a deprecation/syntax warning.
java_path = r"C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"
# Exported under JAVAHOME; presumably this is where the Java-backed NLTK
# tagger looks for the executable -- TODO confirm against the NLTK version used.
os.environ['JAVAHOME'] = java_path

# downloaded punkt separately because error text demanded it
# (fetches the NLTK 'punkt' resource; presumably the tokenizer data that
# word_tokenize relies on -- TODO confirm)
nltk.download('punkt')
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Locations of the Stanford NER classifier model and jar.
# The defaults are the paths on the original author's machine. To run the
# notebook elsewhere, set the STANFORD_NER_CLASSIFIER / STANFORD_NER_JAR
# environment variables instead of editing this cell.
stanford_classifier = os.environ.get(
    "STANFORD_NER_CLASSIFIER",
    r"C:\Users\Qiao\Downloads\sdnlp\stanford-ner-2018-10-16\stanford-ner-2018-10-16\classifiers\english.all.3class.distsim.crf.ser.gz",
)
stanford_ner_path = os.environ.get(
    "STANFORD_NER_JAR",
    r"C:\Users\Qiao\Downloads\sdnlp\stanford-ner-2018-10-16\stanford-ner-2018-10-16\stanford-ner.jar",
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Qiao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Build the Stanford NER tagger and sanity-check it on one entry.
# Tagging the entries one at a time is slow, so only a single entry is tried
# here; the whole document is cleaned up and tagged in one pass afterwards.
st = StanfordNERTagger(
    stanford_classifier,
    stanford_ner_path,
    encoding='utf-8',
)

test_text = entries[11]
test_tokenized_text = word_tokenize(test_text)
test_classified_text = st.tag(test_tokenized_text)
print(test_classified_text)

[('81', 'O'), (':', 'O'), ('PhoneMD', 'O'), (':', 'O'), ('Learning', 'O'), ('to', 'O'), ('Diagnose', 'O'), ('Parkinson', 'O'), ("'s", 'O'), ('Disease', 'O'), ('from', 'O'), ('Smartphone', 'ORGANIZATION'), ('Data', 'ORGANIZATION'), ('Patrick', 'ORGANIZATION'), ('Schwab', 'ORGANIZATION'), ('(', 'ORGANIZATION'), ('ETH', 'ORGANIZATION'), ('Zurich', 'ORGANIZATION'), (')', 'O'), ('*', 'O'), (';', 'O'), ('Walter', 'PERSON'), ('Karlen', 'PERSON'), ('(', 'O'), ('ETH', 'O'), ('Zurich', 'LOCATION'), (')', 'O')]


In [8]:
# Remove the layout characters (non-breaking space, newline, soft hyphen)
# from the whole parsed text in one go.
single_file = re.compile('\xa0|\n|\xad').sub('', content)

In [9]:
# Tokenize the cleaned document and run the tagger over all of its tokens.
text = single_file
tokenized_text = word_tokenize(single_file)
classified_text = st.tag(tokenized_text)

In [10]:
# Put the (token, tag) pairs into a two-column frame.  The first 92 pairs are
# skipped so that only the content after the header remains.
body_pairs = classified_text[92:]
df = pd.DataFrame(body_pairs, columns=['tokenized_text', 'tag'])
col_list = df.columns

In [11]:
# One frame of tokens per tag.  Looping over the tagged tuples for every tag
# was too slow, so the categories are selected from the frame instead.
def _tokens_tagged(tag):
    """Return a one-column frame holding the tokens of `df` whose tag is `tag`."""
    return pd.DataFrame(df.loc[df['tag'] == tag, 'tokenized_text'], index=None)


raw_context = _tokens_tagged('O')
raw_persons = _tokens_tagged('PERSON')
raw_org = _tokens_tagged('ORGANIZATION')

In [12]:
# Plain Python lists of the tokens in each category.
context, persons, orgs = (
    frame['tokenized_text'].values.tolist()
    for frame in (raw_context, raw_persons, raw_org)
)

In [13]:
# manual data cleaning phase
# organization tags
# Consecutive ORGANIZATION tokens are joined into one name; a bracket or
# asterisk token closes the name collected so far (the vast majority of
# organizations are assumed to be delimited by round brackets).
# NOTE: `in "()*"` is a substring test, so "", "()" and ")*" also count as
# delimiters, not just the three single characters.
temp_list = []
orgs_list = []
for item in orgs:
    if item not in "()*":
        temp_list.append(item)
    elif temp_list:
        # Only close a non-empty group: back-to-back delimiters would
        # otherwise add empty strings to the result.
        orgs_list.append(' '.join(temp_list))
        temp_list = []
# Flush the tokens collected after the last delimiter; without this the final
# organization is lost whenever the input does not end with a delimiter.
if temp_list:
    orgs_list.append(' '.join(temp_list))
    temp_list = []

In [14]:
# O tags, most titles are categorized under such
# Consecutive O-tagged tokens are joined into one string; a token found in
# "()*^; " closes the string collected so far.
# NOTE: `in "()*^; "` is a substring test, so the empty string and
# multi-character pieces of that literal count as delimiters too.
temp_list = []
cleaned_list = []
for item in context:
    if item not in "()*^; ":
        temp_list.append(item)
    elif temp_list:
        # Closing only non-empty groups means no empty strings are produced,
        # so no separate filtering pass is needed afterwards.
        cleaned_list.append(' '.join(temp_list))
        temp_list = []
# Flush the tokens collected after the last delimiter; without this the final
# group is lost whenever the input does not end with a delimiter.
if temp_list:
    cleaned_list.append(' '.join(temp_list))
    temp_list = []

# Keep only the strings whose first character is a digit.  `cleaned_list`
# stays a real list so it can still be inspected after this cell has run.
context_list = [item for item in cleaned_list if item[0].isdigit()]

In [15]:
# person tags
# Keep PERSON tokens that are longer than one character, contain no "." and
# are not a substring of "()*^; "; this is what discards middle initials.
names_list = [
    item
    for item in persons
    if len(item) > 1 and "." not in item and item not in "()*^; "
]


In [16]:
# Single-column result frames: extracted person names, organizations and
# (roughly) paper titles.
names_df = pd.DataFrame(names_list, columns=['names'])
organizations_df = pd.DataFrame(orgs_list, columns=['organizations'])
titles_df = pd.DataFrame(context_list, columns=['titles'])

In [17]:
# Write the names frame as CSV to a relative path literally called
# 'names_df' (note: the file name has no .csv extension).
names_df.to_csv('names_df')

In [18]:
# Write the organizations frame as CSV to a relative path literally called
# 'organizations_df' (note: the file name has no .csv extension).
organizations_df.to_csv('organizations_df')

In [19]:
# Write the titles frame as CSV to a relative path literally called
# 'titles_df' (note: the file name has no .csv extension).
titles_df.to_csv('titles_df')