# Paper Classification

Here, we will explain the method of classifying a paper as a GWAS paper or not a GWAS paper.

In [1]:
from bs4 import BeautifulSoup
import glob
import numpy as np
import pickle
import re

First, we extract features from the paper. In particular, we extract text, which is comprised of the article title, abstract, table captions, and table headers. We also extract three filters based on the year the paper was published, the number of times an RSID appears in the paper's tables, and the number of times an RSID appears in the tables and extracted text.

In [2]:
def extract_features(paper):
    """
    input: name of paper
    output: [text, filters]
    """
    with open(paper) as f:
        soup = BeautifulSoup(f, 'xml')

        ### Text ###
        # Title and Abstract
        text = ""
        article_title = ""
        article_title_tag = soup.find('article-title')
        abstract_tag = soup.find('abstract')
        if article_title_tag is not None:
            text += article_title_tag.get_text().lower() + " "
            article_title = article_title_tag.get_text().lower() + " "
        if abstract_tag is not None:
            text += abstract_tag.get_text().lower()
        else:
            if soup.find('body') is not None:
                body = soup.find('body').get_text().lower()
                # get first 2000 characters from body
                limit = min(len(body), 2000)
                text += body[:limit]

        # Tables
        table_tags = soup.find_all('table-wrapper') + soup.find_all('table-wrap')
        table_titles = ""
        table_headers = ""
        table_data = ""
        for table_tag in table_tags:
            table_title_tag = table_tag.find('caption')
            if table_title_tag is not None:
                table_titles += table_title_tag.get_text().lower() + " "
            table_header = table_tag.find('thead')
            if table_header is not None:
                header_names = table_header.find_all('td')
                for name in header_names:
                    table_headers += name.get_text().lower() + " "
            table_body = table_tag.find('tbody')
            if table_body is not None:
                body_data = table_body.find_all('td')
                for data in body_data:
                    table_data += data.get_text().lower() + " "

        extracted_text = text + table_titles + table_headers

        ### Filters ###
        rsid_regex = re.compile('rs[0-9]+?')
        table_rsid = re.findall(rsid_regex, table_data)
        all_rsid = re.findall(rsid_regex, extracted_text + ' ' + table_data)
        year = soup.find('pub-date').find('year').get_text()

        year_filter = int(year) >= 2006
        weak_rsid_filter = len(all_rsid) > 0
        strong_rsid_filter = len(table_rsid) > 0

        extracted_filters = [year_filter, weak_rsid_filter, strong_rsid_filter]

        return [extracted_text, extracted_filters]

In [8]:
X_text = []
X_filters = []

papers = glob.glob('../data/db/papers/*')

for paper in papers:
    features = extract_features(paper)
    X_text.append(features[0])
    X_filters.append(features[1])

To predict whether a paper is GWAS or not, we use an SVM classifier that has been trained on the title and abstract of papers using a bag-of-words model.

In [9]:
with open('../data/classifiers/classifier.pkl') as f:
    clf = pickle.load(f)
predicted = clf.predict(X_text)

ValueError: Found array with 0 sample(s) (shape=(0, 146291)) while a minimum of 1 is required by the normalize function.

Also, we filter out any paper published before 2006, because those papers are highly unlikely to be GWAS. (The first GWAS paper was published in 2005.)

In [5]:
filters = np.asarray(X_filters)
predicted = np.logical_and(predicted, filters[:,0])

Optionally, we can also filter out papers that do not have any extractable information. The weak filter excludes any papers that don't have an RSID in the title, abstract, or tables. The strong filter excludes any papers that don't have an RSID in the tables.

In [6]:
filter_type = None  # You can change this!
if filter_type == 'weak':
    predicted = np.logical_and(predicted, filters[:,1])
elif filter_type == 'strong':
    predicted = np.logical_and(predicted, filters[:,2])

Write out the results.

In [7]:
def get_title(paper):
    with open(paper) as f:
        text = f.read()
        title_regex = re.compile('<article-title>.+?</article-title>')
        match = re.search(title_regex, text)
        if match is None:
            return ""
        title = match.group(0)[15:-16]
        title = title.replace('<italic>', '').replace('</italic>', '')
        return title

num_gwas_predicted = np.count_nonzero(predicted)
print "number GWAS predicted: {}".format(num_gwas_predicted)
for i, p in enumerate(predicted):
    if p == 1:
        print "{} - {}".format(papers[i], get_title(papers[i]))

number GWAS predicted: 0


Now let's see what happens when we run on a set of 100 random open-access papers.

In [10]:
filter_type = None  # You can change this again!

In [13]:
X_text = []
X_filters = []

papers = glob.glob('../data/db/non-gwas/*')

for paper in papers:
    features = extract_features(paper)
    X_text.append(features[0])
    X_filters.append(features[1])
    
predicted = np.logical_and(predicted, filters[:,0])
if filter_type == 'weak':
    predicted = np.logical_and(predicted, filters[:,1])
elif filter_type == 'strong':
    predicted = np.logical_and(predicted, filters[:,2])

num_gwas_predicted = np.count_nonzero(predicted)
print "number GWAS predicted: {} out of {}".format(num_gwas_predicted, len(predicted))
for i, p in enumerate(predicted):
    if p == 1:
        print "{} - {}".format(papers[i], get_title(papers[i]))

number GWAS predicted: 0 out of 169


 We should expect very few papers to be classified as GWAS, if any.