# CellChar: A SVM-based characterization method for cellular lines using text processing

## Iván Carrera, Eduardo Tejera, and Inês Dutra
### Departamento de Informática y Ciencias de la Computación, Escuela Politécnica Nacional. Quito, Ecuador.
### Departamento de Ciencia de Computadores, Universidade do Porto, Portugal.
### Grupo de Quimio-Bioinformática, Universidad de Las Américas. Quito, Ecuador.

### Programa de Doutoramento em Ciência de Computadores FCUP.

#### Characterizing Cellular lines
The goal of this work is to characterize cellular lines by processing the related scientific literature.

##### Web mining
There are two main source databases for cellular lines: [Cellosaurus](https://web.expasy.org/cellosaurus/)
and [ChEMBL](https://www.ebi.ac.uk/chembl/).
First of all, we have to identify the cellular lines. The Cellosaurus file contains a list of cellular lines, together with their synonyms.

Processing Cellosaurus v34

In [None]:
cellosaurus_file = 'data/cellosaurus/cellosaurus.xml'
print('Cellosaurus file:', cellosaurus_file)

# Print a fixed window of the raw file (0-based line indices 107..134).
preview_indices = range(107, 135)
with open(cellosaurus_file, 'r') as file_:
    lines = file_.readlines()
    for line_index in preview_indices:
        # Only newline characters are stripped so the XML indentation stays visible.
        print(lines[line_index].strip('\n'))

import xml.etree.ElementTree as ET
tree = ET.parse(cellosaurus_file)
root = tree.getroot()

# Build one record per <cell-line> element, keyed by the text of its first
# <accession> element.
cell_dict = {}
for entry in root.find('cell-line-list').findall('cell-line'):
    accession = entry.find('accession-list').find('accession')
    names = [name_node.text for name_node in entry.find('name-list').findall('name')]
    species_ids = [term.attrib['accession']
                   for term in entry.find('species-list').findall('cv-term')]

    # <reference-list> is optional: entries without it keep an empty list.
    pubmed_ids = []
    reference_node = entry.find('reference-list')
    if reference_node is not None:
        for ref in reference_node.findall('reference'):
            internal_ref = ref.attrib['resource-internal-ref']
            # Keep PubMed references only, stored as the bare identifier.
            if 'PubMed' in internal_ref:
                pubmed_ids.append(internal_ref.replace('PubMed=', ''))

    cell_dict[accession.text] = {
        'accession': accession.text,
        'cell_name': names,
        'species': species_ids,
        'reference': pubmed_ids,
    }

print('\nCell line examples:')
for cell_ in list(cell_dict)[:2]:
    print(cell_, cell_dict[cell_])

print('\nCellosaurus contains information about', len(cell_dict), 'cellular lines')

The goal of this work is to characterize cellular lines from processing the related scientific literature.
In this context, we can understand that verified references are those that appear in cellosaurus as reference.
This will be our _Ground Truth_.

In [None]:
def get_cells_with_min_gtruth_refs(reference_min=0):
    """Return the keys of ``cell_dict`` having at least ``reference_min`` references.

    Reads the module-level ``cell_dict`` and compares the length of each
    record's ``'reference'`` list against ``reference_min``.

    Args:
        reference_min: Minimum length of the ``'reference'`` list (inclusive).

    Returns:
        list of matching keys, in ``cell_dict`` iteration order.
    """
    return [
        accession
        for accession, record in cell_dict.items()
        if len(record['reference']) >= reference_min
    ]

# Report how many cell lines remain for each minimum-reference cutoff 0..19.
# After the loop, gtruth_idx holds the selection for the last cutoff (19).
gtruth_idx = []
for ref_cutoff in range(20):
    gtruth_idx = get_cells_with_min_gtruth_refs(reference_min=ref_cutoff)
    print(len(gtruth_idx), 'cell lines with', ref_cutoff, 'or more references')

Now, using Entrez API, we can check how many references in PubMed are related to the cellular lines.
First, we transform the list of names for a cellular line into a query for the PubMed search API.

In [None]:
import parse
# Show the name list and the derived search query for the first four cell lines.
for accession in gtruth_idx[:4]:
    names = cell_dict[accession]['cell_name']
    print(accession, names)
    print(parse.list_toquery(names))

We can see that some cell lines have purely numeric names. These numeric names should be removed from the name lists
because they can cause search queries to return false positives.

In [None]:
def remove_numeric_names(namelist):
    """Return the names from ``namelist`` that are not purely numeric.

    Each name is passed through ``parse.replace_spchars``, converted to
    ``str`` and stripped of spaces before the ``isnumeric()`` test. The
    names that survive are returned unmodified, in their original order.

    Args:
        namelist: Iterable of cell line names.

    Returns:
        A new list containing the non-numeric names.
    """
    def _is_numeric(name):
        cleaned = str(parse.replace_spchars(name)).replace(' ','')
        return cleaned.isnumeric()

    return [name for name in namelist if not _is_numeric(name)]

# Strip numeric names from every cell line and remember the cell lines that
# end up with no usable name at all.
list_to_pop = list()
for cell_ in cell_dict:
    newlist = remove_numeric_names(cell_dict[cell_]['cell_name'])
    if not newlist:
        list_to_pop.append(cell_)
    cell_dict[cell_]['cell_name'] = newlist

print('There are',len(list_to_pop), 'cell lines with only numeric names.')

# Removal happens after the loop above because a dict cannot change size while
# it is being iterated. pop(key, None) tolerates a missing key without a bare
# `except:` that would also swallow unrelated errors (e.g. KeyboardInterrupt).
for cell_ in list_to_pop:
    cell_dict.pop(cell_, None)

gtruth_idx = get_cells_with_min_gtruth_refs(1)

print('We are left with', len(cell_dict), 'cell lines.')
print(len(gtruth_idx),'cell lines with', 1, 'or more references')

We have removed numeric names from cell lines.

Now, using Entrez API, we can check how many references in PubMed are related to the cellular lines.
First, we transform the list of names for a cellular line into a query for the PubMed search API.

In [None]:
# Show the cleaned name list and the derived search query for the first four cell lines.
for accession in gtruth_idx[:4]:
    names = cell_dict[accession]['cell_name']
    print(accession, names)
    print(parse.list_toquery(names))


We use that query to retrieve PMIDs through the Entrez API.

In [None]:
# For the first ten cell lines, compare the PMIDs returned by the search with
# the references stored in cell_dict (the Ground Truth).
separator = '_'*80
for cell_ in gtruth_idx[:10]:
    record = cell_dict[cell_]
    print(separator)
    list_ = record['cell_name']
    print(cell_, list_)
    query = parse.list_toquery(list_)
    print(query)
    idlist = parse.search_idlist(query)
    gtruth_refs = record['reference']
    print(cell_, len(idlist), 'references in PubMed')
    print(cell_, len(gtruth_refs), 'references in Ground Truth')
    # Ground Truth references that the search also returned.
    comm_ref = list(set(gtruth_refs) & set(idlist))
    print(cell_, len(comm_ref), 'references in Ground Truth are in PubMed')

Now, we can store abstracts to disk.
We will store the retrieved abstracts in two directories: one for Ground Truth, and another for PubMed references.

In [None]:
# Process only the first ten cell lines here.
for accession in gtruth_idx[:10]:
    record = cell_dict[accession]
    parse.process_gt_pm(record)

# Uncomment the lines below to process every cell line in parallel.
# The iterable is cell_dict.values() because process_gt_pm receives a cell
# record (as in the loop above), not an accession key.
# import multiprocessing
# pool = multiprocessing.Pool(processes=8)
# pool.map(func=parse.process_gt_pm, iterable=cell_dict.values())

We now have a full database with retrieved abstracts.
However, for compatibility and standard processing, we should parse json files into two CSV files.

In [None]:
import os
import json
import pandas as pd

def _load_document_frames(json_paths):
    """Return one DataFrame per JSON file, built from its 'documents' entry.

    Each file is opened in a ``with`` block so the handle is closed as soon
    as the file has been parsed.
    """
    frames = []
    for json_path in json_paths:
        with open(json_path, 'r') as handle:
            payload = json.load(handle)
        frames.append(pd.DataFrame.from_dict(payload['documents']))
    return frames


def _combine_document_frames(frames):
    """Concatenate the per-file frames once, after an empty frame carrying the expected columns."""
    template = pd.DataFrame(columns=['title', 'index', 'document', 'cell_id'])
    # One concat over all frames avoids re-copying the growing table per file.
    return pd.concat([template] + frames).reset_index(drop=True)


# Ground Truth: one JSON file per cell line in gtruth_idx; missing files are skipped.
gtruth_dir = 'data/cell_json_gt/'
gtruth_paths = [gtruth_dir + cell_ + '.json' for cell_ in gtruth_idx]
gtruth_frames = _load_document_frames(path for path in gtruth_paths if os.path.exists(path))
gtruth_df = _combine_document_frames(gtruth_frames)

print(len(gtruth_frames), 'files in', gtruth_dir)
gtruth_df.to_csv(path_or_buf='data/gt_df.csv', sep=',', header=True, index=True)

# PubMed references: every file found in the directory is loaded.
pm_dir = 'data/cell_json_pm/'
pm_filelist = os.listdir(pm_dir)

reference_frames = _load_document_frames(pm_dir + reference_file for reference_file in pm_filelist)
reference_df = _combine_document_frames(reference_frames)

print(len(reference_frames), 'files in', pm_dir)
reference_df.to_csv(path_or_buf='data/pm_df.csv', sep=',', header=True, index=True)

If you don't want to have to retrieve all that information, you can read directly from the CSV file.

In [None]:
import pandas as pd

gtruth_file = 'data/gt_df.csv'
reference_file = 'data/pm_df.csv'


def _read_corpus_csv(csv_path):
    """Read a corpus CSV, skipping malformed rows with a warning.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``. The new keyword is tried first and the legacy
    one is used only when this pandas version rejects it.
    """
    # NOTE(review): the files are decoded as latin-1; if they were written by
    # DataFrame.to_csv with its default encoding this may mis-decode
    # non-ASCII characters — confirm against how the CSVs were produced.
    common_kwargs = dict(sep=',', encoding="latin-1", index_col=0, header=0)
    try:
        return pd.read_csv(csv_path, on_bad_lines='warn', **common_kwargs)
    except TypeError:
        # Older pandas: unknown keyword `on_bad_lines`.
        return pd.read_csv(csv_path, error_bad_lines=False, **common_kwargs)


gtruth_df = _read_corpus_csv(gtruth_file)
reference_df = _read_corpus_csv(reference_file)

print('GTruth Document table has information on', gtruth_df.shape[0], 'documents of',
      gtruth_df['cell_id'].unique().shape[0], 'cell lines')
print('Reference Document table has information on', reference_df.shape[0], 'documents of',
      reference_df['cell_id'].unique().shape[0], 'cell lines')

Now that we have loaded the corpus, we can process it.
First, we keep only the cell lines that have more than two documents in each table and that appear in both the reference and the ground truth tables.

In [None]:
# Keep only the cell lines with more than two documents, separately per table.
gt_count = gtruth_df.groupby(by=['cell_id'])['cell_id'].count()
gt_keep = gt_count[gt_count > 2].index
gtruth_df = gtruth_df[gtruth_df['cell_id'].isin(gt_keep)]

pm_count = reference_df.groupby(by=['cell_id'])['cell_id'].count()
pm_keep = pm_count[pm_count > 2].index
reference_df = reference_df[reference_df['cell_id'].isin(pm_keep)]

# Restrict both tables to the cell lines they have in common.
gtruth_cells = gtruth_df['cell_id'].unique()
reference_cells = reference_df['cell_id'].unique()
cell_intersection = set(gtruth_cells) & set(reference_cells)

print('Datasets have', len(cell_intersection), 'common cell lines')

in_both_gt = gtruth_df['cell_id'].isin(cell_intersection)
in_both_pm = reference_df['cell_id'].isin(cell_intersection)
gtruth_df = gtruth_df[in_both_gt].reset_index(drop=True)
reference_df = reference_df[in_both_pm].reset_index(drop=True)

print('GTruth Document table has information on', len(gtruth_df), 'documents of',
      len(gtruth_df['cell_id'].unique()), 'cell lines')
print('Reference Document table has information on', len(reference_df), 'documents of',
      len(reference_df['cell_id'].unique()), 'cell lines')

# From here on reference_df holds the PubMed rows followed by the Ground Truth rows.
reference_df = pd.concat([reference_df, gtruth_df]).reset_index(drop=True)
pm_count = reference_df.groupby(by=['cell_id'])['cell_id'].count()
print('Every cell line has at least', min(pm_count), 'documents')


We define _crossval_ method for processing the dataframe.
The _crossval_ method keeps only the cell lines that have at least _min_references_ documents (and fewer than 500).

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from nltk.stem.porter import PorterStemmer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer

def crossval(df, min_references, stemmed=False):
    """Grid-search an SVC over a TF-IDF representation of the documents.

    Keeps the cell lines with at least ``min_references`` and fewer than 500
    documents, vectorizes the ``document`` column with TF-IDF and runs a
    ``GridSearchCV`` (``cv=10``) over ``C`` in {1, 10} and ``kernel`` in
    {linear, rbf}, refitting on micro-averaged recall.

    Args:
        df: DataFrame with at least the columns ``cell_id`` (class label)
            and ``document`` (text).
        min_references: Minimum number of documents a cell line needs to be kept.
        stemmed: When true, remove ``. , ; :`` and Porter-stem every
            whitespace-separated token before vectorizing.

    Returns:
        dict with ``'clf'`` (the fitted ``GridSearchCV``), ``'estimator'``
        (its ``best_estimator_``) and ``'pm_count'`` (document count per
        retained cell line).
    """
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    df = df[df.cell_id.isin(pm_count[pm_count >= min_references].index)]
    df = df[df.cell_id.isin(pm_count[pm_count < 500].index)]
    print(df.shape[0], 'abstracts in corpus')
    # Recount after filtering; this is the count that gets returned.
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    print('Every cell line has at least', min(pm_count), 'documents')

    if stemmed:
        stemmer = PorterStemmer()
        stemmed_corpus = list()
        for doc in df['document']:
            doc = doc.replace('.', '').replace(',', '').replace(';', '').replace(':', '')
            stemmed_corpus.append(' '.join([stemmer.stem(x) for x in doc.split()]))
        corpus = stemmed_corpus
    else:
        corpus = df['document']

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=1.0, min_df=2, strip_accents='ascii', stop_words='english',
                                 token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
    dtm = vectorizer.fit_transform(corpus)
    print('DTM shape:', dtm.shape)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides. The
    # terms are converted to plain str so the printed output looks the same
    # with either accessor.
    list_features = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    reference_vocab = [str(term) for term in list_features()]
    wordfreq = dict(zip(reference_vocab, vectorizer.idf_))
    print('Top 20 words')
    # Ascending IDF: the 20 terms with the lowest inverse document frequency.
    print(sorted(wordfreq.items(), key=lambda x: x[1])[:20])
    print(len(reference_vocab), 'words')

    X, y = dtm, df['cell_id']
    scoring = ['precision_micro', 'recall_micro']
    param_grid = {'C': [1, 10], 'kernel': ('linear', 'rbf')}
    svc = SVC(decision_function_shape='ovr')
    clf = GridSearchCV(estimator=svc, param_grid=param_grid, return_train_score=True, cv=10, n_jobs=-1,
                       scoring=scoring, refit='recall_micro', verbose=1)
    # NOTE(review): `groups` only matters for group-aware CV splitters; with
    # the integer cv used here it is presumably ignored — confirm.
    clf = clf.fit(X=X, y=y, groups=df['cell_id'])
    return {'clf': clf, 'estimator': clf.best_estimator_, 'pm_count': pm_count}


# Run the grid search on the non-stemmed corpus for three document cutoffs.
results_min_ref = {}
for min_ref in (10, 50, 100):
    print('_' * 80)
    print(min_ref, 'minimum references')
    results_min_ref[min_ref] = crossval(reference_df, min_ref, stemmed=False)

We plot micro-averaged precision for Non-Stemmed corpus.
These results show that the best estimator is _C:1-kernel:linear_.

In [None]:
clf_10, clf_50, clf_100 = (results_min_ref[cutoff]['clf'] for cutoff in (10, 50, 100))

min_refs = [10, 50, 100]
# One label/marker per grid-search candidate, indexed like cv_results_.
labels = ['C:1-kernel:linear', 'C:1-kernel:rbf', 'C:10-kernel:linear', 'C:10-kernel:rbf']
markers = ['^', '.', 'v', 's']

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.ylabel('Micro-Averaged Precision')
plt.xlabel('Minimum number of documents')
searches = (clf_10, clf_50, clf_100)
for i in range(4):
    x = pd.array(min_refs)
    # Mean and standard deviation of the test precision for candidate i at each cutoff.
    y = tuple(search.cv_results_['mean_test_precision_micro'][i] for search in searches)
    e = tuple(search.cv_results_['std_test_precision_micro'][i] for search in searches)
    plt.errorbar(x, y, e, linestyle='None', marker=markers[i], label=labels[i], capsize=3)
    plt.legend(loc='upper left')

plt.savefig('data/img/precision_.png')
plt.show()
plt.close()

We repeat the process for stemmed corpus.
These results show that the best estimator is _C:1-kernel:linear_.

In [None]:
# Repeat the grid search on the stemmed corpus for the same three cutoffs.
results_min_ref = {}
for min_ref in (10, 50, 100):
    print('_' * 80)
    print(min_ref, 'minimum references')
    results_min_ref[min_ref] = crossval(reference_df, min_ref, stemmed=True)

clf_10, clf_50, clf_100 = (results_min_ref[cutoff]['clf'] for cutoff in (10, 50, 100))

min_refs = [10, 50, 100]
# One label/marker per grid-search candidate, indexed like cv_results_.
labels = ['C:1-kernel:linear', 'C:1-kernel:rbf', 'C:10-kernel:linear', 'C:10-kernel:rbf']
markers = ['^', '.', 'v', 's']

plt.figure(figsize=(10, 6))
plt.ylabel('Micro-Averaged Precision')
plt.xlabel('Minimum number of documents')
searches = (clf_10, clf_50, clf_100)
for i in range(4):
    x = pd.array(min_refs)
    # Mean and standard deviation of the test precision for candidate i at each cutoff.
    y = tuple(search.cv_results_['mean_test_precision_micro'][i] for search in searches)
    e = tuple(search.cv_results_['std_test_precision_micro'][i] for search in searches)
    plt.errorbar(x, y, e, linestyle='None', marker=markers[i], label=labels[i], capsize=3)
    plt.legend(loc='upper left')

plt.savefig('data/img/precision_stemmed.png')
plt.show()
plt.close()

Now we define a pipeline for calculating Micro-Averaged precision varying the minimum number of documents per class.
We plot micro-averaged precision for Non-Stemmed corpus.

In [None]:
from sklearn.model_selection import cross_validate

def min_ref_pipeline(df, min_references, stemmed=False):
    """Cross-validate a LinearSVC on TF-IDF features selected with SelectFromModel.

    Keeps the cell lines with at least ``min_references`` and fewer than 500
    documents, vectorizes the ``document`` column with TF-IDF, selects
    features with ``SelectFromModel(threshold='median')`` and evaluates a
    ``LinearSVC`` with ``cross_validate`` (``cv=10``).

    Args:
        df: DataFrame with at least the columns ``cell_id`` (class label)
            and ``document`` (text).
        min_references: Minimum number of documents a cell line needs to be kept.
        stemmed: When true, remove ``. , ; :`` and Porter-stem every
            whitespace-separated token before vectorizing.

    Returns:
        dict with ``'clf'`` (the unfitted ``LinearSVC`` handed to
        ``cross_validate``), ``'results'`` (the ``cross_validate`` output)
        and ``'pm_count'`` (document count per retained cell line).
    """
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    df = df[df.cell_id.isin(pm_count[pm_count >= min_references].index)]
    df = df[df.cell_id.isin(pm_count[pm_count < 500].index)]
    print(df.shape[0], 'abstracts in corpus')
    # Recount after filtering; this is the count that gets returned.
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    print(pm_count.shape[0], 'cell lines')
    print('Every cell line has at least', min(pm_count), 'documents')

    if stemmed:
        stemmer = PorterStemmer()
        stemmed_corpus = list()
        for doc in df['document']:
            doc = doc.replace('.', '').replace(',', '').replace(';', '').replace(':', '')
            stemmed_corpus.append(' '.join([stemmer.stem(x) for x in doc.split()]))
        corpus = stemmed_corpus
    else:
        corpus = df['document']

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=1.0, min_df=2, strip_accents='ascii', stop_words='english',
                                 token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
    dtm = vectorizer.fit_transform(corpus)
    print('DTM shape:', dtm.shape)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides. The
    # terms are converted to plain str so the printed output looks the same
    # with either accessor.
    list_features = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    reference_vocab = [str(term) for term in list_features()]
    wordfreq = dict(zip(reference_vocab, vectorizer.idf_))
    print('Top 20 words')
    # Ascending IDF: the 20 terms with the lowest inverse document frequency.
    print(sorted(wordfreq.items(), key=lambda x: x[1])[:20])
    print(len(reference_vocab), 'words')

    X, y = dtm, df['cell_id']
    # Estimator used only to rank features for the selection step.
    selector_estimator = LinearSVC(penalty='l2', multi_class='ovr', random_state=42)
    # NOTE(review): the selector is fit on all rows before cross-validation,
    # so the held-out folds take part in feature selection — confirm this is intended.
    X_new = SelectFromModel(estimator=selector_estimator, threshold='median').fit_transform(X, y)
    print(X.shape, X_new.shape)

    # Classifier that is actually cross-validated (and returned under 'clf').
    clf = LinearSVC(penalty="l2", dual=False, tol=1e-3, multi_class='ovr', C=1)
    print('=' * 80)

    scoring = ['precision_macro', 'precision_micro', 'recall_macro', 'recall_micro']
    # NOTE(review): `groups` only matters for group-aware CV splitters; with
    # the integer cv used here it is presumably ignored — confirm.
    results = cross_validate(estimator=clf, X=X_new, y=y, cv=10, n_jobs=-1, verbose=0,
                             return_estimator=True, return_train_score=True, scoring=scoring, groups=df['cell_id'])
    print(results)
    return {'clf': clf, 'results': results, 'pm_count': pm_count}


# Evaluate the pipeline for document cutoffs 10, 20, ..., 100. Results are
# written into the existing results_min_ref dict, overwriting equal keys.
for min_ref in range(10, 101, 10):
    print('_' * 80)
    print(min_ref, 'minimum references')
    results_min_ref[min_ref] = min_ref_pipeline(df=reference_df, min_references=min_ref, stemmed=False)


# Number of retained cell lines per document cutoff.
# y is built by walking the sorted x, not the dict's own key order:
# results_min_ref may have been filled in a non-sorted order, which would
# pair counts with the wrong cutoff.
x = sorted(results_min_ref)
y = [len(results_min_ref[min_ref]['pm_count']) for min_ref in x]
min_ref_pd = pd.DataFrame(data={'Minimum number of documents': x, 'Number of cell lines': y})
plt.figure(figsize=(10, 6))
plt.ylabel('Number of cell lines')
plt.xlabel('Minimum number of documents')
plt.scatter(x, y, linestyle='None', marker='x')
plt.savefig('data/img/min_ref.png')
plt.show()
plt.close()

plt.figure(figsize=(10, 6))
plt.ylabel('Averaged Micro Precision')
plt.xlabel('Minimum number of documents')

import numpy as np

# Mean and standard deviation of the micro precision across the CV folds, per cutoff.
# Explicit markers are used instead of markers[i], which depended on a loop
# index left over from an earlier cell.
x = sorted(results_min_ref)
y = [np.average(results_min_ref[min_ref]['results']['train_precision_micro']) for min_ref in x]
e = [np.std(results_min_ref[min_ref]['results']['train_precision_micro']) for min_ref in x]
plt.errorbar(x, y, e, linestyle='None', marker='^', label='Train dataset', capsize=3)

y = [np.average(results_min_ref[min_ref]['results']['test_precision_micro']) for min_ref in x]
e = [np.std(results_min_ref[min_ref]['results']['test_precision_micro']) for min_ref in x]
plt.errorbar(x, y, e, linestyle='None', marker='o', label='Test dataset', capsize=3)

plt.ylim(0, 1)
plt.legend(loc='upper left')
plt.savefig('data/img/precision_min_ref.png')
plt.show()
plt.close()

Now we define a pipeline for calculating Micro-Averaged precision while varying the importance threshold used for feature selection (a multiple of the median feature importance).
We plot micro-averaged precision for Non-Stemmed corpus.


In [None]:
from sklearn.svm import LinearSVC

def min_threshold_pipeline(df, min_references, stemmed=False):
    """Cross-validate a LinearSVC for several SelectFromModel thresholds.

    Keeps the cell lines with at least ``min_references`` and fewer than 500
    documents, vectorizes the ``document`` column with TF-IDF and, for each
    threshold in ``0.5*median``, ``median``, ``1.5*median`` and ``2*median``,
    selects features with ``SelectFromModel`` and evaluates a ``LinearSVC``
    with ``cross_validate`` (``cv=10``).

    Args:
        df: DataFrame with at least the columns ``cell_id`` (class label)
            and ``document`` (text).
        min_references: Minimum number of documents a cell line needs to be kept.
        stemmed: When true, remove ``. , ; :`` and Porter-stem every
            whitespace-separated token before vectorizing.

    Returns:
        dict with a single key ``'results'`` mapping each threshold string
        to the corresponding ``cross_validate`` output.
    """
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    df = df[df.cell_id.isin(pm_count[pm_count >= min_references].index)]
    df = df[df.cell_id.isin(pm_count[pm_count < 500].index)]
    print(df.shape[0], 'abstracts in corpus')
    pm_count = df.groupby(by=['cell_id'])['cell_id'].count()
    print(pm_count.shape[0], 'cell lines')
    print('Every cell line has at least', min(pm_count), 'documents')

    if stemmed:
        stemmer = PorterStemmer()
        stemmed_corpus = list()
        for doc in df['document']:
            doc = doc.replace('.', '').replace(',', '').replace(';', '').replace(':', '')
            stemmed_corpus.append(' '.join([stemmer.stem(x) for x in doc.split()]))
        corpus = stemmed_corpus
    else:
        corpus = df['document']

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=1.0, min_df=2, strip_accents='ascii', stop_words='english',
                                 token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
    dtm = vectorizer.fit_transform(corpus)
    print('DTM shape:', dtm.shape)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides. The
    # terms are converted to plain str so the printed output looks the same
    # with either accessor.
    list_features = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    reference_vocab = [str(term) for term in list_features()]
    wordfreq = dict(zip(reference_vocab, vectorizer.idf_))
    print('Top 20 words')
    # Ascending IDF: the 20 terms with the lowest inverse document frequency.
    print(sorted(wordfreq.items(), key=lambda x: x[1])[:20])
    print(len(reference_vocab), 'words')

    X, y = dtm, df['cell_id']
    # The selection estimator has its own name so that every threshold is
    # evaluated with the same selector configuration. Previously `clf` was
    # reassigned inside the loop, so thresholds after the first selected
    # features with the evaluation classifier instead.
    selector_estimator = LinearSVC(penalty='l2', multi_class='ovr', random_state=42)
    thresholds = ['0.5*median', 'median', '1.5*median', '2*median']
    scoring = ['precision_macro', 'precision_micro', 'recall_macro', 'recall_micro']
    results = dict()
    for threshold in thresholds:
        print(threshold)
        # NOTE(review): the selector is fit on all rows before cross-validation,
        # so the held-out folds take part in feature selection — confirm this is intended.
        X_new = SelectFromModel(estimator=selector_estimator, threshold=threshold).fit_transform(X, y)
        print(X.shape, X_new.shape)

        # Classifier that is actually cross-validated for this threshold.
        clf = LinearSVC(penalty="l2", dual=False, tol=1e-3, multi_class='ovr', C=1)
        print('=' * 80)

        # NOTE(review): `groups` only matters for group-aware CV splitters; with
        # the integer cv used here it is presumably ignored — confirm.
        results[threshold] = cross_validate(estimator=clf, X=X_new, y=y, cv=10, n_jobs=-1, verbose=0,
                                 return_estimator=True, return_train_score=True, scoring=scoring, groups=df['cell_id'])
    return {'results': results}


# Evaluate the threshold pipeline for document cutoffs 60, 70, ..., 100.
results_min_ref_ = {}
for min_ref in range(60, 101, 10):
    print('_' * 80)
    print(min_ref, 'minimum references')
    results_min_ref_[min_ref] = min_threshold_pipeline(df=reference_df, min_references=min_ref, stemmed=False)


plt.figure(figsize=(8, 6))
plt.ylabel('Averaged Micro Precision')
plt.xlabel('Minimum number of documents')
markers = ['^', '.', 'v', 's']
thresholds = ['0.5*median', 'median', '1.5*median', '2*median']
labels = [' (threshold = 0.5*median)', ' (threshold = median)', ' (threshold = 1.5*median)', ' (threshold = 2*median)']

x = sorted(results_min_ref_)
# One train series and one test series per threshold; each point is the mean
# micro precision across the CV folds, with its standard deviation as error bar.
for threshold, label_suffix in zip(thresholds, labels):
    runs = [results_min_ref_[min_ref]['results'][threshold] for min_ref in x]

    y = [np.average(run['train_precision_micro']) for run in runs]
    e = [np.std(run['train_precision_micro']) for run in runs]
    plt.errorbar(x, y, e, linestyle='None', marker='^', label='Train dataset'+label_suffix, capsize=3)

    y = [np.average(run['test_precision_micro']) for run in runs]
    e = [np.std(run['test_precision_micro']) for run in runs]
    plt.errorbar(x, y, e, linestyle='None', marker='o', label='Test dataset'+label_suffix, capsize=3)

plt.ylim(0.2, 1)
plt.legend(loc='lower left')

plt.savefig('data/img/precision_min_ref_threshold.png')
plt.close()