## Processing of 'taste' abstracts for Latent Semantic Analysis

A number of functions are defined that pull the abstracts from the SQLite database, then tokenize, lowercase, stem and filter them for stopwords (both from nltk.corpus and self-defined).

In [4]:
import sys, numpy, math, sqlite3, random

sys.path.append('/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages')
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem.porter import *

In [33]:
## get all pmids from database
def getallpmids(database):
    """Return every PubMed ID stored in the ``articles`` table.

    Args:
        database: Path of the SQLite database file to open.

    Returns:
        The rows exactly as ``fetchall()`` delivers them: a list of
        1-tuples ``(pmid,)``, one per row of ``articles``.

    Raises:
        sqlite3.Error: if the query fails (e.g. the table does not exist).
    """
    conn = sqlite3.connect(database)
    try:
        # Connection.execute creates the cursor implicitly.
        return conn.execute('''SELECT pmid FROM articles''').fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

## get all pmids from database and write them to textfile
def writeallpmidstofile(database,filename):
    """Dump all PubMed IDs found in *database* to *filename*, one per line.

    The IDs are fetched with ``getallpmids`` before the output file is
    opened, and each fetched row is %-formatted onto its own line.
    """
    rows = getallpmids(database)
    with open(filename, 'w') as out:
        out.writelines("%s\n" % row for row in rows)

def readpmidsfromfile(filename):
    """Read *filename* and return its lines as a list of strings.

    Line terminators are removed by ``str.splitlines``, so a file holding
    one ID per line yields one ID string per list element.
    """
    with open(filename) as handle:
        content = handle.read()
    return content.splitlines()

def tokenize(text):
    """Split *text* into lower-cased word tokens.

    The text is segmented into sentences with ``nltk.sent_tokenize``; each
    sentence is split with ``nltk.word_tokenize`` and every resulting token
    is lower-cased. Returns one flat list covering all sentences.
    """
    import nltk

    words = []
    # first tokenize by sentence, then by word
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            words.append(word.lower())
    return words


    
## from a list of ids concatenate all titles and abstracts from the database (default taste.db) as one word list
def gettokenizedabstracts(idlist, database='taste.db'):
    """Tokenize title + abstract for every PubMed ID in *idlist*.

    Args:
        idlist: Iterable of PubMed IDs; each one is bound as the single
            parameter of a ``WHERE pmid = ?`` lookup and used as dict key.
        database: Path of the SQLite database holding the ``articles``
            table. Defaults to ``'taste.db'``, the file that used to be
            hard-coded here.

    Returns:
        dict mapping each pmid that was found to the token list produced
        by ``tokenize`` for the title and abstract joined by a space.
        IDs without a row in ``articles`` are left out; a NULL title or
        abstract is ignored when joining.
    """
    abstracts = dict()

    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()
        for pmid in idlist:
            c.execute('''SELECT title,abstract FROM articles WHERE pmid = (?)''', (pmid,))
            row = c.fetchone()
            if row is None:
                # Unknown pmid: nothing to tokenize. Joining None used to
                # abort the whole run with a TypeError.
                continue
            # NULL columns arrive as None and cannot be joined.
            text = " ".join(part for part in row if part is not None)
            abstracts[pmid] = tokenize(text)
    finally:
        # Close the connection even if a lookup or tokenization fails.
        conn.close()

    return abstracts

def stem_filter(abstracts):
    """Clean, stem and stopword-filter tokenized abstracts.

    For every token the following steps are applied in order:

    1. strip every character that is not an ASCII letter,
    2. drop the token if fewer than two letters remain,
    3. drop it if it is one of NLTK's English stopwords,
    4. stem it with the Porter stemmer (a token whose stemming raises is
       printed with the suffix "failed" and kept unstemmed),
    5. drop it if it is one of the custom stopwords, which are listed in
       stemmed form.

    Args:
        abstracts: dict mapping pmid -> list of token strings.

    Returns:
        A new dict mapping each pmid to its filtered list of stems. The
        input dict and its token lists are left unmodified.
    """
    # Imported locally: the visible file-level imports do not include ``re``,
    # so the name was presumably only reachable as a side effect of
    # ``from nltk.stem.porter import *``.
    import re

    non_letters = re.compile('[^a-zA-Z]')
    stemmer = PorterStemmer()
    # Fetch the stopword list once instead of once per token, and use sets
    # so that each membership test is a hash lookup.
    english_stopwords = frozenset(stopwords.words('english'))
    mystopwords = frozenset([
        'show', 'control', 'compar', 'group', 'aim', 'examin', 'studi',
        'purpos', 'object', 'result', 'present', 'evalu', 'determin',
        'investig', 'known', 'import', 'role', 'basi', 'signific', 'differ',
        'discuss', 'term', 'found', 'effect', 'use', 'associ', 'suggest',
        'presenc', 'order', 'background', 'introduct', 'method',
    ])

    filtered_abstracts = dict()

    for pmid, abstract in abstracts.items():
        stems = []
        for token in abstract:
            ## filter out non-alphabetic characters
            word = non_letters.sub('', token)

            ## remove strings with one letter or less and regular stopwords
            if len(word) <= 1 or word in english_stopwords:
                continue

            ## stem
            try:
                word = stemmer.stem(word)
            except Exception:
                # Best effort: report the token and keep it unstemmed.
                print(word, "failed")

            ## remove custom stopwords
            if word not in mystopwords:
                stems.append(word)

        filtered_abstracts[pmid] = stems

    return filtered_abstracts


## Exploratory test on smaller data set

A subset of 500 abstracts is created, tokenized, stemmed and filtered, and a TF-IDF matrix is generated with TfidfVectorizer from the sklearn package.

In [5]:
# Draw 500 PMIDs at random from the full ID list and save the sample,
# one ID per line.
allpmids = readpmidsfromfile('allpmids.txt')
pmidsub1 = random.sample(allpmids, 500)
with open('pmidsub1.txt', 'w') as idfile:
    idfile.writelines("%s\n" % pmid for pmid in pmidsub1)

In [7]:
# Tokenize title + abstract of the sampled articles (dict: pmid -> token list).
dict1 = gettokenizedabstracts(pmidsub1)

In [34]:
# Clean, stem and stopword-filter the sampled abstracts (dict: pmid -> stems).
filtered1 = stem_filter(dict1)



In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The "documents" handed to this vectorizer are the keys of filtered1; the
# tokenizer swaps each key for its pre-computed token list.
TFIDF = TfidfVectorizer(
      # documents are passed in directly rather than as file names
      input='content',
      # leave the keys untouched so the dictionary lookup below still works
      lowercase=False,
      preprocessor=lambda x: x,
      # look the tokens up in our dictionary instead of tokenizing text
      tokenizer=lambda key: filtered1[key],
      ngram_range=(1,5),
      use_idf=True,
      min_df=0.01,
      max_df=0.8,
      max_features=200000)



In [35]:
# Fit on filtered1 (iterating the dict yields its keys, which the tokenizer
# maps to token lists) and time the call with the IPython %time magic.
%time matrix1 = TFIDF.fit_transform(filtered1) 

print(matrix1.shape)

CPU times: user 609 ms, sys: 21.9 ms, total: 631 ms
Wall time: 632 ms
(500, 1523)


In [36]:
# Show the terms (1- to 5-grams, per ngram_range) kept by the vectorizer.
TFIDF.get_feature_names()


['abil',
 'abl',
 'abnorm',
 'absenc',
 'absolut',
 'abstract',
 'accept',
 'access',
 'accompani',
 'accord',
 'account',
 'accumul',
 'acet',
 'achiev',
 'acid',
 'acid bacteria',
 'acid composit',
 'acid tast',
 'acquir',
 'acquisit',
 'across',
 'act',
 'action',
 'activ',
 'acuiti',
 'acut',
 'ad',
 'ad libitum',
 'adapt',
 'addict',
 'addit',
 'address',
 'administ',
 'administr',
 'adolesc',
 'adult',
 'adulthood',
 'advanc',
 'advantag',
 'advers',
 'aerob',
 'affect',
 'affer',
 'age',
 'age year',
 'agent',
 'agerel',
 'aggreg',
 'agonist',
 'air',
 'airway',
 'al',
 'alcohol',
 'alcohol consumpt',
 'allow',
 'almost',
 'alon',
 'along',
 'alpha',
 'also',
 'alter',
 'alter tast',
 'altern',
 'although',
 'alway',
 'alzheim',
 'alzheim diseas',
 'american',
 'amino',
 'amino acid',
 'among',
 'amount',
 'amygdala',
 'analog',
 'analogu',
 'analys',
 'analysi',
 'analysi tast',
 'analyz',
 'anatom',
 'andor',
 'anesthet',
 'anim',
 'anoth',
 'antagonist',
 'anterior',
 'antibo

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance (1 - cosine similarity) between all rows of matrix1.
dist1 = 1 - cosine_similarity(matrix1)
print(dist1.shape)

(500, 500)


## Processing of full dataset

The same procedure is repeated with the full dataset. The TF-IDF matrix is saved in sparse Matrix Market format to a file for transfer to R. The column names (terms) and row names (Pubmed IDs) are saved as text files.

In [None]:
# Tokenize title + abstract for every PMID in allpmids (dict: pmid -> token list).
dictfull = gettokenizedabstracts(allpmids)

In [44]:
# Clean, stem and stopword-filter the full set of abstracts.
filteredfull = stem_filter(dictfull)



24361833 failed
24361833 failed
24361833 failed
24361833 failed
24361833 failed
24361833 failed
24361833 failed
24361833 failed


In [45]:
# The "documents" handed to this vectorizer are the keys of filteredfull;
# the tokenizer swaps each key for its pre-computed token list.
TFIDF = TfidfVectorizer(
      # documents are passed in directly rather than as file names
      input='content',
      # leave the keys untouched so the dictionary lookup below still works
      lowercase=False,
      preprocessor=lambda x: x,
      # look the tokens up in our dictionary instead of tokenizing text
      tokenizer=lambda key: filteredfull[key],
      ngram_range=(1,5),
      use_idf=True,
      min_df=0.01,
      max_df=0.8,
      max_features=200000)



In [46]:
# Fit on filteredfull (iterating the dict yields its keys, which the tokenizer
# maps to token lists) and time the call with the IPython %time magic.
%time matrixfull = TFIDF.fit_transform(filteredfull) 

print(matrixfull.shape)

CPU times: user 47.6 s, sys: 1.94 s, total: 49.5 s
Wall time: 49.8 s
(28690, 1299)


In [50]:
# Save the vectorizer's terms (the matrix column names), one per line.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead - confirm the target version.
termsfull = TFIDF.get_feature_names()
with open('termsfull.txt', 'w') as idfile:
    idfile.writelines("%s\n" % term for term in termsfull)

In [48]:
# Write the sparse TF-IDF matrix to disk in Matrix Market format.
# Import the submodule explicitly: a bare ``import scipy`` does not guarantee
# that ``scipy.io`` has been loaded.
import scipy.io
scipy.io.mmwrite('matrixfull.mtx', matrixfull)

In [56]:
# Number of terms written to termsfull.txt.
len(termsfull)



1299