## Exploratory Analysis of the 'taste' abstract corpus

This part of the analysis is purely exploratory and not required for the rest of the project.

In [1]:
import sys, numpy, math, sqlite3

sys.path.append('/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages')
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem.porter import *

In [24]:
## get all pmids from database
def getallpmids(database):

    conn = sqlite3.connect(database)
    c = conn.cursor()
    c.execute('''SELECT pmid FROM articles''')
    pmids = c.fetchall()
    conn.close()
    return pmids

## get all pmids from database and write them to textfile
def writeallpmidstofile(database,filename):
    idlist = getallpmids(database)
    with open(filename, 'w') as idfile:
        for item in idlist:
            idfile.write("%s\n" % item)

def readpmidsfromfile(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
    return lines

## get all pmids from database and write them to textfile
def writeallpmidstofile(database,filename):
    idlist = getallpmids(database)
    with open(filename, 'w') as idfile:
        for item in idlist:
            idfile.write("%s\n" % item)

def readpmidsfromfile(filename):
    with open(filename) as f:
        lines = f.read().splitlines()
    return lines

def tokenize(text):
    import nltk
    import re
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    
    # filter out non-letter characters (e.g., numeric tokens, raw punctuation)
    for i,token in enumerate(tokens):
        tokens[i] = re.sub('[^a-zA-Z]', '', token)
        
    ## remove strings with one letter or less
    filtered_tokens = [i for i in tokens if len(i) > 1]
    
    
    return filtered_tokens

## from a list of ids concatenate all titles and abstracts from database taste.db as one word list
def catabstracts(idlist):

    conn = sqlite3.connect('taste.db')
    c = conn.cursor()

    alltokenized = []

    for pmid in idlist:
        c.execute('''SELECT title,abstract FROM articles WHERE pmid = (?)''', (pmid,))
        result = c.fetchone()
        result = " ".join(result)
        alltokenized.extend(tokenize(result))

    conn.close()
    return alltokenized

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        try:
            stemmed.append(stemmer.stem(item))
        except:
            print(item, "failed")
    return stemmed


In [10]:
allpmids = readpmidsfromfile('allpmids.txt')
alltokenized = catabstracts(allpmids)

In [20]:
regex = re.compile('[^a-zA-Z]')
for token in alltokenized:
    regex.sub('',token)
alltokenized = [i for i in alltokenized if len(i) > 1] 


In [25]:
filtered = [w for w in alltokenized if not w in stopwords.words('english')]

stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
mystopwords = ['aim','study,','purpose','objective','results','present','evaluate','determine','investigate','known','important','role','basis','significant','difference','discussed','terms','found','effect','using','associated','suggest','presence','order','discussion','background','introduction','method']
mystopwords = stem_tokens(mystopwords,stemmer)

aeds failed
aeds failed
aeds failed
aed failed
aeds failed


In [26]:
mystopwords = ['aim','study,','purpose','objective','results','present','evaluate','determine','investigate','known','important','role','basis','significant','difference','discussed','terms','found','effect','using','associated','suggest','presence','order','discussion','background','introduction','method']
mystopwords = stem_tokens(mystopwords,stemmer)

In [28]:
stemmed = [w for w in stemmed if not w in mystopwords]


In [31]:
import nltk
text = nltk.Text(stemmed)

In [32]:
bigrams = ngrams(text, 2)
trigrams = ngrams(text, 3)
tetragrams = ngrams(text, 4)
pentagrams = ngrams(text, 5)


In [35]:
from nltk import FreqDist
fdist1 = FreqDist(text)
fdist2 = FreqDist(bigrams)
fdist3 = FreqDist(trigrams)
fdist4 = FreqDist(tetragrams)
fdist5 = FreqDist(pentagrams)

In [42]:
fdist1.most_common(50)

[('tast', 52108),
 ('studi', 20741),
 ('food', 17850),
 ('rat', 15521),
 ('respons', 15292),
 ('cell', 13588),
 ('increas', 12381),
 ('condit', 12025),
 ('sensori', 11544),
 ('receptor', 11502),
 ('acid', 11265),
 ('patient', 10968),
 ('test', 10783),
 ('activ', 10673),
 ('show', 9475),
 ('prefer', 9173),
 ('avers', 9041),
 ('intak', 8993),
 ('group', 8861),
 ('also', 8686),
 ('concentr', 8504),
 ('control', 8462),
 ('may', 8226),
 ('function', 8067),
 ('flavor', 7947),
 ('sweet', 7587),
 ('two', 7572),
 ('experi', 7569),
 ('treatment', 7515),
 ('water', 7501),
 ('product', 7409),
 ('bitter', 7401),
 ('compar', 7333),
 ('solut', 7296),
 ('level', 7080),
 ('qualiti', 7036),
 ('chang', 7032),
 ('develop', 6897),
 ('neuron', 6602),
 ('compound', 6542),
 ('system', 6493),
 ('protein', 6458),
 ('process', 6392),
 ('sucros', 6369),
 ('behavior', 6315),
 ('gustatori', 6237),
 ('oral', 6103),
 ('howev', 5937),
 ('express', 5818),
 ('day', 5807)]

In [43]:
fdist2.most_common(50)

[(('tast', 'bud'), 5306),
 (('tast', 'avers'), 4051),
 (('tast', 'receptor'), 2989),
 (('condit', 'tast'), 2827),
 (('bitter', 'tast'), 2402),
 (('tast', 'cell'), 2120),
 (('amino', 'acid'), 2034),
 (('food', 'intak'), 2025),
 (('sweet', 'tast'), 1990),
 (('fatti', 'acid'), 1718),
 (('chorda', 'tympani'), 1369),
 (('receptor', 'cell'), 1141),
 (('tast', 'stimuli'), 1006),
 (('tast', 'percept'), 997),
 (('fungiform', 'papilla'), 967),
 (('bodi', 'weight'), 967),
 (('tast', 'respons'), 934),
 (('sensori', 'properti'), 813),
 (('studi', 'examin'), 809),
 (('tast', 'prefer'), 772),
 (('tast', 'smell'), 770),
 (('sensori', 'attribut'), 749),
 (('tast', 'qualiti'), 738),
 (('tast', 'sensit'), 724),
 (('oral', 'caviti'), 722),
 (('citric', 'acid'), 715),
 (('avers', 'cta'), 714),
 (('sensori', 'characterist'), 712),
 (('qualiti', 'life'), 705),
 (('volatil', 'compound'), 696),
 (('sucros', 'solut'), 682),
 (('sodium', 'chlorid'), 678),
 (('sensori', 'qualiti'), 672),
 (('drink', 'water'), 655

In [39]:
fdist3.most_common(50)

[(('condit', 'tast', 'avers'), 2530),
 (('tast', 'receptor', 'cell'), 751),
 (('tast', 'avers', 'cta'), 707),
 (('tast', 'bud', 'cell'), 602),
 (('chorda', 'tympani', 'nerv'), 588),
 (('bitter', 'tast', 'receptor'), 576),
 (('nucleu', 'solitari', 'tract'), 515),
 (('tast', 'avers', 'learn'), 402),
 (('sweet', 'tast', 'receptor'), 392),
 (('lactic', 'acid', 'bacteria'), 367),
 (('central', 'nervou', 'system'), 326),
 (('princip', 'compon', 'analysi'), 253),
 (('head', 'neck', 'cancer'), 251),
 (('cell', 'tast', 'bud'), 242),
 (('abstract', 'truncat', 'word'), 234),
 (('magnet', 'reson', 'imag'), 228),
 (('free', 'fatti', 'acid'), 221),
 (('free', 'amino', 'acid'), 213),
 (('type', 'iii', 'cell'), 192),
 (('condit', 'stimulu', 'cs'), 191),
 (('scan', 'electron', 'microscopi'), 190),
 (('fatti', 'acid', 'composit'), 180),
 (('bodi', 'mass', 'index'), 180),
 (('condit', 'tast', 'avoid'), 178),
 (('transient', 'receptor', 'potenti'), 171),
 (('monosodium', 'glutam', 'msg'), 169),
 (('lithiu

In [40]:
fdist4.most_common(50)

[(('condit', 'tast', 'avers', 'cta'), 701),
 (('function', 'magnet', 'reson', 'imag'), 139),
 (('condit', 'tast', 'avers', 'rat'), 113),
 (('food', 'intak', 'bodi', 'weight'), 112),
 (('condit', 'tast', 'avers', 'paradigm'), 103),
 (('nucleu', 'solitari', 'tract', 'nst'), 98),
 (('acquisit', 'condit', 'tast', 'avers'), 89),
 (('produc', 'condit', 'tast', 'avers'), 87),
 (('induc', 'condit', 'tast', 'avers'), 87),
 (('lactic', 'acid', 'bacteria', 'lab'), 83),
 (('extinct', 'condit', 'tast', 'avers'), 82),
 (('ethanol-induc', 'condit', 'tast', 'avers'), 75),
 (('princip', 'compon', 'analysi', 'pca'), 75),
 (('sweet', 'salti', 'sour', 'bitter'), 74),
 (('tast', 'avers', 'cta', 'paradigm'), 73),
 (('rostral', 'nucleu', 'solitari', 'tract'), 72),
 (('nucleu', 'solitari', 'tract', 'nt'), 72),
 (('condit', 'tast', 'avers', 'learn'), 68),
 (('bitter', 'tast', 'receptor', 'gene'), 68),
 (('von', 'ebner', "'s", 'gland'), 67),
 (('bodi', 'mass', 'index', 'bmi'), 67),
 (('condit', 'tast', 'avers',

In [41]:
fdist5.most_common(50)

[(('condit', 'tast', 'avers', 'cta', 'paradigm'), 71),
 (('function', 'magnet', 'reson', 'imag', 'fmri'), 64),
 (('condit', 'tast', 'avers', 'cta', 'learn'), 35),
 (('latent', 'inhibit', 'condit', 'tast', 'avers'), 35),
 (('acquisit', 'condit', 'tast', 'avers', 'cta'), 33),
 (('aroma', 'extract', 'dilut', 'analysi', 'aeda'), 30),
 (('induc', 'condit', 'tast', 'avers', 'cta'), 28),
 (('transient', 'receptor', 'potenti', 'trp', 'channel'), 24),
 (('reduc', 'food', 'intak', 'bodi', 'weight'), 23),
 (('condit', 'tast', 'avers', 'cta', 'rat'), 21),
 (('thiobarbitur', 'acid', 'reactiv', 'substanc', 'tbar'), 21),
 (('tast', 'bud', 'rat', 'circumval', 'papilla'), 20),
 (('condit', 'tast', 'avers', 'cta', 'procedur'), 19),
 (('receptor', 'potenti', 'frog', 'tast', 'cell'), 19),
 (('tast', 'sweet', 'salti', 'sour', 'bitter'), 19),
 (('high', 'perform', 'liquid', 'chromatographi', 'hplc'), 19),
 (('univers', 'pennsylvania', 'smell', 'identif', 'test'), 19),
 (('rostral', 'nucleu', 'solitari', 'tr

In [44]:
text.collocations()

tast bud; chorda tympani; amino acid; tast avers; fatti acid;
fungiform papilla; tast receptor; food intak; bitter tast; condit
tast; bodi weight; solitari tract; oral caviti; shelf life; sweet
tast; citric acid; lithium chlorid; insular cortex; sodium chlorid;
head neck
