In [1]:
import sqlite3
import datetime as datetime
import os
import itertools
import numpy as np
import pandas as pd
conn = sqlite3.connect("testDB.db")

In [17]:
abstract=pd.read_sql_query("select doi,abstract from trial_10000;", conn)

In [18]:
abstract.head()

Unnamed: 0,doi,abstract
0,10.1371/journal.pone.0000100,BackgroundMeasuring perceptual judgments about...
1,10.1371/journal.pone.0000008,Background“Explosive” adaptive radiations on i...
2,10.1371/journal.pone.0000061,Reliable and comprehensive maps of molecular p...
3,10.1371/journal.pone.0000094,The transcriptional response to exogenously su...
4,10.1371/journal.pone.0000011,BackgroundDrug treatment is becoming more expe...


In [21]:
abstract['abstract']=abstract['abstract'].str.replace('\d+', '') # for digits
abstract['abstract']=abstract['abstract'].str.replace(r'(\b\w{1,2}\b)', '') # for words
abstract['abstract']=abstract['abstract'].str.replace('Background', '')

In [22]:
abstract.head()

Unnamed: 0,doi,abstract
0,10.1371/journal.pone.0000100,Measuring perceptual judgments about stimuli w...
1,10.1371/journal.pone.0000008,“Explosive” adaptive radiations islands remai...
2,10.1371/journal.pone.0000061,Reliable and comprehensive maps molecular pat...
3,10.1371/journal.pone.0000094,The transcriptional response exogenously supp...
4,10.1371/journal.pone.0000011,Drug treatment becoming more expensive due t...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english',)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

vectorizer_s = StemmedCountVectorizer(min_df=3, analyzer="word", stop_words='english')

In [23]:
vectorizer_s.fit(abstract['abstract'])

StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=3,
            ngram_range=(1, 1), preprocessor=None, stop_words='english',
            strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
            tokenizer=None, vocabulary=None)

In [24]:
vectorizer_s.get_feature_names()

['aa',
 'aaa',
 'aac',
 'aag',
 'aav',
 'ab',
 'aba',
 'abandon',
 'abas',
 'abbrevi',
 'abc',
 'abca',
 'abcb',
 'abcc',
 'abcg',
 'abd',
 'abdomen',
 'abdomin',
 'aberr',
 'abh',
 'abi',
 'abil',
 'abiot',
 'abl',
 'ablat',
 'abm',
 'abnorm',
 'abolish',
 'abolit',
 'abort',
 'abortus',
 'aboveground',
 'abp',
 'abr',
 'abrog',
 'abrupt',
 'abscess',
 'abscis',
 'absciss',
 'absenc',
 'absent',
 'absolut',
 'absorb',
 'absorpt',
 'absorptiometri',
 'abstin',
 'abstract',
 'abund',
 'abus',
 'ac',
 'academ',
 'acanthamoeba',
 'acasi',
 'acc',
 'acceler',
 'acceleromet',
 'accentu',
 'accept',
 'acceptor',
 'access',
 'accessori',
 'accid',
 'accident',
 'acclim',
 'acclimat',
 'accn',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accret',
 'accru',
 'accumben',
 'accumul',
 'accur',
 'accuraci',
 'ace',
 'acellular',
 'acet',
 'acetyl',
 'acetylcholin',
 'acetylcholinesteras',
 'acetylhydrolas',
 'acetyltransferas',
 'acgh',
 'ach',
 'achiev',
 'achr',
 'achromat'

In [26]:
dtm = vectorizer_s.fit_transform(abstract['abstract']).toarray()

In [27]:
vocab = np.array(vectorizer_s.get_feature_names())

In [28]:
dtm.shape

(10000, 12861)

In [29]:
len(vocab)

12861

In [30]:
from sklearn import decomposition

In [31]:
num_topics = 200
num_top_words = 200
clf = decomposition.NMF(n_components=num_topics, random_state=1)

In [32]:
doctopic = clf.fit_transform(dtm)

In [33]:
topic_words = []

for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])
  

In [34]:
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

  """Entry point for launching an IPython kernel.


In [37]:
doc_names=np.asarray(abstract['doi'])

In [38]:
print(doc_names)

['10.1371/journal.pone.0000100' '10.1371/journal.pone.0000008'
 '10.1371/journal.pone.0000061' ..., '10.1371/journal.pone.0009760'
 '10.1371/journal.pone.0009675' '10.1371/journal.pone.0009654']


In [39]:
doctopic_orig = doctopic.copy()

In [40]:
num_groups = len(set(doc_names))

In [52]:
doctopic_grouped = np.zeros((num_groups, num_topics))

In [53]:
print((doctopic))

[[  1.45811206e-02   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    7.38420101e-02   0.00000000e+00]
 [  9.70940670e-03   0.00000000e+00   1.90040213e-01 ...,   0.00000000e+00
    3.82365440e-03   0.00000000e+00]
 [  1.93659523e-04   0.00000000e+00   7.64973810e-02 ...,   1.09855992e-01
    0.00000000e+00   0.00000000e+00]
 ..., 
 [  7.94686119e-04   1.49224655e-01   1.83387934e-05 ...,   2.09832047e-03
    2.25864679e-03   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   5.11227130e-05 ...,   0.00000000e+00
    2.95165570e-02   0.00000000e+00]
 [  0.00000000e+00   8.42414490e-02   0.00000000e+00 ...,   1.08830357e-03
    2.59579935e-03   1.00048161e-02]]


In [54]:
for i, name in enumerate(sorted(set(doc_names))):
    doctopic_grouped[i, :] = np.mean(doctopic[doc_names == name, :], axis=0) 

In [55]:
print(doctopic_grouped)

[[        nan         nan         nan ...,         nan         nan
          nan]
 [        nan         nan         nan ...,         nan         nan
          nan]
 [        nan         nan         nan ...,         nan         nan
          nan]
 ..., 
 [ 0.02652607  0.          0.00604433 ...,  0.          0.          0.        ]
 [ 0.0006049   0.          0.         ...,  0.          0.00412628  0.        ]
 [ 0.          0.03845785  0.         ...,  0.          0.00155413
   0.00402253]]


In [56]:
plos_articles = sorted(set(doc_names))

In [57]:
for i in range(len(doctopic_grouped)):
    top_topics = np.argsort(doctopic_grouped[i,:])[::-1][0:10]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: {}".format(plos_articles[i], top_topics_str))

10.1371/annotation/01641ef8-cbe2-4ca7-900a-e1d12bd8557a: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/05bed72c-c6f6-4685-a732-02c78e5f66c2: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/06306df9-1db3-4e7b-a7ce-18338b655967: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/06938481-6c6e-4845-af5c-8d9c8131c4b7: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/07db26ac-e3b1-4c16-953a-dd06af38f621: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/07edbe56-d503-477f-adcf-d7ec30e9beda: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0a0b70a9-a6e0-4e60-b6bd-8ef894ed22cd: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0b1a375a-ea43-4586-824f-05bc5b359d63: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0b364095-9f93-4cb9-9a2e-aae5ed1bf362: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0c224e4f-d48d-4c12-adfa-f2afd7b9a62f: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0cc4d7c5-134f-4db0-919f-14f74dd7846e: 199 62 72 71 70 69 68 67 66 65
10.1371/annotation/0dfbcb98-872c

10.1371/journal.pone.0000507: 13 8 12 5 150 193 102 33 41 37
10.1371/journal.pone.0000508: 22 198 29 180 193 2 33 177 159 103
10.1371/journal.pone.0000509: 2 180 41 61 141 60 37 83 4 51
10.1371/journal.pone.0000510: 180 13 17 1 2 167 41 4 0 44
10.1371/journal.pone.0000511: 197 171 6 177 183 47 138 15 59 51
10.1371/journal.pone.0000512: 1 195 3 22 10 188 21 16 19 138
10.1371/journal.pone.0000513: 172 23 183 190 165 194 42 180 41 162
10.1371/journal.pone.0000514: 2 22 3 180 188 165 191 179 168 185
10.1371/journal.pone.0000515: 17 4 180 1 115 42 139 106 160 113
10.1371/journal.pone.0000516: 17 195 33 194 193 192 167 113 63 5
10.1371/journal.pone.0000517: 180 187 66 101 110 179 41 135 48 18
10.1371/journal.pone.0000518: 1 190 2 11 165 32 5 111 6 98
10.1371/journal.pone.0000519: 2 198 23 190 162 61 165 196 18 182
10.1371/journal.pone.0000520: 197 185 193 180 3 188 22 184 66 159
10.1371/journal.pone.0000521: 159 192 28 2 29 144 176 130 4 122
10.1371/journal.pone.0000522: 37 2 165 164 41 136 

10.1371/journal.pone.0002349: 193 174 194 192 33 12 179 22 132 161
10.1371/journal.pone.0002350: 1 174 125 131 130 199 150 12 179 102
10.1371/journal.pone.0002351: 1 3 185 51 22 179 145 33 191 136
10.1371/journal.pone.0002352: 184 23 2 110 177 3 195 183 5 155
10.1371/journal.pone.0002353: 1 2 188 180 167 193 192 175 153 104
10.1371/journal.pone.0002354: 189 1 193 3 2 165 127 94 22 198
10.1371/journal.pone.0002355: 1 13 171 198 153 17 40 4 33 146
10.1371/journal.pone.0002356: 1 198 17 135 141 19 22 3 153 8
10.1371/journal.pone.0002357: 1 142 33 6 151 11 192 18 184 15
10.1371/journal.pone.0002358: 186 158 197 33 190 154 4 41 109 25
10.1371/journal.pone.0002359: 176 180 190 55 3 33 136 191 50 126
10.1371/journal.pone.0002360: 1 180 6 2 116 114 25 165 159 125
10.1371/journal.pone.0002361: 1 152 192 193 2 92 6 154 12 168
10.1371/journal.pone.0002362: 17 40 13 190 33 5 107 18 181 154
10.1371/journal.pone.0002363: 166 107 40 198 175 136 18 179 190 33
10.1371/journal.pone.0002364: 187 17 92 18

10.1371/journal.pone.0003969: 13 158 181 2 169 6 135 3 125 136
10.1371/journal.pone.0003970: 192 190 193 13 191 41 23 11 196 160
10.1371/journal.pone.0003971: 176 13 1 17 11 42 165 4 22 33
10.1371/journal.pone.0003972: 188 164 41 181 185 157 39 16 60 23
10.1371/journal.pone.0003973: 1 21 10 195 19 2 187 61 133 152
10.1371/journal.pone.0003974: 1 180 170 47 2 191 154 6 59 57
10.1371/journal.pone.0003975: 5 8 21 194 2 168 41 25 133 40
10.1371/journal.pone.0003976: 2 188 194 22 3 184 18 195 39 6
10.1371/journal.pone.0003977: 17 13 163 197 169 146 157 2 15 41
10.1371/journal.pone.0003978: 179 17 180 191 161 189 145 0 165 41
10.1371/journal.pone.0003979: 198 181 193 17 197 33 21 176 191 180
10.1371/journal.pone.0003980: 37 129 18 191 33 0 7 113 51 118
10.1371/journal.pone.0003981: 23 176 174 180 17 193 5 8 41 18
10.1371/journal.pone.0003982: 181 197 176 161 190 185 98 165 187 23
10.1371/journal.pone.0003983: 39 11 165 41 184 4 42 132 19 180
10.1371/journal.pone.0003984: 181 176 132 4 172 16

10.1371/journal.pone.0005855: 1 193 175 127 158 33 37 181 12 168
10.1371/journal.pone.0005856: 177 17 10 13 5 166 18 157 70 74
10.1371/journal.pone.0005857: 1 197 11 178 17 180 188 187 132 193
10.1371/journal.pone.0005858: 125 2 169 158 13 104 136 6 101 135
10.1371/journal.pone.0005859: 86 1 40 187 89 41 4 60 139 141
10.1371/journal.pone.0005860: 27 198 154 32 18 180 98 133 41 115
10.1371/journal.pone.0005861: 190 22 17 154 179 101 18 33 3 20
10.1371/journal.pone.0005862: 193 169 22 29 195 2 59 180 159 144
10.1371/journal.pone.0005863: 190 195 127 198 192 181 184 169 84 144
10.1371/journal.pone.0005864: 179 175 158 172 113 26 100 191 73 143
10.1371/journal.pone.0005865: 191 126 4 180 179 174 167 132 187 76
10.1371/journal.pone.0005866: 141 155 193 41 5 6 152 139 103 83
10.1371/journal.pone.0005867: 1 13 3 183 189 185 197 192 141 2
10.1371/journal.pone.0005868: 3 1 17 174 197 33 145 160 48 187
10.1371/journal.pone.0005869: 183 23 4 194 193 71 50 33 31 130
10.1371/journal.pone.0005870: 1

10.1371/journal.pone.0007222: 1 198 149 177 162 187 41 42 4 9
10.1371/journal.pone.0007223: 2 162 184 181 168 22 165 3 46 89
10.1371/journal.pone.0007224: 198 76 13 157 183 197 167 1 122 42
10.1371/journal.pone.0007225: 183 132 114 119 180 2 6 165 41 163
10.1371/journal.pone.0007226: 2 189 180 1 11 17 10 6 157 133
10.1371/journal.pone.0007227: 187 195 1 42 3 2 175 177 25 153
10.1371/journal.pone.0007228: 13 174 180 188 132 184 33 167 55 161
10.1371/journal.pone.0007229: 21 17 190 1 159 50 41 19 170 187
10.1371/journal.pone.0007230: 190 2 180 193 194 28 0 170 6 16
10.1371/journal.pone.0007231: 185 194 187 160 6 1 2 193 111 150
10.1371/journal.pone.0007232: 17 141 13 180 187 193 33 181 3 2
10.1371/journal.pone.0007233: 198 170 167 188 2 4 33 3 165 83
10.1371/journal.pone.0007234: 1 17 32 197 12 171 74 134 41 30
10.1371/journal.pone.0007235: 3 193 2 33 22 167 154 160 50 37
10.1371/journal.pone.0007236: 13 1 17 193 2 57 130 150 62 18
10.1371/journal.pone.0007237: 1 190 183 187 3 41 37 33 2

10.1371/journal.pone.0008793: 188 190 146 169 191 61 41 39 152 25
10.1371/journal.pone.0008794: 198 1 3 189 180 154 193 4 187 167
10.1371/journal.pone.0008795: 23 180 169 36 194 5 193 177 16 22
10.1371/journal.pone.0008796: 193 8 191 25 33 156 41 39 5 18
10.1371/journal.pone.0008797: 193 109 187 33 113 41 184 66 22 179
10.1371/journal.pone.0008798: 2 162 160 197 66 190 11 174 104 4
10.1371/journal.pone.0008799: 193 188 13 1 3 4 44 2 33 167
10.1371/journal.pone.0008800: 179 188 180 167 18 130 61 147 63 16
10.1371/journal.pone.0008801: 180 195 170 5 36 25 179 143 144 61
10.1371/journal.pone.0008802: 27 160 2 193 32 198 197 190 22 12
10.1371/journal.pone.0008803: 27 2 186 174 194 188 193 150 185 60
10.1371/journal.pone.0008804: 166 5 17 174 21 154 1 150 195 186
10.1371/journal.pone.0008805: 1 21 138 17 8 76 4 5 160 159
10.1371/journal.pone.0008806: 197 180 115 130 29 161 183 137 146 26
10.1371/journal.pone.0008807: 199 187 178 17 196 61 13 60 18 33
10.1371/journal.pone.0008808: 173 1 23 1

In [58]:
 for t in range(len(topic_words)):
        print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

Topic 0: differ compar similar higher distinct variabl variat reflect comparison vari determin main profil code explain
Topic 1: cell prolifer cycl cellular epitheli cultur germ subset cytotox polar label lymphocyt effector cytometri divis
Topic 2: gene microarray encod involv cluster duplic profil candid set delet hox known epigenet biolog enrich
Topic 3: protein proteom encod fluoresc fold similar secret includ degrad gfp involv yeast fusion eukaryot abund
Topic 4: activ stimul enzym erk enhanc modul demonstr cortex enzymat physic indic mapk block presenc cortic
Topic 5: infect hcv transmiss persist infecti post mosquito suscept prion inocul acut uninfect rsv aureus occur
Topic 6: express profil microarray normal mrna transgen pcr mous tast bud upregul isoform quantit real ectop
Topic 7: telomer length telomeras shorten caus defici repeat determin fibroblast normal htert radiat attrit independ repair
Topic 8: hiv aid preval transmiss partner art prevent antiretrovir coupl count hsv i

In [67]:
basket=[]
for i in range(len(doctopic_grouped)):
    top_topics = np.argsort(doctopic_grouped[i,:])[::-1][0:10]
    basket.append(list(top_topics))
print(basket)

[[199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 72, 71, 70, 69, 68, 67, 66, 65], [199, 62, 

In [76]:
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

oht = OnehotTransactions()
oht_ary = oht.fit(basket).transform(basket)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.4157,[1]
1,0.3074,[2]
2,0.2013,[3]
3,0.1779,[4]
4,0.1209,[5]
5,0.1692,[6]
6,0.1364,[11]
7,0.0755,[12]
8,0.1877,[13]
9,0.0537,[16]


In [77]:
from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(3),(1),0.2013,0.542474,1.304965
1,(4),(1),0.1779,0.535132,1.287304
2,(6),(1),0.1692,0.632388,1.52126
3,(11),(1),0.1364,0.710411,1.70895
4,(167),(1),0.15,0.502667,1.209205
5,(175),(1),0.0819,0.747253,1.797577
6,(187),(1),0.1671,0.513465,1.235182
7,(6),(2),0.1692,0.599882,1.95147
8,"(1, 6)",(2),0.107,0.565421,1.839364
9,"(2, 6)",(1),0.1015,0.596059,1.433868
