# NMF Decomposition and Association Rule Mining
This factorization can be used for example for dimensionality reduction, source separation or topic extraction. Association Rule Mining helps us find frequently occuring itemsets (in our case, topics) and characterizes these frequent itemsets with commonly used measures of support, confidence, and lift.

## Importing libraries and establishing SQLite database connection

In [104]:
import sqlite3
import datetime as datetime
import os
import itertools
import numpy as np
import pandas as pd
conn = sqlite3.connect("testDB.db")

## Reading in data using SQL style querying

In [105]:
abstract=pd.read_sql_query("select doi,abstract from trial_10000;", conn)

In [106]:
abstract.head()

Unnamed: 0,doi,abstract
0,10.1371/journal.pone.0000100,BackgroundMeasuring perceptual judgments about...
1,10.1371/journal.pone.0000008,Background“Explosive” adaptive radiations on i...
2,10.1371/journal.pone.0000061,Reliable and comprehensive maps of molecular p...
3,10.1371/journal.pone.0000094,The transcriptional response to exogenously su...
4,10.1371/journal.pone.0000011,BackgroundDrug treatment is becoming more expe...


In [107]:
abstract['abstract']=abstract['abstract'].str.replace('\d+', '') # for digits
abstract['abstract']=abstract['abstract'].str.replace(r'(\b\w{1,2}\b)', '') # for words
abstract['abstract']=abstract['abstract'].str.replace('Background', '')

In [108]:
abstract.head()

Unnamed: 0,doi,abstract
0,10.1371/journal.pone.0000100,Measuring perceptual judgments about stimuli w...
1,10.1371/journal.pone.0000008,“Explosive” adaptive radiations islands remai...
2,10.1371/journal.pone.0000061,Reliable and comprehensive maps molecular pat...
3,10.1371/journal.pone.0000094,The transcriptional response exogenously supp...
4,10.1371/journal.pone.0000011,Drug treatment becoming more expensive due t...


## Convert a collection of text documents to a matrix of token counts

In [109]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english',)

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

vectorizer_s = StemmedCountVectorizer(min_df=3, analyzer="word", stop_words='english')

In [111]:
vectorizer_s.fit(abstract['abstract'])

StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=3,
            ngram_range=(1, 1), preprocessor=None, stop_words='english',
            strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
            tokenizer=None, vocabulary=None)

In [112]:
vectorizer_s.get_feature_names()[0:5]

['aa', 'aaa', 'aac', 'aag', 'aav']

In [113]:
dtm = vectorizer_s.fit_transform(abstract['abstract']).toarray()

In [114]:
vocab = np.array(vectorizer_s.get_feature_names())

In [115]:
dtm.shape

(10000, 12862)

In [116]:
len(vocab)

12862

## Perform Non-Negative Matrix Factorization (NMF)

In [117]:
from sklearn import decomposition

In [118]:
num_topics = 50
num_top_words = 10
clf = decomposition.NMF(n_components=num_topics, random_state=1)

In [119]:
doctopic = clf.fit_transform(dtm)

In [120]:
topic_words = []

for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])
  

In [121]:
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

  """Entry point for launching an IPython kernel.


In [122]:
doc_names=np.asarray(abstract['doi'])

In [123]:
print(doc_names)

['10.1371/journal.pone.0000100' '10.1371/journal.pone.0000008'
 '10.1371/journal.pone.0000061' ..., '10.1371/journal.pone.0009760'
 '10.1371/journal.pone.0009675' '10.1371/journal.pone.0009654']


In [124]:
doctopic_orig = doctopic.copy()

In [125]:
num_groups = len(set(doc_names))

In [126]:
doctopic_grouped = np.zeros((num_groups, num_topics))

In [127]:
for i, name in enumerate(sorted(set(doc_names))):
    doctopic_grouped[i, :] = np.mean(doctopic[doc_names == name, :], axis=0) 

In [128]:
plos_articles = sorted(set(doc_names))

## Display top 10 topics in a document, and words in a topic.

In [138]:
for i in range(len(doctopic_grouped)):
    top_topics = np.argsort(doctopic_grouped[i,:])[::-1][0:10]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    #print("{}: {}".format(plos_articles[i], top_topics_str))

In [134]:
 for t in range(len(topic_words)):
        print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

Topic 0: differenti stem cultur marker develop bone tissu embryon progenitor vitro
Topic 1: cell line prolifer stem cycl epitheli vivo cultur vitro type
Topic 2: gene identifi analysi microarray involv encod cluster profil pathway relat
Topic 3: protein membran identifi proteom complex local domain like process encod
Topic 4: result role develop suggest mutant play mechan format growth import
Topic 5: infect host immun parasit transmiss pathogen hcv viral bacteri tuberculosi
Topic 6: associ risk snps genotyp polymorph signific allel variant genet identifi
Topic 7: activ phosphoryl kinas inhibit depend enzym stimul mediat erk channel
Topic 8: hiv infect viral women aid risk art partner preval prevent
Topic 9: tumor cancer breast growth lung line prostat ovarian target progress
Topic 10: vaccin immun antibodi antigen protect dose malaria immunogen challeng bcg
Topic 11: interact network complex dynam biolog mechan host organ connect understand
Topic 12: patient clinic treatment therapi s

## Generate 'Baskets' of topics for each article. 
Here, article is the identifier, and the collection of topics acts like the shopping basket.

In [139]:
basket=[]
for i in range(len(doctopic_grouped)):
    top_topics = np.argsort(doctopic_grouped[i,:])[::-1][0:10]
    basket.append(list(top_topics))
#print(basket)

## Making use of Apriori algorithm to generate frequent itemsets with threshold as 10%.

In [137]:
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

oht = OnehotTransactions()
oht_ary = oht.fit(basket).transform(basket)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
frequent_itemsets = apriori(df, min_support=0.10, use_colnames=True)

## Applying association rules to generated frequent itemsets.

In [136]:
from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(3),(1),0.2547,0.531213,1.322084
1,(7),(1),0.2423,0.518366,1.290109
2,(21),(1),0.2006,0.517448,1.287824
3,(43),(1),0.2211,0.600633,1.494856
4,(39),(33),0.3383,0.543305,1.240988
5,(45),(33),0.227,0.520264,1.188361
6,(46),(33),0.2835,0.506526,1.156979
