In [10]:
from sklearn import pipeline, preprocessing
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn import ensemble
from sklearn import model_selection
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, NMF

from scipy import sparse

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pyLDAvis
import pyLDAvis.sklearn
from gensim import matutils

import math
import numpy as np
import pandas as pd
import csv
import os

In [4]:
def directory_list_generator(prime_directory):
    """Return the names of the non-hidden sub-directories of ``prime_directory``.

    Only immediate children are considered (no recursion). An entry is kept
    when it is a directory on disk and its name does not start with ``'.'``.

    The earlier filter (``'.' not in name``) never checked that an entry was
    a directory, so extension-less files such as ``README`` were returned,
    while real directories with a dot in their name (``v1.0``) were dropped.

    Parameters
    ----------
    prime_directory : str
        Path of the folder to scan.

    Returns
    -------
    list of str
        Bare directory names (not full paths), in ``os.listdir`` order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``prime_directory`` cannot be read.
    """
    return [
        entry
        for entry in os.listdir(prime_directory)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(prime_directory, entry))
    ]

In [5]:
def text_file_tabulator(dir_list, prime_directory=None):
    """Read every file whose name ends in ``txt`` from the given directories.

    Parameters
    ----------
    dir_list : iterable of str
        Directory names, resolved relative to ``prime_directory``.
    prime_directory : str, optional
        Folder that contains the directories in ``dir_list``. When omitted,
        the module-level variable ``prime_directory`` is used if it exists
        (this is what the function silently depended on before); if there is
        no such variable the entries of ``dir_list`` are used as paths as-is.

    Returns
    -------
    dict
        Maps each file name (without its directory) to a one-element list
        holding the file's text. Because the key is the bare file name, a
        file with the same name in a later directory overwrites the earlier
        entry.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``open`` for unreadable paths.
    """
    if prime_directory is None:
        # Backward compatibility: older callers set a notebook global instead
        # of passing the root folder explicitly.
        prime_directory = globals().get('prime_directory') or ''

    paper_content = dict()
    for dir_name in dir_list:
        # os.path.join works whether or not the root ends with a separator;
        # plain string concatenation required a trailing '/'.
        dir_path = os.path.join(prime_directory, dir_name)
        for txtfile in os.listdir(dir_path):
            if txtfile.endswith('txt'):
                with open(os.path.join(dir_path, txtfile)) as fhand:
                    paper_content[txtfile] = [fhand.read()]
    return paper_content

In [6]:
def word_counter(compiled_documents):
    """Turn a dictionary of documents into a unigram/bigram count matrix.

    Parameters
    ----------
    compiled_documents : dict
        Maps a document name to a sequence whose first element is the
        document text (the one-element lists built by
        ``text_file_tabulator`` have this form).

    Returns
    -------
    counts
        Result of ``CountVectorizer.fit_transform`` on the document texts.
    count_vectorizer : CountVectorizer
        The fitted vectoriser.
    text_data : numpy.ndarray
        Array built from the dictionary values; column 0 holds the texts.
    """
    # One row per document. Column 0 is indexed below, so the values must be
    # equal-length sequences for the array to be two-dimensional.
    text_data = np.array(list(compiled_documents.values()))

    # scikit-learn documents ``stop_words`` as 'english', a list or None, and
    # recent releases reject the frozenset that ``.union`` returns, so a list
    # is passed instead.
    my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(['cid']))

    count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                       stop_words=my_stop_words,
                                       token_pattern="\\b[a-z][a-z]+\\b")
    counts = count_vectorizer.fit_transform(text_data[:, 0])
    return counts, count_vectorizer, text_data

In [103]:
def word_counter_fromCSV(df, ngram_range=(2, 3), min_df=5, max_df=30):
    """Vectorise the first column of ``df`` into an n-gram count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (column 0 of ``df.values``) holds one
        document's text per row.
    ngram_range : tuple of int, optional
        ``(min_n, max_n)`` handed to ``CountVectorizer``. Default ``(2, 3)``.
    min_df, max_df : int or float, optional
        Document-frequency bounds handed to ``CountVectorizer``.
        Defaults ``5`` and ``30``.

    Returns
    -------
    counts
        Result of ``CountVectorizer.fit_transform`` on the documents.
    count_vectorizer : CountVectorizer
        The fitted vectoriser.
    text_data : numpy.ndarray
        ``df.values``, unchanged.
    """
    text_data = df.values

    # scikit-learn documents ``stop_words`` as 'english', a list or None, and
    # recent releases reject the frozenset that ``.union`` returns, so a list
    # is passed instead.
    # The token pattern below only produces 3-16 letter tokens, so the
    # shorter entries ('et', 'al', 'et al') can never match; they are kept
    # only so the stop-word set is unchanged.
    my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(
        ['cid', 'et', 'et al', 'al', 'yes', 'method',
         'results', 'citation', 'use', 'used', 'submitted', 'published']))

    count_vectorizer = CountVectorizer(ngram_range=ngram_range,
                                       stop_words=my_stop_words,
                                       token_pattern="\\b[a-z][a-z]{2,15}\\b",
                                       min_df=min_df, max_df=max_df)
    counts = count_vectorizer.fit_transform(text_data[:, 0])
    return counts, count_vectorizer, text_data

In [104]:
def tfidf_vectorizer_fromCSV(df, ngram_range=(2, 2), min_df=5, max_df=30):
    """Vectorise the first column of ``df`` into a TF-IDF weighted matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (column 0 of ``df.values``) holds one
        document's text per row.
    ngram_range : tuple of int, optional
        ``(min_n, max_n)`` handed to ``TfidfVectorizer``. Default ``(2, 2)``.
    min_df, max_df : int or float, optional
        Document-frequency bounds handed to ``TfidfVectorizer``.
        Defaults ``5`` and ``30``.

    Returns
    -------
    tfidf_vecs
        Result of ``TfidfVectorizer.fit_transform`` on the documents.
    tfidf : TfidfVectorizer
        The fitted vectoriser.
    text_data : numpy.ndarray
        ``df.values``, unchanged.
    """
    text_data = df.values

    # scikit-learn documents ``stop_words`` as 'english', a list or None, and
    # recent releases reject the frozenset that ``.union`` returns, so a list
    # is passed instead.
    # The token pattern below only produces 3-16 letter tokens, so the
    # shorter entries ('et', 'al', 'et al') can never match; they are kept
    # only so the stop-word set is unchanged.
    my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(
        ['cid', 'et', 'et al', 'al', 'yes', 'method',
         'results', 'citation', 'use', 'used', 'submitted', 'published']))

    tfidf = TfidfVectorizer(ngram_range=ngram_range,
                            stop_words=my_stop_words,
                            token_pattern="\\b[a-zA-Z][a-zA-Z]{2,15}\\b",
                            min_df=min_df, max_df=max_df)
    tfidf_vecs = tfidf.fit_transform(text_data[:, 0])
    return tfidf_vecs, tfidf, text_data

In [105]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every component of ``model``.

    One line is printed per row of ``model.components_``, naming the
    ``n_top_words`` entries of ``feature_names`` with the largest weights in
    descending order. A single blank line is printed after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading indices.
        best_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in best_indices]
        print("Topic #%d: " % topic_no + ", ".join(top_terms))
    print()

In [143]:
def print_top_grams_bydoc(result, model, feature_names, n_docs, n_top_words):
    """Print each document's strongest topic and that topic's top features.

    For the first ``n_docs`` rows of ``result`` the index of the largest
    value in the row is taken as the document's top topic. The
    ``n_top_words`` highest-weighted entries of ``feature_names`` for that
    row of ``model.components_`` are then listed in descending order.
    A single blank line is printed at the end.
    """
    for doc_no in range(n_docs):
        best_topic = np.argmax(result[doc_no])
        # argsort is ascending, so reverse it and keep the leading indices.
        ranked = model.components_[best_topic].argsort()[::-1][:n_top_words]
        header = "Document #%d, top topic #%d: " % (doc_no, best_topic)
        print(header + ", ".join(feature_names[idx] for idx in ranked))
    print()

In [106]:
# `DataFrame.from_csv` was removed in pandas 1.0. `read_csv` with these
# arguments reproduces its defaults: first column as index, dates parsed.
df = pd.read_csv('text_df.csv', index_col=0, parse_dates=True)

## Count Vectorizer method

In [107]:
# Build the n-gram count matrix from `df`. The fitted vectoriser is kept
# because the feature names are read from it when the topics are printed.
dtm_tf, tf_vectorizer,text_data = word_counter_fromCSV(df)

In [108]:
# Hand NMF a SciPy CSR matrix.
# NOTE(review): fit_transform presumably already returns a sparse matrix,
# which would make this conversion a no-op - confirm before removing it.
sp_dtm_tf = sparse.csr_matrix(dtm_tf)

In [109]:
# Factorise the count matrix into 20 components; random_state is pinned so
# reruns are repeatable.
NMF_model = NMF(n_components=20,random_state=0)
# Despite its name, NMF_fitted is the return value of fit_transform (read
# row-by-row per document by print_top_grams_bydoc), not the fitted model.
NMF_fitted = NMF_model.fit_transform(sp_dtm_tf)

In [110]:
print("\nTopics in NMF model:")
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back to the old name on releases that predate it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    countvec_feature_names = tf_vectorizer.get_feature_names_out()
else:
    countvec_feature_names = tf_vectorizer.get_feature_names()
print_top_words(NMF_model, countvec_feature_names, 5)


Topics in NMF model:
Topic #0: multi task, multi task learning, different tasks, task feature, multi task feature
Topic #1: metric learning, structured data, distance metric, learning feature, edit distance
Topic #2: arxiv prints, value function, deep reinforcement, deep reinforcement learning, learning representations
Topic #3: true true, performance measures, performance measure, cost sensitive, trained task
Topic #4: hinge loss, mirror descent, step size, data distribution, shalev shwartz
Topic #5: quantum machine, quantum machine learning, learning quantum, quantum computer, quantum algorithms
Topic #6: function class, empirical risk, training points, function space, approximation error
Topic #7: graphical models, exponential family, graphical model, family distributions, exp exp
Topic #8: text categorization, acm international, acm international conference, conference research, sch utze
Topic #9: adversarial examples, adversarial training, conference paper, adversarial example, o

In [145]:
# For the first 10 documents, show the top topic and its 2 strongest n-grams.
# NOTE(review): the saved output under this cell is identical to the output
# of the TF-IDF section's call and names TF-IDF topics (e.g. topic #0 as
# "decision support"), so it looks stale - re-run on a fresh kernel.
print_top_grams_bydoc(NMF_fitted,NMF_model, countvec_feature_names, 10,2)

Document #0, top topic #10: empirical risk, risk minimization
Document #1, top topic #0: decision support, recommender systems
Document #2, top topic #13: adversarial examples, adversarial training
Document #3, top topic #7: dictionary learning, hidden units
Document #4, top topic #10: empirical risk, risk minimization
Document #5, top topic #14: multi armed, armed bandits
Document #6, top topic #12: learning extract, web pages
Document #7, top topic #12: learning extract, web pages
Document #8, top topic #3: contextual features, context sensitive
Document #9, top topic #7: dictionary learning, hidden units



## TFIDF Implementation

In [146]:
# Repeat the pipeline with TF-IDF weights.
# NOTE(review): this rebinds dtm_tf, tf_vectorizer and text_data, so the
# count-vectoriser objects from the previous section are lost after this cell.
dtm_tf, tf_vectorizer,text_data = tfidf_vectorizer_fromCSV(df)

In [147]:
# Hand NMF a SciPy CSR matrix (now holding the TF-IDF weights).
# NOTE(review): fit_transform presumably already returns a sparse matrix,
# which would make this conversion a no-op - confirm before removing it.
sp_dtm_tf = sparse.csr_matrix(dtm_tf)

In [148]:
# Same factorisation settings as the count-based run (20 components, fixed
# seed). This rebinds NMF_model and NMF_fitted to the TF-IDF results.
NMF_model = NMF(n_components=20,random_state=0)
# NMF_fitted is the return value of fit_transform (read row-by-row per
# document by print_top_grams_bydoc), not the fitted model.
NMF_fitted = NMF_model.fit_transform(sp_dtm_tf)

In [149]:
print("\nTopics in NMF model:")
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back to the old name on releases that predate it.
# The variable keeps its count-vectoriser name because a later cell reads
# it, but here it holds the TF-IDF vectoriser's feature names.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    countvec_feature_names = tf_vectorizer.get_feature_names_out()
else:
    countvec_feature_names = tf_vectorizer.get_feature_names()
print_top_words(NMF_model, countvec_feature_names, 5)


Topics in NMF model:
Topic #0: decision support, recommender systems, health care, feature engineering, software engineering
Topic #1: quantum machine, quantum learning, quantum algorithm, learning quantum, quantum algorithms
Topic #2: scikit learn, ensemble learning, pedregosa varoquaux, learning python, learn machine
Topic #3: contextual features, context sensitive, primary features, contextual information, sensitive features
Topic #4: convergence rate, strongly convex, matrix factorization, step size, coordinate descent
Topic #5: multi task, multitask learning, multiple tasks, sparse coding, lifelong learning
Topic #6: state action, learning agents, value function, state space, learning agent
Topic #7: dictionary learning, hidden units, deep belief, auto encoders, new classes
Topic #8: meta learning, shot learning, meta features, meta data, instance level
Topic #9: phys rev, rev lett, materials science, quantum machine, dimensional vectors
Topic #10: empirical risk, risk minimizati

In [150]:
# For the first 10 documents, show the top TF-IDF topic and its 2 strongest
# n-grams.
print_top_grams_bydoc(NMF_fitted,NMF_model, countvec_feature_names, 10,2)

Document #0, top topic #10: empirical risk, risk minimization
Document #1, top topic #0: decision support, recommender systems
Document #2, top topic #13: adversarial examples, adversarial training
Document #3, top topic #7: dictionary learning, hidden units
Document #4, top topic #10: empirical risk, risk minimization
Document #5, top topic #14: multi armed, armed bandits
Document #6, top topic #12: learning extract, web pages
Document #7, top topic #12: learning extract, web pages
Document #8, top topic #3: contextual features, context sensitive
Document #9, top topic #7: dictionary learning, hidden units

