In [1]:
## Dependencies
import sys, os
import pandas as pd
import re
import glob
import numpy as np
from pprint import pprint

## Scikitlearn
import sklearn as sklearn
from tika import parser # pip install tika
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

## NLTK
import nltk as nltk
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

## Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud
# load and show an image with Pillow
from PIL import Image



  from PIL import PILLOW_VERSION
  from PIL import PILLOW_VERSION


In [2]:
## Check versions as needed

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
#print('The pillow version is {}.'.format(PILLOW_VERSION))

The nltk version is 3.5.
The scikit-learn version is 0.23.1.


In [3]:
## Use the glob method to retrieve files/pathnames in the directory
## https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/

directory = "News_Industry"
files = list(glob.glob(os.path.join(directory,'*.*')))
print(files)

## Other Resources
## https://stackoverflow.com/questions/34000914/how-to-create-a-list-from-filenames-in-a-user-specified-directory-in-python
## https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
## https://stackoverflow.com/questions/33912773/python-read-txt-files-into-a-dataframe


['News_Industry\\Bibliography.10AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN.pdf', 'News_Industry\\Bibliography.12Impact of Socio-demographic Factors on Awareness of Smoking Effects on Oral Health among Smokers and.pdf', 'News_Industry\\Bibliography.17Health-Promoting Factors related to lifestyle among nursing students in University of Hail.pdf', 'News_Industry\\Bibliography.17Multinomial logit analysis of the effects of five different app-based incentives to encourage cyclin.pdf', 'News_Industry\\Bibliography.1PREVALENCE OF DYSLIPIDEMIA IN YOUNG ADULTS.pdf', 'News_Industry\\Bibliography.20Risk Factors for Atherosclerotic Cardiovascular Disease in the South Asian Population.pdf', 'News_Industry\\Bibliography.29Is the Gay Community the Neo-marginalised of Modern Society_.pdf', 'News_Industry\\Bibliography.33A Biological Effect of Sex Hormone Binding Globulin and Testosterone in Polycystic Ovary Syndrome (P.pdf', 'News_Industry\\Bibliography.34DETERMINANTS OF DEPRESSION ANXIETY STRESS

In [4]:
## Extract text from the pdfs and add them to a list using Tika Python
## The output is a dictionary with: metadata, content, status

document_list = []
for f in files:
    raw = parser.from_file(f)
    document_list.append(raw)
    
## Resources
## https://www.geeksforgeeks.org/parsing-pdfs-in-python-with-tika/
## https://stackoverflow.com/questions/34837707/how-to-extract-text-from-a-pdf-file

In [5]:
## Add the dictionary to a pandas dataframe

text_df = pd.DataFrame(document_list)
text_df.head()
#print(text_df["content"][1])

Unnamed: 0,metadata,content,status
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200


In [6]:
## Create a new column in the dataframe called "title" and populate it with the title from the metadata key called dc:title

text_df['title'] = [value.get('dc:title') for value in text_df["metadata"]]

text_df.head()

## Resources
## https://stackoverflow.com/questions/44218812/pandas-add-columns-to-a-dataframe-based-in-dict-from-one-of-the-columns

## If needed, you can review the contents of the metadata column using this code:
## print(text_df['metadata'])
## my_dict.keys()[0]     -> key of "first" element
## my_dict.values()[0]   -> value of "first" element
## my_dict.items()[0]    -> (key, value) tuple of "first" element
##list(contacts.items())[0]

# for v in text_df['metadata']:
#     new = list(v.items())[1]
#     print(new)

Unnamed: 0,metadata,content,status,title
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Impact of Socio-demographic Factors on Awarene...
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Health-Promoting Factors related to lifestyle ...
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Multinomial logit analysis of the effects of f...
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,PREVALENCE OF DYSLIPIDEMIA IN YOUNG ADULTS.


In [7]:
## Consider deleting this code
# Convert the "content" column in dataframe to a list
# The Count Vectorize and the fit transform() for Scikit-Learn expects an iterable 
# or list of strings or file objects, and creates a dictionary of the vocabulary on the corpus.

def convert_content_to_list(text_df):
    
    global content_list
    
    content_list = text_df['content'].tolist()

    return content_list

convert_content_to_list(text_df)

print(type(content_list))
print(content_list[0])

<class 'list'>
















































AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN


 

AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN

Journal of Postgraduate Medical Institute

December 31, 2019 Tuesday

Copyright 2019 Postgraduate Medical Institute All Rights Reserved

Section: Vol. 33; No. 4

Length: 3751 words

Byline: Faiza Shafique and Riffat Sadiq

Body

KeyWords: Aggression, Health, Women

INTRODUCTION

Aggression is an instinctive drive of a person and a dark side of human nature1. It includes a variety of range of 
behaviors2. Aggression involves verbal and physical assault3, therefore, its expression results in intense violence 
towards others4. Aggression is an unwanted and maladaptive behavior causing damage and obliteration5. It is 
exhibited in different forms encompassing physical aggression, verbal aggression, anger and hostility6. A person 
with physical aggression causing physical and emotional harm others while harming or hurting someon

#### Pre-processing for dataset

Scikit-Learn does tokenization using Count Vectorize, but not stemming. Stemming, lemmatizing, compound splitting, filtering based on part-of-speech, etc. are not included in the scikit-learn codebase, but can be added by customizing either the tokenizer or the analyzer.

<https://scikit-learn.org/stable/modules/feature_extraction.html>

In [8]:
## Pre-process the text to lower case, remove special characters, etc. 
## https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn
## Test regex here: https://pythex.org/

def preprocess(text):
    
    ## Lowercase words
    text_lower = text.lower()
    
    ## Remove Emails from text
    ## if you need to match a \, you can precede them with a backslash to remove their special meaning: \\.
    ## \S matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v].
    ## \s Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v]
    ## Code below matches any character, then an @ sign, then more characters, end matching when a white space is found.
    text_email = re.sub('\\S*@\\S*\\s?', '', text_lower) 
    
    ## Remove URLS from text
    ## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/40823105#40823105
    ## text_urls = re.sub(r'http\S+', '', text_email)
    ## https://www.geeksforgeeks.org/python-check-url-string/
    text_urls = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",'', text_email)
    
    
    ## Remove tabs and new lines from text
    ## https://stackoverflow.com/questions/16355732/how-to-remove-tabs-and-newlines-with-a-regex
    ## \s Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v]
    text_spaces = re.sub(r'\s+',' ',text_urls)
        
    ## Remove \n from text
    text_space_character = text_spaces.replace('\n','')
    
    ## Remove \t from text
    text_tab_character = text_space_character.replace('\t','')
    
    ## Remove special characters and numbers
    ## \W matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_]
    ## \d matches any decimal digit; this is equivalent to the class [0-9]
    text_numbers = re.sub("(\\d|\\W)+"," ",text_tab_character)
    
    ## Remove tags
    ##text_tags = re.sub("","",text_numbers)

    ## Remove special characters and space, but leave in periods and numbers
    ## ^ means any character except. So [^5] will match any character except '5'
    ## [^a-zA-Z0-9_] matches any non-alphanumeric character.
    ## text_special = re.sub('[^A-Za-z0-9.]+|\s',' ',text_tab_character)
    
    ## Remove a sepcial list of terms
    ## https://stackoverflow.com/questions/15435726/remove-all-occurrences-of-words-in-a-string-from-a-python-list
    
    PRUNE_LIST = ['right reserved section',
                   'reserved section',
                   "length word byline", 
                   "byline", 
                   "word byline",
                   "journal code",
                   "load date", 
                   "english", 
                   "dr", 
                   "publication type magazine",
                   "type magazine",
                   "magazine",
                   "type newspaper",
                   "publication type newspaper",
                   'newspaper',
                   "group right reserved",
                   'section:',
                   'copyright',
                   'body',
                   'length:',
                   'keywords:',
                   'introduction',
                   'page',
                   'methodology',
                   'table',
                   'discussion',
                   'conclusions',
                   'references',
                   'classification',
                   'language',
                   'industry',
                   'geographic',
                   'load-date',
                   'end of document',
                   'mg dl',
                   'mg'
                   
                  ]

    remove = '|'.join(PRUNE_LIST)
    regex = re.compile(r'\b('+remove+r')\b', flags=re.IGNORECASE)
    text_special_remove = regex.sub("", text_numbers)

    return text_special_remove

## New column "preprocess" is formed from applying pre_process function to each item in the "content" column in dataframe
text_df['preprocess'] = text_df['content'].apply(lambda x:preprocess(x))

print(text_df['preprocess'][1])

#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/



In [9]:
## Open the stopwords file
def get_stop_words(stop_file_path):
#     """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("stop_words.txt")

In [10]:
## Lemmatize documents

    
def lemmatize(doc_list, stopwords):
    
    #initiate a lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    #initiate an empty string
    lemmatized_text=''

    #split each doc into words
    for word in doc_list.split():
            
        #check if each word is in stopword list and lemmatize, add to string
        if word not in stopwords:
            
                lemmatized_text = lemmatized_text+' '+ str(lemmatizer.lemmatize(word))
                
    return lemmatized_text

## New column "lemmatize" is formed from applying pre_process function to each item in the "preprocess" column in dataframe
text_df['lemmatize'] = text_df['preprocess'].apply(lambda x:lemmatize(x, stopwords))

print(text_df['lemmatize'][1])




In [11]:
## import dataframe from non-core spreadsheet with year column
industry_df = pd.read_csv("news_industry_metadata.csv", encoding='ISO-8859-1') 
industry_df.head()
industry_df.keys()



Index(['DOI', 'PubMed_ID', 'Scopus_ID', 'WoS_ID', 'DT', 'RD', 'CORE',
       'Authors', 'Title ', 'Year', 'Source title '],
      dtype='object')

In [12]:
## Match text_df and non_core_df based on title to add year to the text_df dataframe

final_df= text_df.merge(industry_df, left_on = 'title', right_on = 'Title ', how = 'inner')
final_df.head()

# result = pd.concat([df1, df4], axis=1, join="inner")
# df1.merge(df2, left_on='lkey', right_on='rkey')

## https://stackoverflow.com/questions/49890305/match-two-columns-from-two-dataframes-and-add-items-from-a-third-column-if-cells
# miscset = miscset.merge(oset, left_on='subset', right_on='some_items', 
#     how='inner').drop(columns='some_items')

Unnamed: 0,metadata,content,status,title,preprocess,lemmatize,DOI,PubMed_ID,Scopus_ID,WoS_ID,DT,RD,CORE,Authors,Title,Year,Source title
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN,aggression and physical health in married wom...,aggression physical health married woman aggr...,,,,,News Industry,CO,NONCORE,S. Faiza; S. Riffat,AGGRESSION AND PHYSICAL HEALTH IN MARRIED WOMEN,2019,Journal of Postgraduate Medical Institute
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Impact of Socio-demographic Factors on Awarene...,impact of socio demographic factors on awaren...,impact socio demographic factor awareness smo...,,,,,News Industry,CO,NONCORE,N. M. C. M. A. H. A. F. Asad R; H. A. Majeed,Impact of Socio-demographic Factors on Awarene...,2019,Biomedica
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Health-Promoting Factors related to lifestyle ...,health promoting factors related to lifestyle...,health promoting factor lifestyle nursing stu...,,,,,News Industry,CO,NONCORE,V. I. Dr,Health-Promoting Factors related to lifestyle ...,2018,Asian Journal of Nursing Education and Research
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Multinomial logit analysis of the effects of f...,multinomial logit analysis of the effects of ...,multinomial logit analysis effect app based i...,,,,,News Industry,CO,NONCORE,T. F. T. T. E. V. B. Bingyuan Huang,Multinomial logit analysis of the effects of f...,2018,IET Intelligent Transport Systems
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Risk Factors for Atherosclerotic Cardiovascula...,risk factors for atherosclerotic cardiovascul...,risk factor atherosclerotic cardiovascular di...,,,,,News Industry,CO,NONCORE,D. N.-C. C. F. Casey Elkins; P. F. N. P. C. T....,Risk Factors for Atherosclerotic Cardiovascula...,2018,Clinical Advisor


In [13]:
## Create a dataframe form the document list
text_df.to_csv('sklearn_test.csv', columns = ['metadata','lemmatize'])

In [14]:
## Convert the "lemmatized" column in dataframe to a list

def convert_lemmatized_to_list(text_df):
    
    global lemmatized_list
    lemmatized_list = text_df['lemmatize'].tolist()

    return lemmatized_list

convert_lemmatized_to_list(text_df)
print(type(lemmatized_list))
print(len(lemmatized_list))
print(lemmatized_list[1])

<class 'list'>
103


### CountVectorizer 

The vectorizer objects provided by Scikit-Learn are quite reliable right out of the box, they allow us to perform all the above steps at once efficiently, and even apply preprocessing and rules regarding the number and frequency of tokens.Count Vectorizer: The most straightforward one, it counts the number of times a token shows up in the document and uses this value as its weight.

We first have to create a CountVectorizer to count the number of words (term frequency), limit your vocabulary size, apply stop words and etc. The CountVectorizer transformer from the sklearn.feature_extraction model has its own internal tokenization and normalization methods. 

**How it wants the data:** Note the corpus should be a list (which is made of a "list of strings" not a "list of lists"). CountVectorizer considers each element of the list as a different document to vectorize.

**Expected return:** CountVectorizer creates a python dictionary of the tokens and their unique IDs from the corpus.

**Helpful methods:** 
* NOTE: cv is the variable I used when creating my CountVectorizer (see code below)
* Print the ID of one word in the dictionary: print(cv.vocabulary_.get(u'aspirin'))
* Print the list of terms and their unique IDs: print(cv.vocabulary_)
* Print the first 50 token IDs: print(list(cv.vocabulary_.keys())[:50])
* Print the frist 50 token names: print(list(cv.vocabulary_.values())[:50])
* Get a list of the token names: cv.get_feature_names()
* Print the stop list that was used: print(cv.get_stop_words())


### Fit_Transform 
Then we will use fit_transform to create a term-document matrix, where each column in the matrix represents a word in the vocabulary while each row represents the document in our dataset where the values in this case are the word counts.

**How it wants the data:** Remember that fit_transform() function for Scikit-Learn expects an iterable or list of strings or file objects.
 
**Expected return:** When fit_transform() is called, each individual document is transformed into a sparse array/matrix whose index tuple is the row (the document ID) and the token ID from the dictionary, and whose value is the count.

**Helpful methods:**
* Check the shape, which should return Number of documents in corpus, Number of terms extracted from corpus: print(word_count_vector.shape) 
* Note, the the todense() function acts as a dataframe contructor for a numpy matrix: word_count_df_all = pd.DataFrame(word_count_vector.todense())

#### Text from: 
* <https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af>
* <https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X70izdBKiUn>
* <https://kavita-ganesan.com/how-to-use-countvectorizer/#Working-With-NGrams>
* <https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html>
* <https://medium.com/@rnbrown/more-nlp-with-sklearns-countvectorizer-add577a0b8c8>

In [None]:
def vectorize_content_list(lemmatized_list): 

    global word_count_vector
    global cv
       
    ## Set parameters for Count Vectorize
    ## max_df: Ignore words that appear in 85% of documents, 
    ## max_df=0.85,
    ## stop_words: Eliminate stop words
    ## ngram_range: Include tokens with one word or two word phrases, (1, 2) means unigrams and bigrams
    ## max_features: Limit our vocabulary size to 10,000
    ## min_df: Ignore words that only appear in 1 document  
    ## Additional features not used: 
    ### preprocessor = preprocess, then use the initiate the preprocess function
    ### binary = True, then CountVectorizer no longer uses the counts of terms/tokens. 
    #### If a token is present in a document, it is 1, if absent it is 0 regardless of its frequency of occurrence. 
    #### By default, binary=False.
    
    cv=CountVectorizer(
                       stop_words='english',
                       ngram_range=(1, 3),
                       max_features=10000, 
                       min_df=1)
                    
   
    ## Use Count Vectorizer and call fit_transform() to create the vocabulary and return a term-document matrix for each document
    ## Return is tuple: the document ID, the token ID from the dictionary, the count
    word_count_vector=cv.fit_transform(lemmatized_list)
    
    ##.toarray()
    
    return cv, word_count_vector

cv, word_count = vectorize_content_list(lemmatized_list)

### Do some checking of the output

First, check the shape. We should have the same number of rows as documents in our dataset (6 rows = 6 docs) and the number of columns based on the unique words in our dataset, which we limited above to 10,000. 

Second, check the index of one of the words in the dictionary. 
Third, check the keys and the values. 
Fourth, review the tuple that is the output.

In [None]:
## Review the outputs of the vectorize_string() function
# print the stop list used
#print(cv.get_stop_words())

# print(type(cv))
print(cv.vocabulary_.get(u'aspirin'))
print(type(cv.vocabulary_))
#print(cv.vocabulary_)

## Review the first 50 token IDs
# print(list(cv.vocabulary_.keys())[:50])

## Review the frist 50 token names
# print(list(cv.vocabulary_.values())[:50])

## Review the word_count_vector
## The vector includes (doc, token_id) and a count of term in document

print(type(word_count_vector))
print(word_count_vector.shape)
# print(word_count_vector)

## See what stop words were found using min_df, max_df, and stopwords
print(cv.stop_words_)

### Check the Sparcity of the data

Sparsicity is nothing but the percentage of non-zero datapoints in the document-word matrix, that is data_vectorized.

Since most cells in this matrix will be zero, I am interested in knowing what percentage of cells contain non-zero values.

Resource:<https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
# Materialize the sparse data
data_dense = word_count_vector.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity (percent of cells contain non-zero): ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

## Topic Modeling with ScikitLearn using LDA

Topic Modelling is an unsupervised Machine Learning task where we try to discover “abstract topics” that can describe a collection of documents. For example, let’s say we have a collection of 100 texts. We go through each text and discover that ten of them contain words like “machine learning”, “training”, “supervised”, “unsupervised”, “dataset” and so on. We may not know what these words mean and we really don’t care. We only see a pattern here, that 10% of our articles contain these words and we conclude that they should be included in the same topic. We can’t actually name the topic and again, this is not needed. We are able to cluster these 10 articles into the same topic. And when we get a new text which we have never seen before, we look into it, we find it contains some of these words, then we’ll be able to say “hey, this goes into the same category with the other 10 articles!”

Our first step is to build an LDA model with scikitlearn. We need to set the number of topics (usually start with 5 to 10 topics) and choose a learning decay state (usually: 0.5, 0.7, or 0.9). Later we will test our LDA model to identify the best number of topics and the setting for learning decay. 

The LatentDirichletAllocation() function works like this: 

Assign every word in every document to a temporary topic. This temporary topic will be random at first, but will be updated in the next step.For this step, we will go through every document and then every word in that document and compute 2 values:
* the probability that this document belongs to a certain topic; this is based on how many words(except the current word) from this document belong to the topic of the current word
* the proportion of documents that are assigned to the topic of the current word because of the current word.

This step is completed a certain number of times (established before beginning to run the algorithm). In the end, we will look at each document, find the topic that is most prevalent based on its words and assign that document to that topic.

**How it wants the data:** The LDA topic model algorithm requires a document word matrix as the main input. You can create one using CountVectorizer (see steps above to create cv) and the output of the fit_transform, which we called word_count_vector. 

**Expected Return:** After calling fit_transform on the lda_model, the return is an array which contains a the number of probabilities that our text belongs to each of the number of topics we've asked for. 

For example, if there are three numbers for each item in the array, then these 3 numbers are probabilities that our text belongs to one of the 3 topics we’ve generated from the LDA algorithm. We can see that the highest probability(72%) tells us that this text should also belong to the 3rd topic, so in the same topic that talks about cities. We can see that this is a very good result obtained from a very small dataset.

Text from: <https://towardsdatascience.com/latent-dirichlet-allocation-for-topic-modelling-explained-algorithm-and-python-scikit-learn-c65a82e7304d>

In [None]:
## Build the LDA Model

feature_names = cv.get_feature_names()
num_of_topic = 2

lda_model = LatentDirichletAllocation(
                n_components=num_of_topic, 
                max_iter=5, 
                learning_method='online', 
                random_state=0,
                learning_decay=0.9)

lda_output = lda_model.fit_transform(word_count_vector)
#print(lda_output)# Model attributes

##https://towardsdatascience.com/2-latent-methods-for-dimension-reduction-and-topic-modeling-20ff6d7d547#:~:text=Both%20LSA%20and%20LDA%20have,LDA%20solves%20topic%20modeling%20problems.



In [None]:
def display_word_distribution(model, feature_names, n_word):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        words = []
        for i in topic.argsort()[:-n_word - 1:-1]:
            words.append(feature_names[i])
        print(words)

display_word_distribution(
    model=lda_model, feature_names=feature_names, 
    n_word=10)

## n_word here is the number of terms to print for each topic

### Diagnose model performance with perplexity and log-likelihood

A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good.

Coherence in this case measures a single topic by the degree of semantic similarity between high scoring words in the topic (do these words co-occur across the text corpus). 

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
## A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good.
## https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn

# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(word_count_vector))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(word_count_vector))

# See model parameters
pprint(lda_model.get_params())

### Using GridSearch to find the best LDA Model

The most important tuning parameter for LDA models is n_components (number of topics). In addition, we will search learning_decay (which controls the learning rate) as well.

Be warned, the grid search constructs multiple LDA models for all possible combinations of param values in the param_grid dict. So, this process can consume a lot of time and resources.

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

**What is GridSearch?**
Gridsearch helps us establish the best hyperparameters for our model. A machine learning model has multiple parameters that are not trained by the training set. These parameters control the accuracy of the model. Therefore, the hyperparameters are particularly important in a data science project.The hyperparameters are configured up-front and are provided by the caller of the model before the model is trained.

Text from: <https://medium.com/fintechexplained/what-is-grid-search-c01fe886ef0a>

In [None]:
# Define Search Param
search_params = {'n_components': [2, 4, 6, 8, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(word_count_vector)


## A more in-depth version of the code above

# GridSearchCV(cv=None, error_score='raise',
#        estimator=LatentDirichletAllocation(batch_size=128, 
#                                            doc_topic_prior=None,
#                                            evaluate_every=-1, 
#                                            learning_decay=0.7, 
#                                            learning_method=None,
#                                            learning_offset=10.0, 
#                                            max_doc_update_iter=100, 
#                                            max_iter=10,
#                                            mean_change_tol=0.001, 
#                                            n_components=10, 
#                                            n_jobs=1,
#                                            perp_tol=0.1, 
#                                            random_state=None,
#                                            topic_word_prior=None, 
#                                            total_samples=1000000.0, 
#                                            verbose=0),
#                                            #fit_params=None, 
#                                            #iid=True, 
#                                            #n_jobs=1,
#        param_grid={'n_components': [2, 4, 6, 8, 10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
#        pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
#        scoring=None, verbose=0)


### Printing the best parameters using GridSearch

In [None]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(word_count_vector))

In [None]:
## Grid search returns a number of results. Here we can view those results in a pandas dataframe
# model.cv_results_.keys()
# model.cv_results_.values()

results_df = pd.DataFrame.from_dict(model.cv_results_)
results_df.head()

### Compare LDA Model Performance Scores 
Though we found the best scores above, here we can visualize the results of the analysis used to find those scores. 

In [None]:
# Get Log Likelyhoods from Grid Search Output
n_components = [2, 4, 6, 8, 10, 15, 20, 25, 30]

 
log_likelyhoods_5 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.5]
log_likelyhoods_7 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.7]
log_likelyhoods_9 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_components, log_likelyhoods_5, label='0.5')
plt.plot(n_components, log_likelyhoods_7, label='0.7')
plt.plot(n_components, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

### How to see the dominant topic in each document?

To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.

In the table below, I’ve greened out all major topics in a document and assigned the most dominant topic in its own column.

Note: The number in the dominate topic column corresponds the number of the topic for which the document is most like. 

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
## Turn original dataframe of text to a list
data = text_df.values.tolist()

# Create Document - Topic Matrix
lda_output = best_lda_model.transform(word_count_vector)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

### Review topics distribution across documents


In [None]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

### Visualizing the LDA model with pyLDAvis
The pyLDAvis offers the best visualization to view the topics-keywords

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
## A good topic model will have non-overlapping, fairly big sized blobs for each topic. 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, word_count_vector, cv, mds='tsne')
panel

### Viewing all of the topic’s keywords
The weights of each keyword in each topic is contained in lda_model.components_ as a 2d array. The names of the keywords itself can be obtained from vectorizer object using get_feature_names().

Let’s use this info to construct a weight matrix for all keywords in each topic.

Text from <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = cv.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()

### Get the top 15 keywords each topic

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=cv, lda_model=lda_model, n_words=15):
    keywords = np.array(cv.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=cv, lda_model=best_lda_model, n_words=15)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

### Preditcting the topics for a new piece of text

Assuming that you have already built the topic model, you need to take the text through the same routine of transformations and before predicting the topic.

For our case, the order of transformations is:

sent_to_words() –> lemmatization() –> vectorizer.transform() –> best_lda_model.transform()

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn>

In [None]:
## NOTE!! This codes does not conform to my code here...just sample code from the url above

## Define function to predict topic for a given text document.
# nlp = spacy.load('en', disable=['parser', 'ner'])

# def predict_topic(text, nlp=nlp):
#     global sent_to_words
#     global lemmatization

#     # Step 1: Clean with simple_preprocess
#     mytext_2 = list(sent_to_words(text))

#     # Step 2: Lemmatize
#     mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#     # Step 3: Vectorize transform
#     mytext_4 = vectorizer.transform(mytext_3)

#     # Step 4: LDA Transform
#     topic_probability_scores = best_lda_model.transform(mytext_4)
#     topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
#     return topic, topic_probability_scores

# # Predict the topic
# mytext = ["Some text about christianity and bible"]
# topic, prob_scores = predict_topic(text = mytext)
# print(topic)

### How to cluster documents that share similar topics and plot?
You can use k-means clustering on the document-topic probabilioty matrix, which is nothing but lda_output object. The k-means clustering method is an unsupervised machine learning technique used to identify clusters of data objects in a dataset. Because our best model has 2 clusters, we've set n_clusters=2 in KMeans().Alternately, you could avoid k-means and instead, assign the cluster as the topic column number with the highest probability score.

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#20howtoclusterdocumentsthatsharesimilartopicsandplot>

The k-means clustering method is an unsupervised machine learning technique used to identify clusters of data objects in a dataset.

Text from: <https://realpython.com/k-means-clustering-python/>

In [None]:
## Construct the k-means clusters

clusters = KMeans(n_clusters=2, random_state=100).fit(lda_output)
# clusters.inertia_ ## The lowest SSE value
# clusters.cluster_centers_ ## Final locations of the centroid
# clusters.n_iter_ ## The number of iterations required to converge


clusters_2 = KMeans(n_clusters=2, random_state=100).fit_transform(lda_output)
print(clusters_2)

## Build the Singular Value Decomposition(SVD) model
## Use code when you have more than 2 clusters
#svd_model = TruncatedSVD(n_components=2)  
#lda_output_svd = svd_model.fit_transform(lda_output)

# ## X and Y axes of the plot using SVD decomposition
# x = lda_output_svd[:, 0]
# y = lda_output_svd[:, 1]

# ## Weights for the 15 columns of lda_output, for each component
# print("Component's weights: \n", np.round(svd_model.components_, 2))

# ## Percentage of total information in 'lda_output' explained by the two components
# print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

## Use this code when you have equal to or less than 2 clusters
x = lda_output[:, 0]
y = lda_output[:, 1]


### Visualize the clusters

We now have the cluster number. But we also need the X and Y columns to draw the plot.

For the X and Y, you can use SVD on the lda_output object with n_components as 2. SVD ensures that these two columns captures the maximum possible amount of information from lda_output in the first 2 components.

Text from: <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#20howtoclusterdocumentsthatsharesimilartopicsandplot>

In [None]:
## We have the X, Y and the cluster number for each document.

## Let’s plot the document along the two SVD decomposed components. The color of points represents the cluster number (in this case) or topic number.

## Use this code when you have only 2 clusters
x = lda_output[:, 0]
y = lda_output[:, 1]

## Plot
plt.figure(figsize=(12, 12))
plt.scatter(x, y)
plt.xlabel('Component 2')
plt.xlabel('Component 1')
plt.title("Segregation of Topic Clusters", )

### Using NMF instead of LDA

TF-IDF Vectorizer: TF-IDF stands for “term frequency-inverse document frequency”, meaning the weight assigned to each token not only depends on its frequency in a document but also how recurrent that term is in the entire corpora. More on that here.

<https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af>

In [None]:
## NMF is able to use tf-idf
## TFIDF VEctorize is Equivalent to CountVectorizer followed by TfidfTransformer.
## Set parameters for TFIDF Vectorize
    ## Ignore words that appear in 85% of documents, 
    ## Eliminate stop words
    ## Include tokens with one word or two word phrases, (1, 2) means unigrams and bigrams
    ## Run the preprocess function 
    ## Limit our vocabulary size to 10,000
    ## Ignore words that only appear in 1 document
    
tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                   stop_words='english',
                                   ngram_range=(1, 3),
                                   max_features=10000,
                                   min_df=1, 
                                   )
tfidf = tfidf_vectorizer.fit_transform(lemmatized_list)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [None]:
no_topics = 2
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [None]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
no_top_features = 15
def display_topics(model, feature_names, no_top_features):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic:" ,(topic_idx))
        print (", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_features - 1:-1]]))

display_topics(nmf, tfidf_feature_names, no_top_features)