# Project 2
**Name**: Adriane Mikko Amorado<br>
**Course Name**: Solving Business Problems with NLP<br>
**Instructor**: Juber Rahman

## Imports

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# NLTK data packages needed by the cells below.
# 'stopwords' backs stopwords.words('english'), passed to both vectorizers.
nltk.download('stopwords')
# Presumably the model behind nltk.pos_tag, which tokenizer() calls -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
# Presumably the lexical data WordNetLemmatizer relies on -- TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Analysis

### 1. Select a book of your choice from project Gutenberg https://www.gutenberg.org/

In [15]:
# Plain-text file for Project Gutenberg ebook #15420 (the id appears in the URL path).
url = 'https://www.gutenberg.org/files/15420/15420.txt'

### 2. Load the text in your python workspace

In [16]:
# Download the book. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() fails loudly on an HTTP error instead of
# letting an error page be parsed as if it were the book text.
data = requests.get(url, timeout=30)
data.raise_for_status()

### 3. Do topic modeling on the text after due preprocessing, vectorization etc.

In [28]:
def data2docs(data):
    """Split a downloaded Project Gutenberg book into paragraph documents.

    The text is split on blank lines, every chunk is stripped, and only the
    chunks strictly between the Gutenberg START and END marker paragraphs are
    returned, so the licence header and footer are excluded.

    Parameters
    ----------
    data : object with a ``text`` attribute
        Typically the ``requests.Response`` for the plain-text book.

    Returns
    -------
    pandas.Series
        Non-empty paragraphs of the book body, in reading order.

    Raises
    ------
    ValueError
        If the START or END marker paragraph cannot be found.
    """
    start_markers = (
        '*** START OF THIS PROJECT GUTENBERG EBOOK',
        '*** START OF THE PROJECT GUTENBERG EBOOK',
    )
    end_markers = (
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
        '*** END OF THE PROJECT GUTENBERG EBOOK',
    )

    # Normalise CRLF so the blank-line split works for either line-ending style.
    text = data.text.replace('\r\n', '\n')
    stripped = (chunk.strip() for chunk in text.split('\n\n'))
    # Filter after stripping: whitespace-only chunks must not become empty documents.
    paragraphs = [chunk for chunk in stripped if chunk]

    start = [i for i, chunk in enumerate(paragraphs) if chunk.startswith(start_markers)]
    end = [i for i, chunk in enumerate(paragraphs) if chunk.startswith(end_markers)]
    if not start or not end:
        raise ValueError(
            'Could not find the Project Gutenberg START/END markers in the text'
        )

    docs = pd.Series(paragraphs, dtype=object)
    # Half-open positional slice: skips the START marker and stops before the END marker.
    return docs.iloc[start[0] + 1: end[0]]

In [29]:
# Paragraph-level documents taken from between the Gutenberg START/END markers.
docs = data2docs(data)

In [30]:
def tokenizer(doc):
    """Turn a document into a list of lowercase lemmas.

    The text is split with a regular expression, every token is POS-tagged,
    and only tokens whose tag starts with J, N, V or R are kept. Each kept
    token is lowercased and lemmatized with a part-of-speech hint: 'a' for
    J tags, otherwise the lowercased first letter of the tag.
    """
    word_splitter = RegexpTokenizer(r'(?u)\b(\w(?:\w|\-)+)\b')
    tagged_words = nltk.pos_tag(word_splitter.tokenize(doc))

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in tagged_words:
        tag_initial = tag[0]
        if tag_initial not in 'JNVR':
            continue
        # The lemmatizer is given 'a' for J tags; other kept tags use their own initial.
        pos_hint = 'a' if tag_initial == 'J' else tag_initial.lower()
        lemmas.append(lemmatizer.lemmatize(word.lower(), pos_hint))

    return lemmas

In [41]:
# Bag-of-words topic model: term counts from the custom tokenizer feed an LDA
# model with five topics.
bow_pipeline = Pipeline(steps=[
    (
        "bow",
        CountVectorizer(
            stop_words=stopwords.words('english'),
            tokenizer=tokenizer,
        ),
    ),
    (
        "lda",
        LatentDirichletAllocation(
            n_components=5,            # number of topics
            learning_method='online',
            batch_size=128,
            max_iter=10,
            evaluate_every=-1,
            random_state=100,          # fixed seed for repeatable topics
            n_jobs=-1,
        ),
    ),
])

In [42]:
# Fit the count vectorizer and then the LDA model on the paragraph documents.
bow_pipeline.fit(docs)

  % sorted(inconsistent)


Pipeline(steps=[('bow',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function tokenizer at 0x7fcd9d236950>)),
                ('lda',
                 LatentDirichletAllocation(learning_method='online',
                                           n_components=5, n_jobs=-1,
                                           random_state=100))])

In [43]:
# Log-likelihood of the documents under the fitted BoW model (higher is better).
bow_log_likelihood = bow_pipeline.score(docs)
print("Log Likelihood: ", bow_log_likelihood)

# Perplexity on the same documents (lower is better).
bow_doc_term = bow_pipeline['bow'].transform(docs)
bow_perplexity = bow_pipeline['lda'].perplexity(bow_doc_term)
print("Perplexity: ", bow_perplexity)

Log Likelihood:  -146026.15554281248
Perplexity:  1978.534148551787


In [44]:
# TF-IDF topic model: same tokenizer and LDA settings as the BoW pipeline, but
# the LDA model receives TF-IDF weights instead of raw counts.
tfidf_pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            stop_words=stopwords.words('english'),
            tokenizer=tokenizer,
        ),
    ),
    (
        "lda",
        LatentDirichletAllocation(
            n_components=5,            # number of topics
            learning_method='online',
            batch_size=128,
            max_iter=10,
            evaluate_every=-1,
            random_state=100,          # fixed seed for repeatable topics
            n_jobs=-1,
        ),
    ),
])

In [45]:
# Fit the TF-IDF vectorizer and then the LDA model on the paragraph documents.
tfidf_pipeline.fit(docs)

  % sorted(inconsistent)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function tokenizer at 0x7fcd9d236950>)),
                ('lda',
                 LatentDirichletAllocation(learning_method='online',
                                           n_components=5, n_jobs=-1,
                                           random_state=100))])

In [47]:
# Log-likelihood of the documents under the fitted TF-IDF model (higher is better).
tfidf_log_likelihood = tfidf_pipeline.score(docs)
print("Log Likelihood: ", tfidf_log_likelihood)

# Perplexity on the same documents (lower is better).
tfidf_doc_term = tfidf_pipeline['tfidf'].transform(docs)
tfidf_perplexity = tfidf_pipeline['lda'].perplexity(tfidf_doc_term)
print("Perplexity: ", tfidf_perplexity)

Log Likelihood:  -25222.619218228218
Perplexity:  11524.04667413381


### 4. How many topics are there, what are they?

In [34]:
pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 6.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=ebdadb30f84ed31701b0271a91b5e5a7239e6b97dd108577e4a3c5c5d52dcc95
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1


In [39]:
import pyLDAvis
import pyLDAvis.sklearn

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Pull the fitted pieces out of the BoW pipeline before building the panel.
bow_lda = bow_pipeline['lda']
bow_counts = bow_pipeline['bow'].transform(docs)
bow_vectorizer = bow_pipeline['bow']

panel = pyLDAvis.sklearn.prepare(bow_lda, bow_counts, bow_vectorizer, mds='tsne')
panel

  by='saliency', ascending=False).head(R).drop('saliency', 1)


### 5. Which vectorization method performed better?

In [49]:
# One record per vectorization method, with the same two metrics for each
# fitted pipeline, shown side by side as a table.
scores = []
for label, fitted, vectorizer_step in (
    ('BoW', bow_pipeline, 'bow'),
    ('TF-IDF', tfidf_pipeline, 'tfidf'),
):
    scores.append({
        'method': label,
        'Log Likelihood': fitted.score(docs),
        'Perplexity': fitted['lda'].perplexity(fitted[vectorizer_step].transform(docs)),
    })
pd.DataFrame.from_records(scores)

Unnamed: 0,method,Log Likelihood,Perplexity
0,BoW,-146026.155543,1978.534149
1,TF-IDF,-25222.619218,11524.046674


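In terms of `Log Likelihood`, TF-IDF is better (higher value), but in terms of `Perplexity`, TF-IDF is worse (higher value). The two log-likelihoods are not directly comparable, though: scikit-learn's LDA score is a total over all term weights in the matrix, and TF-IDF weights are far smaller than raw counts. Perplexity normalizes by the total weight, so on that per-weight basis BoW performed better.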

### 6. Compare the results with different topic classification models.

In [None]:
def evaluate_method(model, name, docs):
    """Fit a TF-IDF + topic-model pipeline and report its fit metrics.

    Parameters
    ----------
    model : estimator
        Unfitted topic model used as the final pipeline step.
    name : str
        Label stored under the 'method' key of the result.
    docs : iterable of str
        Raw text documents to fit and evaluate on.

    Returns
    -------
    dict
        Keys 'method', 'Log Likelihood' and 'Perplexity'. A metric is NaN when
        the model does not provide the corresponding method, so models without
        ``score`` or ``perplexity`` can still appear in the comparison.
    """
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(
            tokenizer=tokenizer,
            stop_words=stopwords.words('english')
        )),
        # This step name must match the lookup below.
        ('topic_model', model),
    ])
    pipeline.fit(docs)

    topic_model = pipeline['topic_model']
    doc_term = pipeline['vectorizer'].transform(docs)
    missing = float('nan')

    log_likelihood = pipeline.score(docs) if hasattr(topic_model, 'score') else missing
    perplexity = (
        topic_model.perplexity(doc_term)
        if hasattr(topic_model, 'perplexity')
        else missing
    )
    return {
        'method': name,
        'Log Likelihood': log_likelihood,
        'Perplexity': perplexity,
    }

In [None]:
def get_coherence(model, texts, common_dictionary):
    """Return the 'c_v' coherence score reported by a CoherenceModel.

    Parameters
    ----------
    model
        Topic model passed to CoherenceModel as ``model``.
    texts
        Passed to CoherenceModel as ``texts``.
    common_dictionary
        Passed to CoherenceModel as ``dictionary``.

    Returns
    -------
    Whatever ``CoherenceModel.get_coherence()`` returns.
    """
    # NOTE(review): CoherenceModel is not imported anywhere in the visible notebook;
    # presumably it is gensim.models.CoherenceModel -- import it before calling this.
    coherence_model = CoherenceModel(
        model=model,
        texts=texts,
        dictionary=common_dictionary,
        coherence='c_v',
    )
    return coherence_model.get_coherence()

In [52]:
# Document-topic matrix from the BoW pipeline: one row per document, one column
# per LDA topic (n_components=5).
bow_pipeline.transform(docs)

array([[0.01334156, 0.0133791 , 0.01334086, 0.94655815, 0.01338033],
       [0.05012608, 0.05008479, 0.05174305, 0.79798444, 0.05006164],
       [0.06711587, 0.06671409, 0.73164821, 0.0678525 , 0.06666932],
       ...,
       [0.03336015, 0.03339981, 0.86652123, 0.03338403, 0.03333478],
       [0.0400085 , 0.8396376 , 0.0401096 , 0.04024326, 0.04000104],
       [0.02224591, 0.02222591, 0.91099972, 0.02230538, 0.02222308]])

### 7. (optional) use random search or grid search to optimize the number of topics.

In [None]:
# Document-term counts from the already-fitted BoW vectorizer, built here so the
# cell runs on a fresh kernel without relying on a leftover variable.
data_vectorized = bow_pipeline['bow'].transform(docs)

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model (seeded like the other LDA models so the search is repeatable)
lda = LatentDirichletAllocation(random_state=100)

# Init Grid Search Class; with no `scoring` given, candidates are ranked by the
# estimator's own score method.
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

### 8. Upload your notebook to project2 branch of the course github repo