In [22]:
#Goal - Implement and Evaluate the LSI NLP model, using the CRISP-DM Process

#LSI Overview: Latent Semantic Indexing (LSI), also known as Latent Semantic Analysis (LSA), is a technique in
#natural language processing for analyzing relationships between a set of documents and the terms they contain
#by producing a set of concepts related to the documents and terms.
#i.e. LSI assumes that words that are close in meaning will occur in similar pieces of text

#import libraries for data structures and the Gensim LSI (LsiModel) API
import os
import tempfile
# Look up the system temporary directory; the message below announces it as the place
# for the temporary dictionary and corpus.
TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))
import numpy as np
import pandas as pd
import gensim #open-source machine learning framework
from gensim import corpora
from gensim import models
# Text-cleaning helpers that preprocess_text applies to every document.
from gensim.parsing.preprocessing import strip_non_alphanum 
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation


Folder "C:\Users\Bergmann\AppData\Local\Temp" will be used to save temporary dictionary and corpus.


In [35]:
#CRISP-DM Task: Data Preparation 
#import favorite text dataset for analysis 
def read_text(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    A fixed banner is printed before the file is opened. Each returned element is one
    line of the file with its trailing newline (if any) still attached; an empty file
    yields an empty list.

    Args:
        path: location of the text file to read.

    Returns:
        list of str, one entry per line in file order.
    """
    print("Python File I/O Example - text Read")
    with open(path, "r", encoding='utf-8') as handle:
        # Iterating a text file yields exactly the lines readlines() would collect.
        return list(handle)

print('CRISP-DM Task: Data Preparation')
print('Task 1: Read-in a text-based document, aka "establishing the corpus')
# A raw string keeps backslashes literal, so each path separator is written only once;
# doubling them inside r"..." would put two separators into the actual path.
documents = read_text(r"C:\Python\Data\Text8") #single-line text
# The corpus is a single (potentially very long) line, so show only a short preview
# instead of dumping the whole text, and do not index into an empty result.
DOCUMENT_PREVIEW_CHARS = 500
print(documents[0][:DOCUMENT_PREVIEW_CHARS] if documents else '(no text was read)')

CRISP-DM Task: Data Preparation
Task 1: Read-in a text-based document, aka "establishing the corpus
Python File I/O Example - text Read


TypeError: 'format' is an invalid keyword argument for this function

In [24]:
# Announce the preprocessing stage (Task 2) and its first sub-step (Task 2a), one banner per line.
print('Task 2: Preprocessing dataset, including stoplist, word frequencies & filters',
      'Task 2a: Remove punctuation, non-alphanumeric and numeric characters',
      sep='\n')
#preprocess data for use in text mining/NLP
def preprocess_text(corpus=None):
    """Clean every document of a corpus and return the cleaned documents as a new list.

    Each document first loses its leading and trailing newline characters and is then
    passed, in this order, through gensim's ``strip_punctuation``, ``strip_non_alphanum``
    and ``strip_numeric`` helpers.

    The input is never modified: results are collected in a fresh list, so the caller's
    raw text stays available and re-running the step always starts from the same data.

    Args:
        corpus: iterable of document strings. ``None`` (the default) is treated as an
            empty corpus; a ``None`` sentinel is used instead of a shared mutable ``[]``
            default.

    Returns:
        list of str: one cleaned string per input document, in the original order.
    """
    print("Preprocessing Corpus from list data structure")
    if corpus is None:
        return []
    cleaned = []
    for document in corpus:  # iterate directly; no index is needed
        document = document.strip('\n')
        document = strip_punctuation(document)
        document = strip_non_alphanum(document)
        document = strip_numeric(document)
        cleaned.append(document)
    return cleaned

# Hand over a shallow copy so `documents` keeps the text exactly as it was read,
# whatever the cleaning helper does with the list it receives.
raw_corpus = preprocess_text(list(documents))

Task 2: Preprocessing dataset, including stoplist, word frequencies & filters
Task 2a: Remove punctuation, non-alphanumeric and numeric characters
Preprocessing Corpus from list data structure


In [25]:
print('Task 2b: Remove words in stoplist and Lowercase each document')
# Alternative, longer stoplist kept for reference:
#stoplist = set('for a of the and to in i they it my me that have with are was is t s ve he re is'.split())
# Words in this set are removed from every document.
stoplist = set('for a of the and to in i they it my me that have with are was'.split())
# Lowercase each document, split it on whitespace, then keep only the words
# that are not in the stoplist.
texts = [
    [term for term in words if term not in stoplist]
    for words in (entry.lower().split() for entry in raw_corpus)
]

Task 2b: Remove words in stoplist and Lowercase each document


In [26]:
print('Task 2c: create a list list of non-distinct parsed words from doc') 
# Count word frequencies
# Counter is the standard-library tally type; defaultdict stays imported in case
# other cells rely on it (TODO confirm and drop if unused).
from collections import Counter, defaultdict
# One pass over every token of every document; looking up a word that never occurred
# yields 0, exactly as the defaultdict(int) tally did.
frequency = Counter(token for text in texts for token in text)

Task 2c: create a list list of non-distinct parsed words from doc


In [27]:
print('Task 2d: Only keep words that appear more than once')
# For every document keep, in order, only the words whose corpus-wide count exceeds 1.
processed_corpus = [
    [word for word in doc_tokens if frequency[word] > 1]
    for doc_tokens in texts
]
#print(processed_corpus) #long list of distinct words

Task 2d: Only keep words that appear more than once


In [None]:
#data understanding
# processed_corpus is a plain Python list of token lists, which has no .info() method,
# so summarise its size directly instead.
tokens_per_document = [len(doc_tokens) for doc_tokens in processed_corpus]
distinct_tokens = {tok for doc_tokens in processed_corpus for tok in doc_tokens}
print('Documents in corpus: {}'.format(len(tokens_per_document)))
print('Tokens kept after filtering: {}'.format(sum(tokens_per_document)))
print('Distinct tokens: {}'.format(len(distinct_tokens)))

In [28]:
print('CRISP-DM Task: Model Building')
print('Task 1: Transform Data - Create dictionary/term-document matrix')
#associate each word in the processed corpus with a unique integer ID, using the gensim.corpora.Dictionary class. 
#This dictionary defines the vocabulary of all words that our processing knows about.
dictionary = corpora.Dictionary(processed_corpus)
# Print a bounded sample of the token -> id mapping: a small vocabulary is shown in full,
# a large one is truncated so the cell output stays readable.
VOCAB_PREVIEW = 20
vocab_items = list(dictionary.token2id.items())
print(dict(vocab_items[:VOCAB_PREVIEW]))
if len(vocab_items) > VOCAB_PREVIEW:
    print('... showing {} of {} tokens'.format(VOCAB_PREVIEW, len(vocab_items)))

Task 3: Transform Data - Create dictionary/term-document matrix
{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [29]:
# Inferring latent structure requires a representation of each document that can be
# manipulated mathematically; here every document becomes a bag-of-words vector.
print('Task 2: convert training document by vectorizing processed corpus into "bag-of-words" vectors,'
      'using dictionary data structure')
# Apply the dictionary's doc2bow to each token list, keeping document order.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

CRISP-DM Task: Model Building
Task 1: convert training document by vectorizing processed corpus into "bag-of-words" vectors,using dictionary data structure


In [30]:
print('Task 3: Train Model by Applying NLP methodology (LSI Model) to vectorized "bag of words" corpus')
# LSI is used in a variety of information retrieval and text processing applications;
# its primary application has been concept searching and automated document categorization.
# Initialize and train the model on the vectorized corpus, asking for 10 topics.
modelLsi = models.LsiModel(corpus=bow_corpus, num_topics=10, id2word=dictionary)
print('output Lsi Training Model')
# Passing -1 requests every topic; each one is printed as a weighted combination of words.
print(modelLsi.print_topics(num_topics=-1))

Task 2: Train Model by Applying NLP methodology (LSI Model) to vectorized "bag of words" corpus
output Lsi Training Model
[(0, '0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"'), (1, '0.623*"graph" + 0.490*"trees" + 0.451*"minors" + 0.274*"survey" + -0.167*"system" + -0.141*"eps" + -0.113*"human" + 0.107*"response" + 0.107*"time" + -0.072*"interface"'), (2, '0.426*"response" + 0.426*"time" + -0.361*"system" + 0.338*"user" + -0.330*"eps" + -0.289*"human" + -0.231*"trees" + -0.223*"graph" + 0.178*"survey" + 0.164*"computer"'), (3, '0.595*"computer" + 0.552*"interface" + 0.415*"human" + -0.333*"system" + -0.188*"eps" + -0.099*"user" + -0.074*"time" + -0.074*"response" + 0.032*"survey" + -0.025*"trees"'), (4, '0.594*"trees" + -0.537*"survey" + 0.332*"user" + -0.300*"minors" + 0.282*"interface" + -0.159*"system" + 0.115*"eps" + -0.107*"computer" + -0.106*"human" + 0.080*"res

In [31]:
print('CRISP-DM Task: Model Evaluation')
# The key question for text analytics is interpretability - does the result make sense?
print('Task 1: Test model by creating a topic via a Python list of keywords, then vectorize into a "bag of words" vector')
new_doc = "human computer interaction"
#new_doc = "branch bank service"
print('Test Theme: ', new_doc, sep='')
# Lowercase and whitespace-split the test phrase, then map it onto the training dictionary.
new_vec = dictionary.doc2bow(new_doc.lower().split())
# Indexing the trained model with the bag-of-words vector transforms the "new" document
# into LSI space; the printed result is a list of (topic id, weight) pairs.
modelLsi_test = modelLsi[new_vec]
print(modelLsi_test) # if the model isn't high quality, continue to iterate

print('Task 2: formal tests of model accuracy')
# include if found - can also be done in Dataiku

print('CRISP-DM Task: Model Deployment')
# gensim contains the ability to save and update models with future iterations
# lecture is TBD - Pair with Data-Scikit & Dataiku

CRISP-DM Task: Model Evaluation
Task 1: Test model by creating a topic via a Python list of keywords, then vectorize into a "bag of words" vector
Test Theme: human computer interaction
[(0, 0.46182100453271607), (1, -0.070027665279000589), (2, -0.12452907551899114), (3, 1.0097125584438551), (4, -0.21303040605626802), (5, -0.59593845338206597), (6, -0.22041753546094417), (7, -0.0018778773554750036), (8, 0.085766854949955507)]
Task 2: formal tests of model accuracy
CRISP-DM Task: Model Deployment
