In [1]:
#Gensim Docs: http://pandas.pydata.org/pandas-docs/stable/
#Description: https://en.wikipedia.org/wiki/Word2vec
#Tutorials/Code Example:  https://github.com/RaRe-Technologies/gensim/blob/develop/gensim%20Quick%20Start.ipynb
#Library Purpose: Gensim Word2vec is a two-layer neural net that processes text.
#Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus
#it can be applied just as well to genes, code, likes, playlists, social media graphs and 
#other verbal or symbolic series in which patterns may be discerned.

"""Basic word2vec example."""
#open-source achine learning framework
import gensim 
from gensim import corpora
from gensim import models
from gensim.test.utils import get_tmpfile
#data pre-processing tools from gensim package
from gensim.parsing.preprocessing import preprocess_string 
from gensim.parsing.preprocessing import remove_stopwords 
from gensim.parsing.preprocessing import strip_non_alphanum 
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text #transform to lowercase then stem
from gensim.summarization import summarize
from gensim.summarization import keywords
#other packages
from collections import defaultdict #high performace dictionary - Python Standard Library

#import libraries for data structures and Gensim Word2Vec API
import os
import tempfile
TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

#improved data structures - numpy & pandas
import numpy as np
import pandas as pd

#import scikit-learn & other graphical libraries
# Any results you write to the current directory are saved as output.
from sklearn import preprocessing #data prep - module includes scaling, centering, normalization, binarization and imputation methods.
from sklearn.feature_extraction import text #used for removing stop words and obtaining feature extraction from text
import matplotlib.pyplot as plt
import seaborn as sns #Seaborn is a Python data visualization library based on matplotlib. 
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

#import sklearn libraries for NLP prep, model bilding & validation steps - for use in Dataiku
from sklearn.metrics import accuracy_score  #used for model evaluation - https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
from sklearn.feature_extraction.text import CountVectorizer  #bag-of-words vectorication for LDA model
from sklearn.decomposition import LatentDirichletAllocation #model for NLP topic extraction, similar to gensim LDA
from sklearn.datasets import make_multilabel_classification #create random test dataset
from sklearn.model_selection import train_test_split



Folder "C:\Users\JTBVEN~1\AppData\Local\Temp" will be used to save temporary dictionary and corpus.


In [2]:
#global variable that work as model parameters - adjust for model performance "fine-tuning"
n_samples = 2000 #sample size
n_features = 1000 #name/entity recgonition & group selection (vectors)
n_components = 5 #themes
n_top_words = 10 #words per theme

In [3]:
#open python standard data stream to read lins into a list data structure.  
#with open("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8", "r") as f:
#    raw_corpus = []
#    for item in f:
#        raw_corpus.append(item) #import all lines of text into python list
#print(raw_corpus)

#CRISP-DM Task: Data Preparation 
#import favorite text-based dataset for analysis using pandas dataframe - compatible w/scikit-learn
def read_text(path):
    print("Pandas File I/O Example - csv read")
    text=pd.read_csv(path) #import to pandas DataFrame
    return text #return pandas dataframe type

print('CRISP-DM Task: Data Preparation')
print('Task 1: Read-in a text-based document, aka "establishing the corpus')
documents = read_text("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8") #Relative path - to tabular csv file 
print(documents.head())

CRISP-DM Task: Data Preparation
Task 1: Read-in a text-based document, aka "establishing the corpus
Pandas File I/O Example - csv read
                                                text  Unnamed: 1
0  MY BANK is always good to me I have banked wit...         NaN
1  MY BANK is the best for me They help people wh...         NaN
2  MY BANK has been 100 percent on top on any ban...         NaN
3  Absolutely no problems with them Everything ha...         NaN
4  Absolutely They are efficient courteous and he...         NaN


In [5]:
#Data Understanding - Document Summarization
#create string of text, for gensim document summarization function
with open("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8") as f:
    raw_corpus2 = f.read() + '\n' # add trailing new line character

print("Document Summarization")
print(summarize(raw_corpus2, split=True)) #summarize entire text document. 

Document Summarization
["Actually we've never had any problems with them since we moved here 15 years ago We've never had any problems They've been very helpful We've opened accounts with them and they've led us in a good direction As long as I don't have any problem with them they are fine They seem to know what they are doing and I haven't had any banking problems with them of any kind", "All the tellers when I walk in they greet you call you by name they treat you like a family member or friend and not a number Every time I go in there they are able to help me If they aren't able to the branch manager helps me They are all good They are all dressed nice I was having problems several months back with one of my accounts so I sat down and they called the main branch and took the time to do that That wasn't the only thing I had to ask questions about transferring money from an IRA just very helpful I'm going to say the manager Cari Wells is very helpful very welcoming She treats people 

In [6]:
#Data Understanding - Keyword Analysis
print("Keyword Analysis")
print(keywords(raw_corpus2)) #display keywords in document

Keyword Analysis
bank
banked
banking
banks
good
like
liked
likely
branches
personable
person
personalized
personally
personalities
personality
personalization
account
friendliness
friendly
services
serviceable
serviced
help people
excellent service
branch manager helps
customer
customers
checking
check
checks
checked
opened accounts
helpful
helped
helping
helpfulness
tellers
teller
nice
nicely
personal attention
problems
problem
time
times
timely
things
thing
deposit
deposits
deposited
depositing
great
feel
feels
feeling
new
managers
manage
management
business
busy
businesses
right
rights
different
difference
got
open
opening
cards
card
best
work
worked
working
works
loan
loans
loaned
need
needs
needed
transferring money
know
knows
knowing
way
ways
home
hours
hour
actually
actual
called
calls
calling
job
jobs
went
online
answered
answer
answers
answering
fees
fee
care
caring
careful
experience
experiences
located
location
locations
took
wrong
overall atmosphere
atm
atms
convenient
conv

In [7]:
'''Data Prep - Clean-up raw corpus for analysis
def preprocess_text(corpus):
    i = 0
    corpus2 = []
    for corpu in corpus:  #iterate through rows in dataframe
        line = strip_punctuation(corpu)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(corpu)
        line = remove_stopwords(line)
        corpus2.append(line)
    return corpus2 

#apply preprocessing function to "raw" text corpus for a "cleaned" corpus, to use in vectorization
clean_corpus = preprocess_text(raw_corpus) 

stoplist = set('i it\'s'.split()) #remove extraneous words from analysis

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in clean_corpus ]

#Long list of non-distinct parsed words from doc 
print(texts[992]) #comment 2
    ''' 

#preprocess data for use in text mining/NLP - refactored for pandas dataframe
def preprocess_text(corpus,field_name = 'text'):
    print("Preprocessing Corpus from pandas data frame")
    for index, row in corpus.iterrows():  #iterate through rows in dataframe
        line = row[field_name].strip('\n')
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(line)
        #line = strip_short(line)
        #add cleaned text line to new dataframe
        corpus.at[index,field_name] = line #set value at row/column in corpus dataframet            
    return corpus

print('Task 2: Preprocessing dataset, including stoplist, word frequencies & filters')
print('Task 2a: Remove punctuation, non-alphanumeric and numeric characters')
#apply preprocessing function to pandas df text field "comment" to create a "raw" text corpus
raw_corpus = preprocess_text(documents,field_name = 'text') 

Task 2: Preprocessing dataset, including stoplist, word frequencies & filters
Task 2a: Remove punctuation, non-alphanumeric and numeric characters
Preprocessing Corpus from pandas data frame


In [8]:
print('Task 2b: remove english stopwords and add additional to remove from text document')
#set stopword list - see here for set of english "stop words": https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py
#scikit-learn uses a stoplist "frozenset" - immutable python set - "ENGLISH_STOP_WORDS" 
#use standard english stop words, along with aditionally defined in "Union Statement"
stop_words = text.ENGLISH_STOP_WORDS.union({"have", "with", "are"}) 
print(stop_words)

print('Task 2b: create a BOW vector for use with Latent-Dirichlet-Allocation (LDA) Models, using assigned stop words')
#Note: In Gensim Word2Vec, this is known as "Similarity Queues" - https://radimrehurek.com/gensim/tut3.html
#Bag-of-words is one choice - Gensim includes: Matrix-Market, LSI,SVMlight, LDA-C, GivvsLDA++, etc

#Convert a collection of text documents to a matrix of token counts - "bag-of-words", unless otherwise specified
bow_vector = CountVectorizer(stop_words=stop_words)

#"Comment" column is the 6th column in the dataset - index "5" in dataframe
value_list = [row[0] for row in raw_corpus.itertuples(index=False, name=None)]
#print(value_list[0:3])
#create term-document matrix and and place all relevant terms in vocabulary/dictionary
bow = bow_vector.fit_transform(value_list)
#dictionary stored in the vocabulary_ variable of the bow object 
print(bow_vector.vocabulary_)

Task 2b: remove english stopwords and add additional to remove from text document
frozenset({'empty', 'per', 'we', 'hers', 'them', 'for', 'sixty', 'although', 'keep', 'until', 'with', 'those', 'she', 'never', 'elsewhere', 'about', 'please', 'me', 'would', 'describe', 'many', 'after', 'someone', 'then', 'de', 'anyway', 'that', 'they', 'may', 'around', 'which', 'most', 'also', 'by', 'often', 'some', 'get', 'along', 'is', 'everywhere', 'seeming', 're', 'mostly', 'afterwards', 'detail', 'once', 'fifty', 'to', 'none', 'mine', 'where', 'other', 'nobody', 'against', 'cannot', 'much', 'amoungst', 'will', 'whereupon', 'whither', 'something', 'upon', 'than', 'not', 'has', 'everything', 'further', 'un', 'go', 'either', 'had', 'must', 'back', 'one', 'already', 'anything', 'thereafter', 'everyone', 'there', 'anyone', 'only', 'well', 'fire', 'himself', 'into', 'thru', 'myself', 'wherein', 'became', 'herein', 'formerly', 'a', 'found', 'next', 'an', 'wherever', 'ourselves', 'move', 'name', 'could', 't

In [9]:
# Data Prep - Count word frequencies
'''
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

#Remove Words that only appear once
processed_corpus = [
     [token for token in text if frequency[token] > 5]
     for text in texts
]
#print(processed_corpus) '''

print('CRISP-DM Task: Data Understanding')
print('Task 1: print information about bow verctor and/or corpus')
counts = np.asarray(bow.sum(axis=0)) 
count_words = counts[0]

keyword = 'saved' #keyword used for freq lookup

print("Keyword Freqency: " + keyword)
print(bow_vector.vocabulary_[keyword]) #get dictionary index of "get" keyword'
freq = count_words[109] #get word count of all terms in dictionary, using retured "get" word id - ex atm = 109
print(freq) #print wordcount frequency

CRISP-DM Task: Data Understanding
Task 1: print information about bow verctor and/or corpus
Keyword Freqency: saved
1585
29


In [10]:
#Data Understanding - Count the occurance of distinct words in each line
print('Task 2: use pandas and data viz libraries to explore & understand the columns and values in the text dataset')
print(documents.head())  #get first 5 observations in pandas dataframe
# Check data types for each variable
print(documents.info())
#assess data quailty - null values 
print(documents.isnull().sum())
#describe dataset values 
print(documents.describe())
#view histograp of categorical variables 
#summarize & plot pandas column using "groupby" function
summary = documents.groupby(['text'])["Index"].count().reset_index(name="count")
print(summary)
y = summary['count']
x = summary['text']   #iterate list to transfor dates for graphical use
data = pd.DataFrame({'Freq':y, 'text':x}).set_index(x) 
data.plot(kind='bar')
#plt.show()  #uncomment to show - otherwise holds execution until closed

Task 2: use pandas and data viz libraries to explore & understand the columns and values in the text dataset
                                                text  Unnamed: 1
0  MY BANK is always good to me I have banked wit...         NaN
1  MY BANK is the best for me They help people wh...         NaN
2  MY BANK has been percent on top on any bank Th...         NaN
3  Absolutely no problems with them Everything ha...         NaN
4  Absolutely They are efficient courteous and he...         NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193 entries, 0 to 1192
Data columns (total 2 columns):
text          1193 non-null object
Unnamed: 1    0 non-null float64
dtypes: float64(1), object(1)
memory usage: 18.7+ KB
None
text             0
Unnamed: 1    1193
dtype: int64
       Unnamed: 1
count         0.0
mean          NaN
std           NaN
min           NaN
25%           NaN
50%           NaN
75%           NaN
max           NaN


KeyError: 'Column not found: Index'

In [None]:
#Data Prep - To infer the latent structure in our corpus we need a way to represent documents
#that we can manipulate mathematically. One approach is to represent each document as a vector. 
#convert document into "Bag of works"
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
#print(bow_corpus)

In [None]:
#Data Exploration - translate corpus to Matrix/Market format, then explore using lsi model
#first, transform bag-of-words corpus to lsi model
output_fname = 'corpus_mm'
corpora.MmCorpus.serialize(output_fname , bow_corpus )  #tmp filename, corpus
mm = corpora.MmCorpus(output_fname)

#second, apply lsi model for 2-dimensional exploration - defines relative relationship of words to eachother
lsi = models.LsiModel(mm, id2word=dictionary, num_topics=5)
#doc = "branch bank hours aren't convenient"
doc = "kittens hate chocolate"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] 
print("LSI Model")
print(vec_lsi)

#Data Exploration - similaritiy queries - Compute cosine similarity against a corpus of documents by storing the index matrix in memory.
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(bow_corpus, 5) # transform corpus to LSI space and index it - corpus, # of features from dictionary
sims = index[vec_bow]
print("Similarity Queue")
print(sims)

In [76]:
#Now that we have vectorized our corpus we can begin to transform it using models. 
#We use model as an abstract term referring to a transformation from one document representation to another. 
#In gensim, documents are represented as vectors so a model can be thought of as a transformation between two vector spaces. 
#The details of this transformation are learned from the training corpus. '''

#One simple example of a model is tf-idf. The tf-idf model transforms vectors from the 
#bag-of-words representation to a vector space, where the frequency counts are weighted according 
#to the relative rarity of each word in the corpus.

# train the model using tf-idf model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
tfidf2 = tfidf[dictionary.doc2bow("ATM deposit".lower().split())]
print(tfidf2) #first entry is the token ID and the second entry is the tf-idf weighting


[(244, 0.6259802553579925), (402, 0.7798389063787101)]


In [None]:
#create lda model from processed corpus/tfidf transformation 
#
corpus_tfidf = tfidf[bow_corpus]
lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=10)
print(lda)
#Print Topics produced by LDA model
for i in range(0, lda.num_topics-1):
    print("topic " + str(i) + ":" + lda.print_topic(i))    

In [None]:
#Model Evaluation - LDA topic Coherence - Closer to 0, the better
#for additional measures, see: https://radimrehurek.com/gensim/models/coherencemodel.html
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(model=lda, corpus=corpus_tfidf, coherence='u_mass')
print(cm.get_coherence())  # get coherence value

In [27]:
#compare vs. topics generated by LSI model
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(10)

[(0,
  '-0.260*"they" + -0.221*"service" + -0.212*"i\'ve" + -0.207*"friendly" + -0.205*"they\'re" + -0.196*"good" + -0.189*"bank" + -0.186*"because" + -0.185*"know" + -0.183*"like"'),
 (1,
  '-0.556*"service" + -0.453*"customer" + -0.221*"excellent" + -0.211*"good" + 0.155*"account" + 0.155*"bank" + 0.128*"i\'ve" + 0.120*"like" + 0.113*"my" + -0.111*"great"'),
 (2,
  '0.586*"they\'re" + 0.244*"friendly" + 0.241*"nice" + -0.237*"customer" + -0.227*"service" + 0.185*"helpful" + -0.173*"account" + -0.156*"bank" + -0.135*"my" + -0.121*"satisfied"'),
 (3,
  '-0.461*"i\'ve" + -0.386*"because" + -0.297*"problems" + -0.218*"years" + -0.200*"satisfied" + 0.193*"the" + 0.182*"account" + -0.149*"they\'ve" + 0.143*"they" + 0.130*"money"'),
 (4,
  '0.382*"they\'re" + -0.358*"because" + -0.255*"help" + 0.233*"problems" + -0.211*"treat" + -0.203*"way" + -0.201*"like" + 0.172*"i\'ve" + -0.165*"know" + 0.159*"account"'),
 (5,
  '-0.448*"problems" + 0.324*"they\'re" + 0.241*"best" + -0.228*"answer" + 0.

In [None]:
#other available transformations located at: https://radimrehurek.com/gensim/tut2.html
print('CRISP-DM Task: Model Building')
print("Fitting LDA models with n_topic=%d, n_samples=%d and n_features=%d..." % (n_components, n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)  #scikit-learn library
lda.fit(bow)
print("\nTopics in LDA model:")
bow_feature_names = bow_vector.get_feature_names()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

print_top_words(lda, bow_feature_names, n_top_words) #note the improvement over last week's model!

In [None]:
print('CRISP-DM Task: Model Evaluation ')
print('Step1 : get test topics then score them vs. current LDA model')
train, test = train_test_split(list(documents['text'].values), test_size = 0.2)
test_vector = bow_vector.fit_transform(test)
test_lda = lda.fit(test_vector)  #fit text vector within existing model - transpose to work
#Calculate approximate log-likelihood as score.
print(test_lda.score(test_vector)) #not meaningful in itself - compare vs. re-run models (closer to 0, the better)

In [None]:
print('Step 2: Use formal model evaluation stats, such as "perplexity" from scikit-learn library') 
# create test/train text documents to evaluate model 
vectoriser = CountVectorizer(stop_words = 'english', max_features=500)  #max features must be less that "n_features" variable!
doc_train = vectoriser.fit_transform(train)
features = vectoriser.get_feature_names()
doc_test = vectoriser.fit_transform(test)
news_lda = lda.fit(doc_train)
print(news_lda.perplexity(doc_test)) # lower the perplexity, the better