In [48]:
#Gensim Docs: http://pandas.pydata.org/pandas-docs/stable/
#Description: https://en.wikipedia.org/wiki/Word2vec
#Tutorials/Code Example:  https://github.com/RaRe-Technologies/gensim/blob/develop/gensim%20Quick%20Start.ipynb
#Library Purpose: Gensim Word2vec is a two-layer neural net that processes text.
#Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus
#it can be applied just as well to genes, code, likes, playlists, social media graphs and 
#other verbal or symbolic series in which patterns may be discerned.

"""Basic word2vec example."""
#open-source achine learning framework
import gensim 
from gensim import corpora
from gensim import models
from gensim.test.utils import get_tmpfile
#data pre-processing tools from gensim package
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum 
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text #transform to lowercase then stem
from gensim.summarization import summarize
from gensim.summarization import keywords
#other packages
from collections import defaultdict #high performace dictionary - Python Standard Library

In [45]:
#open python standard data stream to read lins into a list data structure.  
with open("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8", "r") as f:
    raw_corpus = []
    for item in f:
        raw_corpus.append(item) #import all lines of text into python list
#print(raw_corpus)

Actually we've never had any problems with them since we moved here 15 years ago We've never had any problems They've been very helpful We've opened accounts with them and they've led us in a good direction As long as I don't have any problem with them they are fine They seem to know what they are doing and I haven't had any banking problems with them of any kind
All the tellers when I walk in they greet you call you by name they treat you like a family member or friend and not a number Every time I go in there they are able to help me If they aren't able to the branch manager helps me They are all good They are all dressed nice I was having problems several months back with one of my accounts so I sat down and they called the main branch and took the time to do that That wasn't the only thing I had to ask questions about transferring money from an IRA just very helpful I'm going to say the manager Cari Wells is very helpful very welcoming She treats people with respect and very helpfu

In [47]:
#Data Understanding - Document Summarization
#create string of text, for gensim document summarization function
with open("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8") as f:
    raw_corpus2 = f.read() + '\n' # add trailing new line character

print("Document Summarization")
print(summarize(raw_corpus2, split=True)) #summarize entire text document. 

Document Summarization
["Actually we've never had any problems with them since we moved here 15 years ago We've never had any problems They've been very helpful We've opened accounts with them and they've led us in a good direction As long as I don't have any problem with them they are fine They seem to know what they are doing and I haven't had any banking problems with them of any kind", "All the tellers when I walk in they greet you call you by name they treat you like a family member or friend and not a number Every time I go in there they are able to help me If they aren't able to the branch manager helps me They are all good They are all dressed nice I was having problems several months back with one of my accounts so I sat down and they called the main branch and took the time to do that That wasn't the only thing I had to ask questions about transferring money from an IRA just very helpful I'm going to say the manager Cari Wells is very helpful very welcoming She treats people 

NameError: name 'keywords' is not defined

In [49]:
#Data Understanding - Keyword Analysis
print("Keyword Analysis")
print(keywords(raw_corpus2)) #display keywords in document

Keyword Analysis
bank
banked
banking
banks
good
like
liked
likely
branches
personable
person
personalized
personally
personalities
personality
personalization
account
friendliness
friendly
services
serviceable
serviced
help people
excellent service
branch manager helps
customer
customers
checking
check
checks
checked
opened accounts
helpful
helped
helping
helpfulness
tellers
teller
nice
nicely
personal attention
problems
problem
time
times
timely
things
thing
deposit
deposits
deposited
depositing
great
feel
feels
feeling
new
managers
manage
management
business
busy
businesses
right
rights
different
difference
got
open
opening
cards
card
best
work
worked
working
works
loan
loans
loaned
need
needs
needed
transferring money
know
knows
knowing
way
ways
home
hours
hour
actually
actual
called
calls
calling
job
jobs
went
online
answered
answer
answers
answering
fees
fee
care
caring
careful
experience
experiences
located
location
locations
took
wrong
overall atmosphere
atm
atms
convenient
conv

In [50]:
# Data Prep - Clean-up raw corpus for analysis
def preprocess_text(corpus):
    i = 0
    corpus2 = []
    for corpu in corpus:  #iterate through rows in dataframe
        line = strip_punctuation(corpu)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(corpu)
        line = remove_stopwords(line)
        corpus2.append(line)
    return corpus2

#apply preprocessing function to "raw" text corpus for a "cleaned" corpus, to use in vectorization
clean_corpus = preprocess_text(raw_corpus) 

stoplist = set('i it\'s'.split()) #remove extraneous words from analysis

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in clean_corpus ]

#Long list of non-distinct parsed words from doc 
print(texts[2]) #comment 2

['my', 'bank', '100', 'percent', 'bank', "they've", 'absolutely', 'amazing', 'tellers', 'way', 'tellers', 'absolutely', 'amazing', 'they', 'provide', 'excellent', 'service', 'time', 'walk', 'time', 'leave', "we're", 'satisfied', 'need', "i've", "i've", 'gotten', 'better', 'service', 'personal', 'attention', 'way', 'my', 'bank']


In [23]:
# Data Prep - Count word frequencies
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

#Remove Words that only appear once
processed_corpus = [
     [token for token in text if frequency[token] > 1]
     for text in texts
]
#print(processed_corpus)

In [8]:
# Data Prep - associate each word in the processed corpus with a unique integer ID, using the gensim.corpora.Dictionary class. 
#This dictionary defines the vocabulary of all words that our processing knows about.
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary.token2id)

{'bank': 0, 'banked': 1, 'dealt': 2, 'everything': 3, 'family': 4, 'good': 5, 'it': 6, 'like': 7, 'old': 8, 'personable': 9, 'years': 10, 'you': 11, 'because': 12, 'best': 13, 'help': 14, 'my': 15, 'need': 16, 'people': 17, 'they': 18, '100': 19, 'absolutely': 20, 'amazing': 21, 'attention': 22, 'better': 23, 'excellent': 24, 'gotten': 25, "i've": 26, 'leave': 27, 'percent': 28, 'personal': 29, 'provide': 30, 'satisfied': 31, 'service': 32, 'tellers': 33, "they've": 34, 'time': 35, 'walk': 36, 'way': 37, "we're": 38, 'great': 39, 'happy': 40, 'no': 41, 'problems': 42, 'trouble': 43, 'very': 44, 'whatsoever': 45, 'courteous': 46, 'efficient': 47, 'helpful': 48, 'actually': 49, 'friendliness': 50, '15': 51, 'accounts': 52, 'ago': 53, 'as': 54, 'banking': 55, "don't": 56, 'fine': 57, "haven't": 58, 'kind': 59, 'know': 60, 'long': 61, 'moved': 62, 'opened': 63, 'problem': 64, "we've": 65, 'all': 66, 'answered': 67, 'questions': 68, 'makes': 69, 'mentioned': 70, 'thing': 71, 'things': 72, '

In [9]:
#Data Understanding - Count the occurance of distinct words in each line
#Data Prep - Vectorization - using bag-of-words dictionary, vectorize "tellers weren't welcoming"
new_doc = "tellers weren't welcoming"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(33, 1), (114, 1), (153, 1)]


In [28]:
#Data Prep - To infer the latent structure in our corpus we need a way to represent documents
#that we can manipulate mathematically. One approach is to represent each document as a vector. 
#convert document into "Bag of works"
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
#print(bow_corpus)

In [21]:
#Data Exploration - translate corpus to Matrix/Market format, then explore using lsi model
#first, transform bag-of-words corpus to lsi model
output_fname = 'corpus_mm'
corpora.MmCorpus.serialize(output_fname , bow_corpus )  #tmp filename, corpus
mm = corpora.MmCorpus(output_fname)

#second, apply lsi model for 2-dimensional exploration - defines relative relationship of words to eachother
lsi = models.LsiModel(mm, id2word=dictionary, num_topics=5)
#doc = "branch bank hours aren't convenient"
doc = "kittens hate chocolate"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] 
print("LSI Model")
print(vec_lsi)

#Data Exploration - similaritiy queries - Compute cosine similarity against a corpus of documents by storing the index matrix in memory.
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(bow_corpus, 5) # transform corpus to LSI space and index it - corpus, # of features from dictionary
sims = index[vec_bow]
print("Similarity Queue")
print(sims)

LSI Model
[(0, 0.0001278284986589068), (1, 0.00034288169871259794), (2, -0.00013918193961516987), (3, 0.0003765881327393379), (4, 0.0007176504773001273)]
Similarity Queue
[(1074, 0.4714045226573944)]


In [22]:
# Now that we have vectorized our corpus we can begin to transform it using models. 
#We use model as an abstract term referring to a transformation from one document representation to another. 
#In gensim, documents are represented as vectors so a model can be thought of as a transformation between two vector spaces. 
#The details of this transformation are learned from the training corpus. '''

#One simple example of a model is tf-idf. The tf-idf model transforms vectors from the 
#bag-of-words representation to a vector space, where the frequency counts are weighted according 
#to the relative rarity of each word in the corpus.

# train the model using tf-idf model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
tfidf2 = tfidf[dictionary.doc2bow("ATM deposit".lower().split())]
print(tfidf2) #first entry is the token ID and the second entry is the tf-idf weighting

[(301, 0.6259802553579925), (648, 0.7798389063787101)]


In [57]:
#create lda model from processed corpus/tfidf transformation 
#
corpus_tfidf = tfidf[bow_corpus]
lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=10)
print(lda)
#Print Topics produced by LDA model
for i in range(0, lda.num_topics-1):
    print("topic " + str(i) + ":" + lda.print_topic(i))    

LdaModel(num_terms=1208, num_topics=1, decay=0.5, chunksize=2000)


In [58]:
#Model Evaluation - LDA topic Coherence - Closer to 0, the better
#for additional measures, see: https://radimrehurek.com/gensim/models/coherencemodel.html
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(model=lda, corpus=corpus_tfidf, coherence='u_mass')
print(cm.get_coherence())  # get coherence value

-1.9580854135997512


In [27]:
#compare vs. topics generated by LSI model
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(10)

[(0,
  '-0.260*"they" + -0.221*"service" + -0.212*"i\'ve" + -0.207*"friendly" + -0.205*"they\'re" + -0.196*"good" + -0.189*"bank" + -0.186*"because" + -0.185*"know" + -0.183*"like"'),
 (1,
  '-0.556*"service" + -0.453*"customer" + -0.221*"excellent" + -0.211*"good" + 0.155*"account" + 0.155*"bank" + 0.128*"i\'ve" + 0.120*"like" + 0.113*"my" + -0.111*"great"'),
 (2,
  '0.586*"they\'re" + 0.244*"friendly" + 0.241*"nice" + -0.237*"customer" + -0.227*"service" + 0.185*"helpful" + -0.173*"account" + -0.156*"bank" + -0.135*"my" + -0.121*"satisfied"'),
 (3,
  '-0.461*"i\'ve" + -0.386*"because" + -0.297*"problems" + -0.218*"years" + -0.200*"satisfied" + 0.193*"the" + 0.182*"account" + -0.149*"they\'ve" + 0.143*"they" + 0.130*"money"'),
 (4,
  '0.382*"they\'re" + -0.358*"because" + -0.255*"help" + 0.233*"problems" + -0.211*"treat" + -0.203*"way" + -0.201*"like" + 0.172*"i\'ve" + -0.165*"know" + 0.159*"account"'),
 (5,
  '-0.448*"problems" + 0.324*"they\'re" + 0.241*"best" + -0.228*"answer" + 0.

In [None]:
#other available transformations located at: https://radimrehurek.com/gensim/tut2.html