In [6]:
#Gensim Docs: https://radimrehurek.com/gensim/
#Pandas Docs: http://pandas.pydata.org/pandas-docs/stable/
#Description: https://en.wikipedia.org/wiki/Word2vec
#Tutorials/Code Example:  https://github.com/RaRe-Technologies/gensim/blob/develop/gensim%20Quick%20Start.ipynb
#Library Purpose: Gensim Word2vec is a two-layer neural net that processes text.
#Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus
#it can be applied just as well to genes, code, likes, playlists, social media graphs and 
#other verbal or symbolic series in which patterns may be discerned.

"""Basic word2vec example."""
#open-source machine learning framework
import gensim 
from gensim import corpora
from gensim import models
from gensim.test.utils import get_tmpfile
#data pre-processing tools from gensim package
from gensim.parsing.preprocessing import preprocess_string 
from gensim.parsing.preprocessing import remove_stopwords 
from gensim.parsing.preprocessing import strip_non_alphanum 
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text #transform to lowercase then stem
from gensim.summarization import summarize
from gensim.summarization import keywords
#other packages
from collections import defaultdict #high performace dictionary - Python Standard Library

#import scikit-learn & graphical libraries
# Any results you write to the current directory are saved as output.
from sklearn import preprocessing #data prep - module includes scaling, centering, normalization, binarization and imputation methods.
from sklearn.feature_extraction import text #used for removing stop words and obtaining feature extraction from text
import matplotlib.pyplot as plt
import seaborn as sns #Seaborn is a Python data visualization library based on matplotlib. 
# NOTE(review): sns.set is called twice in a row with different styles; the second call
# ("whitegrid") presumably supersedes the first ("white") - confirm which style is intended.
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True) #"whitegrid" style, color_codes enabled

# Any results you write to the current directory are saved as output.
from sklearn import preprocessing #data prep - module includes scaling, centering, normalization, binarization and imputation methods.
from sklearn.feature_extraction import text #used for removing stop words and obtaining feature extraction from text

#import sklearn libraries for NLP prep, model building & validation steps - for use in Dataiku
from sklearn.metrics import accuracy_score  #used for model evaluation - https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
from sklearn.feature_extraction.text import CountVectorizer  #bag-of-words vectorication for LDA model
from sklearn.decomposition import LatentDirichletAllocation #model for NLP topic extraction, similar to gensim LDA
from sklearn.datasets import make_multilabel_classification #create random test dataset
from sklearn.model_selection import train_test_split

#import libraries for data structures and Gensim Word2Vec API
import os
import tempfile
# Resolve the system temp directory once and report which folder was picked.
TEMP_FOLDER = tempfile.gettempdir()
_folder_notice = 'Folder "{}" will be used to save temporary dictionary and corpus.'
print(_folder_notice.format(TEMP_FOLDER))

#improved data structures - numpy & pandas
import numpy as np
import pandas as pd

Folder "C:\Users\JTBVEN~1\AppData\Local\Temp" will be used to save temporary dictionary and corpus.


In [7]:
#open the text file as a standard python stream and read its lines into a list data structure
with open("C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8", "r") as f:
    raw_corpus = f.readlines() #one list entry per line of text, line endings kept
#print(raw_corpus)

#CRISP-DM Task: Data Preparation 
#import favorite text-based dataset for analysis using pandas dataframe - compatible w/scikit-learn
def read_text(path):
    """Read a CSV file into a pandas DataFrame.

    Prints a short banner, then hands ``path`` to ``pandas.read_csv`` with
    default settings.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    print("Pandas File I/O Example - csv read")
    return pd.read_csv(path)

print('CRISP-DM Task: Data Preparation')
print('Task 1: Read-in a text-based document, aka "establishing the corpus')
#Absolute path to the tabular text file that read_text loads through pandas.read_csv
corpus_path = "C:\\Users\\JTB Ventures LLC\\Documents\\GitHub\\ODSA-PythonAdvModels\\Data\\text8"
documents = read_text(corpus_path)
print(documents.head())

CRISP-DM Task: Data Preparation
Task 1: Read-in a text-based document, aka "establishing the corpus
Pandas File I/O Example - csv read
                                                text  Unnamed: 1
0  MY BANK is always good to me I have banked wit...         NaN
1  MY BANK is the best for me They help people wh...         NaN
2  MY BANK has been 100 percent on top on any ban...         NaN
3  Absolutely no problems with them Everything ha...         NaN
4  Absolutely They are efficient courteous and he...         NaN


In [8]:
# Data Prep - Clean-up raw corpus for analysis
# NOTE: everything below is ONE triple-quoted string literal, not live code. Running the cell
# only evaluates (and echoes) that string, so preprocess_text, clean_corpus, stoplist and
# texts are NOT defined by this cell.
'''def preprocess_text(corpus):
    i = 0
    corpus2 = []
    for corpu in corpus:  #iterate through rows in dataframe
        line = strip_punctuation(corpu)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(corpu)
        line = remove_stopwords(line)
        corpus2.append(line)
    return corpus2

#apply preprocessing function to "raw" text corpus for a "cleaned" corpus, to use in vectorization
clean_corpus = preprocess_text(raw_corpus) 

stoplist = set('i it\'s'.split()) #remove extraneous words from analysis

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in clean_corpus ]

#Long list of non-distinct parsed words from doc 
print(texts[992]) #comment 2'''

'def preprocess_text(corpus):\n    i = 0\n    corpus2 = []\n    for corpu in corpus:  #iterate through rows in dataframe\n        line = strip_punctuation(corpu)\n        line = strip_non_alphanum(line)\n        line = strip_numeric(line)\n        line = strip_multiple_whitespaces(corpu)\n        line = remove_stopwords(line)\n        corpus2.append(line)\n    return corpus2\n\n#apply preprocessing function to "raw" text corpus for a "cleaned" corpus, to use in vectorization\nclean_corpus = preprocess_text(raw_corpus) \n\nstoplist = set(\'i it\'s\'.split()) #remove extraneous words from analysis\n\n# Lowercase each document, split it by white space and filter out stopwords\ntexts = [[word for word in document.lower().split() if word not in stoplist]\n         for document in clean_corpus ]\n\n#Long list of non-distinct parsed words from doc \nprint(texts[992]) #comment 2'

In [11]:
#preprocess data for use in text mining/NLP - refactored for pandas dataframe
def preprocess_text(corpus, field_name='text'):
    """Clean one text column of a DataFrame for text mining / NLP.

    Every value of ``corpus[field_name]`` has surrounding newlines removed and
    is then passed through gensim's ``strip_punctuation``,
    ``strip_non_alphanum``, ``strip_numeric`` and
    ``strip_multiple_whitespaces`` filters, in that order.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame holding the documents. It is modified in place.
    field_name : str, default 'text'
        Column that is read, cleaned and written back. (Previously the column
        was always read from 'text', whatever ``field_name`` said.)

    Returns
    -------
    pandas.DataFrame
        The same ``corpus`` object that was passed in.

    Raises
    ------
    KeyError
        If ``field_name`` is not a column of ``corpus``.
    """
    print("Preprocessing Corpus from pandas data frame")

    def _clean(line):
        # Order matters: punctuation and symbols become spaces first, the
        # whitespace collapse runs last to tidy up what the others left behind.
        line = line.strip('\n')
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        return strip_multiple_whitespaces(line)

    # Column-wise map replaces the row-by-row iterrows()/.at loop; the frame
    # is still updated in place so existing callers see the cleaned text.
    corpus[field_name] = corpus[field_name].map(_clean)
    return corpus

print('Task 2: Preprocessing dataset, including stoplist, word frequencies & filters')
print('Task 2a: Remove punctuation, non-alphanumeric and numeric characters')
#clean the 'text' field of the documents dataframe; preprocess_text returns the frame it was given
raw_corpus = preprocess_text(documents, 'text')
#print(raw_corpus.head(20))

Task 2: Preprocessing dataset, including stoplist, word frequencies & filters
Task 2a: Remove punctuation, non-alphanumeric and numeric characters
Preprocessing Corpus from pandas data frame


In [15]:
# Data Prep - Count word frequencies
# Tokenise the cleaned documents here (lowercase, split on whitespace) so that the cell
# does not rely on a `texts` variable left over from an earlier kernel session.
texts = [str(document).lower().split() for document in raw_corpus['text']]

# A token is kept only if it occurs MORE than this many times in the whole corpus.
MIN_TOKEN_COUNT = 5

frequency = defaultdict(int)
# The loop variable is deliberately not called `text`: that name is the
# sklearn.feature_extraction.text module, which a later cell still needs.
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

#Drop rare words (those occurring MIN_TOKEN_COUNT times or fewer)
processed_corpus = [
    [token for token in tokens if frequency[token] > MIN_TOKEN_COUNT]
    for tokens in texts
]
#print(processed_corpus)

[[], []]


In [None]:
# Data Prep - build a gensim.corpora.Dictionary from the processed corpus: every distinct
# word gets a unique integer ID, and the mapping is the vocabulary used for vectorization.
dictionary = corpora.Dictionary(processed_corpus)
token_to_id = dictionary.token2id
print(token_to_id)

In [None]:
print('Task 2b: remove english stopwords and add additional to remove from text document')
#Stop word list - the standard english set is listed at:
#https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py
#scikit-learn exposes it as ENGLISH_STOP_WORDS, a frozenset (immutable python set), so
#extra words are added by building a union rather than by mutating the set.
extra_stop_words = {"have", "with", "are"}
stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)
print(stop_words)

In [None]:
#Data Prep - To infer the latent structure of the corpus, documents need a representation
#that can be manipulated mathematically; here each document becomes a vector.
#Convert every processed document into its "bag of words" form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))
print(bow_corpus)

In [None]:
print('Task 2b: create a BOW vector for use with Latent-Dirichlet-Allocation (LDA) Models, using assigned stop words')
#Note: In Gensim, this relates to "Similarity Queries" - https://radimrehurek.com/gensim/tut3.html
#Bag-of-words is one choice - Gensim includes: Matrix-Market, LSI, SVMlight, LDA-C, GibbsLDA++, etc

#Convert a collection of text documents to a matrix of token counts - "bag-of-words", unless otherwise specified
#stop_words is handed over as a (sorted) list: recent scikit-learn releases validate the
#parameter and accept 'english', a list or None, so the frozenset itself is rejected there.
bow_vector = CountVectorizer(stop_words=sorted(stop_words))

#Vectorize the cleaned 'text' column, selected by name instead of by position
value_list = raw_corpus['text'].tolist()
#print(value_list[0:3])
#create term-document matrix and place all relevant terms in vocabulary/dictionary
bow = bow_vector.fit_transform(value_list)
#the learned vocabulary is stored in the vocabulary_ attribute of the vectorizer
print(bow_vector.vocabulary_)

In [None]:
#With the corpus vectorized, it can now be transformed using models.
#"Model" is used as an abstract term for a transformation from one document representation to another.
#In gensim, documents are represented as vectors, so a model can be thought of as a transformation
#between two vector spaces whose details are learned from the training corpus.

#One simple example is tf-idf: it transforms bag-of-words vectors into a vector space where the
#frequency counts are weighted according to the relative rarity of each word in the corpus.

# train the tf-idf model on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)
# transform the "ATM deposit" query string: lowercase, tokenize, bag-of-words, then tf-idf
query_tokens = "ATM deposit".lower().split()
query_bow = dictionary.doc2bow(query_tokens)
tfidf2 = tfidf[query_bow]
print(tfidf2) #first entry is the token ID and the second entry is the tf-idf weighting

In [None]:
#create lda model from processed corpus/tfidf transformation 
#
corpus_tfidf = tfidf[bow_corpus]
lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=10)
print(lda)
#Print Topics produced by LDA model
for i in range(0, lda.num_topics-1):
    print("topic " + str(i) + ":" + lda.print_topic(i))    

In [None]:
#global variable that work as model parameters - adjust for model performance "fine-tuning"
n_samples = 2000 #sample size
n_features = 1000 #name/entity recgonition & group selection (vectors)
n_components = 10 #themes
n_top_words = 10 #words per theme

print('CRISP-DM Task: Model Building')
print("Fitting LDA models with n_topic=%d, n_samples=%d and n_features=%d..." % (n_components, n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)  #scikit-learn library
lda.fit(bow)
print("\nTopics in LDA model:")
bow_feature_names = bow_vector.get_feature_names()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

print_top_words(lda, bow_feature_names, n_top_words) #note the improvement over last week's model!

In [None]:
#Model Evaluation - LDA topic Coherence - Closer to 0, the better
#for additional measures, see: https://radimrehurek.com/gensim/models/coherencemodel.html
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(model=lda, corpus=corpus_tfidf, coherence='u_mass')
print(cm.get_coherence())  # get coherence value

In [None]:
#compare vs. topics generated by an LSI model trained on the same tf-idf corpus
lsi = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
# double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(num_topics=10)

In [None]:
#other available transformations located at: https://radimrehurek.com/gensim/tut2.html
print('CRISP-DM Task: Model Evaluation ')
print('Step1 : get test topics then score them vs. current LDA model')
#fixed random_state so the train/test split is the same on every run
train, test = train_test_split(list(documents['text'].values), test_size = 0.2, random_state=0)
#transform (not fit_transform): the test documents must be mapped onto the vocabulary the
#vectorizer already learned, otherwise the columns no longer match the fitted model
test_vector = bow_vector.transform(test)
#score the model as it stands; refitting it on the test split would replace the model
#that this step is meant to evaluate
test_lda = lda
#Calculate approximate log-likelihood as score.
print(test_lda.score(test_vector)) #not meaningful in itself - compare vs. re-run models (closer to 0, the better)

In [None]:
print('Step 2: Use formal model evaluation stats, such as "perplexity" from scikit-learn library') 
# create test/train document-term matrices to evaluate the model
#NOTE(review): the original remark said max_features must stay below the "n_features"
#variable, but nothing visible here enforces or uses that relation - confirm.
vectoriser = CountVectorizer(stop_words = 'english', max_features=500)
doc_train = vectoriser.fit_transform(train)
#get_feature_names() was removed from newer scikit-learn releases in favour of
#get_feature_names_out(); use whichever one the installed version provides.
if hasattr(vectoriser, "get_feature_names_out"):
    features = vectoriser.get_feature_names_out()
else:
    features = vectoriser.get_feature_names()
#transform only: the test matrix must share the vocabulary (columns) learned from the training split
doc_test = vectoriser.transform(test)
news_lda = lda.fit(doc_train) #fit() returns the estimator itself, so this refits `lda` in place
print(news_lda.perplexity(doc_test)) # lower the perplexity, the better