In [1]:
import spacy

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin 
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
import hdbscan
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

import string
import time
import re
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16,9]

import ElasticSearchClass
import importlib
importlib.reload(ElasticSearchClass)

<module 'ElasticSearchClass' from 'E:\\my_study_place\\python\\jupyter\\spacy\\ElasticSearchClass.py'>

In [2]:
def load20NewsGroups():
    # #############################################################################
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
     ]
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    return dataset.data

In [2]:
def loadStackoverflowFromES():
    esUtil = ElasticSearchClass.ElasticSearchClass("192.168.18.187", 9201)
    dsl = '''
    {
    "_source":["title", "body"],
    "query":{
        "bool":{
            "must":{
                "match":{"posttypeid":1}}
            }
        },
    "size":1
    }
    '''
    res = esUtil.search(indexName="posts", body=dsl)
    for doc in res['hits']['hits']:
         print("%s) %s" % (doc['_id'], doc['_source']))
    
def iterLoadStackoverflowFromES():
    esUtil = ElasticSearchClass.ElasticSearchClass("192.168.18.187", 9201)
    dsl = '''
    {
    "_source":["body"],
    "query":{
        "bool":{
            "must":{
                "match":{"posttypeid":1}}
            }
        }
    }
    '''
    res = esUtil.scrollSearch(indexName="posts", body=dsl)
    print(res)
    count = 0
    data = []
    for doc in res:
        if count > 10000:
            break
        count += 1
        data.append([doc['_id'], doc['_source']['body']])
        #print(doc['_id'], doc['_source']['body'])
    return data
            
#loadStackoverflowFromES()
start_time = time.time()
data = iterLoadStackoverflowFromES()
end_time = time.time()
print("Retrieved {} records in {} Seconds".format(len(data), end_time - start_time))
#Retrieved 101 records in 1.3270199298858643 Seconds
#10001 records in 246.03912162780762 Seconds

<generator object scan at 0x000002659A2661A8>
Retrieved 10001 records in 112.29061961174011 Seconds


In [3]:
###########create data clearner

#Custom transformer using spaCy 
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    # lowercase
    text = text.lower()
    return text

In [4]:
############create tokenizer

#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
# List of symbols we don't care about
punctuations = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve", "--", "//", "div"]
parser = spacy.load('en')
def tokenizeText(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]  
    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
    return tokens

In [5]:
##########Create preprocess pipline and run
def preProcessData(X_train, max_features=None):
    #create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
    #vectorizer = TfidfVectorizer(tokenizer = tokenizeText)
    #svd = TruncatedSVD(2)
    #normalizer = Normalizer(copy=False)
    vectorizer = CountVectorizer(tokenizer = tokenizeText, max_features=max_features)
    start_time = time.time()
    pipe_preprocess = Pipeline([("cleaner", CleanTextTransformer()),
                 ("vectorizer", vectorizer)])
    X_train_preprocess = pipe_preprocess.fit_transform(X_train)
    end_time = time.time()
    print("Preprocess done in {} Seconds".format(end_time - start_time))
    return X_train_preprocess, vectorizer

In [6]:
X_train = load20NewsGroups()
X_train_preprocess, vectorizer = preProcessData(X_train, 1000)

3387 documents
4 categories
Preprocess done in 241.15877604484558 Seconds


In [5]:
tokens=parser("123 test")
for tok in tokens:
    print(tok.lemma_)


123
test


In [18]:
print(data[1])
X_train = [row[1] for row in data]
print(X_train[:2])
X_train_preprocess, vectorizer = preProcessData(X_train, 1000)

['471814', 'First Some Background (incase it helps):\nMy application is a Web-based framework recently upgraded to v3.5 of the .Net Framework but does not use a Master Pages / User Controls system.  It\'s more akin to the MVC pattern (although much older) and outputs pure HTML down the response stream from Templates.  The Python expressions allow some rules and template variations to be achieved.\nThe old way\nWhen embedding the IronPython 1.x engine in C#, we were able to do code such as:\nPythonEngine pe = new PythonEngine();\nAssembly a = Assembly.LoadFile("path to assembly");\npe.LoadAssembly(a);\npe.Import("Script");\n\nthere is no Import() method in ipy 2.0 and the ImportModule() method doesn\'t seem to work the same way.  The Import() alleviated the need to put a line in every python script we write, such as:\nfrom MyAssembly import MyClass\n\nthe fact that MyClass is full of static methods, means that calls to MyClass.MyMethod() work really well.  I can\'t just instansiate an o

In [19]:
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 4
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=125)
lda.fit(X_train_preprocess)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_jobs=1, n_topics=4, perp_tol=0.1, random_state=125,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [20]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [None]:
feature_names = vectorizer.get_feature_names()
print_top_words(lda, feature_names, 20)

Topic #0:
use user page table datum server database like want way select work sql need query try form create view control
Topic #1:
class public object new method return string use set property code type value void // function error like null try
Topic #2:
0 1 text function image 2 // var code x like array div use int value html 3 return work
Topic #3:
use file like application project work code run way good need know want window just try look make time write



In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
data = pyLDAvis.sklearn.prepare(lda, X_train_preprocess, vectorizer)
pyLDAvis.show(data)

  params = attr.ib(convert=attr.converters.optional(tuple))
  ids = attr.ib(default=None, convert=_ensure_immutable_ids)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [14/Feb/2018 17:24:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Feb/2018 17:24:08] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Feb/2018 17:24:09] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Feb/2018 17:24:09] "GET /LDAvis.js HTTP/1.1" 200 -
