In [1]:
import spacy

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin 
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
import hdbscan
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

import string
import time
import re
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16,9]
from bs4 import BeautifulSoup
import numpy as np
from joblib import Parallel, delayed

import ElasticSearchClass
import importlib
importlib.reload(ElasticSearchClass)

<module 'ElasticSearchClass' from 'E:\\my_study_place\\python\\jupyter\\spacy\\ElasticSearchClass.py'>

In [2]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
   #some data will raise NotImplementedError: subclasses of ParserBase must override error()
    try:
        bs = BeautifulSoup(text, "html.parser")
        #code = [s.extract() for s in bs('code')]
        # replace other HTML symbols
        text = bs.get_text()
    except Exception as e:
        #print(e)
        pass
    text = text.lower()
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    # replace @xxxx with @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@mention", text)
    # here we don't need @mention
    text = re.sub("@mention", '', text).strip()
    # delete numbers
    text = re.sub(r'\w*\d\w*', '', text).strip()
    return text

In [5]:
def iterLoadStackoverflowFromES():
    esUtil = ElasticSearchClass.ElasticSearchClass("192.168.18.187", 9201)
    dsl = '''
    {
    "_source":["body"],
    "query":{
        "bool":{
            "must":{
                "match":{"posttypeid":1}}
            }
        }
    }
    '''
    res = esUtil.scrollSearch(indexName="stackoverflow", body=dsl)
    print(res)
    count = 0
    data = []
    for doc in res:
        if count > 10000:
            break
        count += 1
        data.append([doc['_id'], doc['_source']['body']])
        #print(doc['_id'], doc['_source']['body'])
    return data
            
#loadStackoverflowFromES()
start_time = time.time()
data = iterLoadStackoverflowFromES()
end_time = time.time()
print("Retrieved {} records in {} Seconds".format(len(data), end_time - start_time))
#Retrieved 101 records in 1.3270199298858643 Seconds
#10001 records in 246.03912162780762 Seconds

<generator object scan at 0x0000026B4B43BBA0>
Retrieved 1092 records in 1.1353392601013184 Seconds


In [6]:

%time
X_train = [row[1] for row in data]
start_time = time.time()
X_train_preprocess = [cleanText(txt) for txt in X_train]
#print(X_train_preprocess[:10])
end_time = time.time()
print("Preprocess done in {} Seconds".format(end_time - start_time))

Wall time: 0 ns
Preprocess done in 0.4301903247833252 Seconds


In [None]:
from joblib import Parallel, delayed

if "__name__ = __main__":
    start_time = time.time()
    X_train_preprocess = [Parallel(n_jobs=8)(delayed(cleanText)(txt)for txt in X_train)]
    print(X_train_preprocess[:10])
    end_time = time.time()
    print("Preprocess done in {} Seconds".format(end_time - start_time))

In [None]:
import multiprocessing 
%time
start_time = time.time()
pool = multiprocessing.Pool(processes=2)
results = pool.map(cleanText, X_train)
#print(results)
end_time = time.time()
print("Preprocess done in {} Seconds".format(end_time - start_time))

Wall time: 0 ns


In [6]:
import mytest
import importlib
importlib.reload(mytest)
    
#for multiprocessing 'i' format requires -2147483648 <= number <= 2147483647 
import time
if __name__ == "__main__":
    N = 1500000
    n_chunks=10
    start_time=time.time()
    X=list(range(N))
    mytest.chunkParallel(X, 10)
    end_time=time.time()
    print("can process list of length {}, time {}".format(N, end_time - start_time))

0
1
2
3
4
5
6
7
8
9
1500000 [1499999]
can process list of length 1500000, time 0.461627721786499


In [1]:
#!/usr/bin/env python  
# -*- coding: utf8 -*- 
     
import logging 
import warnings  
     
     
logging.basicConfig(level=logging.INFO) 
 
 
def filterwarn(): 
    # warnings.simplefilter('ignore', UserWarning) 
    # advance warnings flter function 
    warnings.filterwarnings('ignore', '.*warn.*', UserWarning, 'module') 
 
 
def main(): 
    filterwarn() 
    # compare the two following items. 
    warnings.warn("This is a warning message.") 
    logging.warn("[+] This is a warn message.") 
 
    logging.info("[+] This is a info message.") 
 
if __name__ == "__main__": 
    main()

INFO:root:[+] This is a info message.


In [159]:
text = "projectName:""line-test"" AND body:""Hello"""

In [15]:
from urllib import parse

#这个是js的结果
# encodeURIComponent('中国')
# "%E4%B8%AD%E5%9B%BD"
#jsRet='%E4%B8%AD%E5%9B%BD'
#print(parse.unquote(jsRet))       #输出：中国
#print(jsRet==parse.quote('中国'))  #输出：True
print(parse.quote("projectName:\"avatar-chat\" AND (body:\"FAILURE_DB\")"))

projectName%3A%22avatar-chat%22%20AND%20%28body%3A%22FAILURE_DB%22%29


In [229]:
from Utils import utilIdentifier
import importlib
importlib.reload(utilIdentifier)

def identifyText(text):
    text = utilIdentifier.identifyIP(text)
    text = utilIdentifier.identifyDatetime(text)
    text = utilIdentifier.identifyUri(text)
    text = utilIdentifier.identifyNumber(text)
    return text

In [230]:
import spacy
import string

text = "192.168.1.100 fe80::20c:29ff:fe75:f519/64 2018-03-13 16:27:24 +0900 [warn]: dest=http://github.com/obdg/plda.git #0 failed to flush the buffer fe80::20c:29ff:fe75:f519/64 . retry_time=2 next_retry_seconds=2018-03-13 16:27:54 +0900 chunk=""56745277a4532957f8c4fe9e070b75d1"" error_class=NoMethodError error=""undefined method `has_key?' for #<String:0x007ff21c6d91b8>"
text = identifyText(text)
print(text)
#https://spacy.io/usage/processing-pipelines
#https://github.com/explosion/spaCy/issues/1837
parser = spacy.load('en', disable=['parser', 'ner'])
tokens = parser(text)
for tok in tokens:
    print(tok, tok.tag_, tok.lemma_)
    
#for ent in tokens.ents:
#    print(ent, ent.label, ent.lable_)


IP_TYPE IP_TYPE DATETIME_TYPE [warn]: dest=URL_TYPE #0 failed to flush the buffer IP_TYPE . retry_time=NUM_TYPE next_retry_seconds=DATETIME_TYPE chunk=56745277a4532957f8c4fe9e070b75d1 error_class=NoMethodError error=undefined method `has_key?' for #<String:0x007ff21c6d91b8>
IP_TYPE NNP ip_type
IP_TYPE NNP ip_type
DATETIME_TYPE NNP datetime_type
[ -LRB- [
warn VBP warn
] -RRB- ]
: : :
dest JJS d
= SYM =
URL_TYPE NNS url_type
# $ #
0 CD 0
failed VBD fail
to TO to
flush VB flush
the DT the
buffer NN buffer
IP_TYPE NN ip_type
. . .
retry_time NN retry_time
= . =
NUM_TYPE NNP num_type
next_retry_seconds NNS next_retry_second
= SYM =
DATETIME_TYPE NNS datetime_type
chunk=56745277a4532957f8c4fe9e070b75d1 VBP chunk=56745277a4532957f8c4fe9e070b75d1
error_class NN error_class
= SYM =
NoMethodError NN nomethoderror
error NN error
= SYM =
undefined JJ undefined
method NN method
` '' `
has_key NN has_key
? . ?
' '' '
for IN for
# $ #
< XX <
String:0x007ff21c6d91b8 XX string:0x007ff21c6d91b8
> XX >

In [172]:
text= "2014-07-12 05:21 INFO https source=192.168.32.10"
tokens = parser(text)
for tok in tokens:
    print(tok, tok.tag_, tok.lemma_)

2014 CD 2014
- SYM -
07 CD 07
- SYM -
12 CD 12
05:21 CD 05:21
INFO NNP info
https NN https
source=192.168.32.10 PRP source=192.168.32.10


In [213]:
for match in re.finditer("[/\:\-\,\s\_\+\@=]\d+[\s\,\.]","buffer fe80::20c:29ff:fe75:f519/64 . retry_time=2 "):
    print(match.group())
    print(match.group()[1:].rstrip(",. ").isnumeric())
    print("\n")

/64 
True


=2 
True


