## Текстовый анализ URL в задаче lookalike

In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Hive / MapReduce session settings, written as ';'-separated `set key=value`
# statements. The string is split on ';' below and every statement is
# submitted separately through the HiveContext.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Replace the pre-existing Spark context (`sc` must already be defined, e.g.
# by the notebook kernel) with one configured for this job.
sc.stop()
# NOTE(review): per the Spark configuration docs, spark.driver.memory set on a
# SparkConf has no effect in client mode once the driver JVM is running.
# Confirm it is also supplied via --driver-memory or spark-defaults.
conf = (SparkConf()
        .set("spark.executor.instances", 64)
        .set("spark.driver.maxResultSize", "32g")
        .set('spark.driver.memory', '32g'))
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the Hive settings one statement at a time. This remains best-effort
# (a rejected setting must not abort the notebook), but failures are reported
# instead of being silently discarded, and the blank fragments produced by
# splitting on ';' are skipped rather than sent as queries.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        continue
    try:
        hc.sql(q)
    except Exception as e:
        print('Hive setting skipped: %r (%s)' % (q, e))


In [2]:
#Constants
# Character n-gram length; passed to n_gram() when the URL text is tokenized.
n = 4

In [3]:
# I used alias to avoid confusion with the mllib library
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType



In [4]:
# Translate ru to eng, Transform text to n_gram list, Get n_gram index

#abc = list(set(''.join([e[0] for e in hc.sql('select url from prod_raw_liveinternet.access_log v where ymd = "2017-01-10" limit 100000').collect()])))

def n_gram(s, n):
    '''Return every contiguous window of length n taken from s, left to right.

    s is any sliceable sequence (normally a string). When s is shorter than n
    the result is an empty list.
    '''
    last_start = len(s) - n
    windows = []
    start = 0
    while start <= last_start:
        windows.append(s[start:start + n])
        start += 1
    return windows

def n_gram_index(ngr,abc):
    '''Encode the n-gram ngr as a single integer.

    Each character is lower-cased and looked up in abc; its position is used
    as a digit in base tr_abc_len, with the first character as the least
    significant digit. A position greater than tr_abc_len is replaced by the
    position of the character after mapping it through transl. A character
    that cannot be found in abc contributes the digit tr_abc_len - 1.

    NOTE(review): tr_abc_len and transl are read from module globals;
    tr_abc_len is not assigned anywhere in the visible code, so calling this
    function presumably raises NameError until it is defined - confirm.
    '''
    base = tr_abc_len
    total = 0

    for pos, ch in enumerate(ngr):
        try:
            digit = abc.index(ch.lower())
            if digit > base:
                # Character sits past the first `base` entries of abc: look up
                # its transliterated form instead.
                digit = abc.index(ch.lower().translate(transl))
        except ValueError:
            # Either lookup failed: fall back to the fixed digit base - 1.
            digit = base - 1
        total += (base ** pos) * digit
    return total

# Transliteration alphabet: the character at position i of the first string is
# replaced by the character at position i of the second one.
symbols = (
    u"абвгдеёжзийклмнопрстуфхцчшщъыьэюя&-?",
    u"abvgdeejzijklmnoprstufhzcss_y_eua   ",
)
# Code point -> code point table in the form expected by `translate`.
transl = dict(zip(map(ord, symbols[0]), map(ord, symbols[1])))

def handle_row(r,transl):
    '''Lower-case r and transliterate it character by character.

    Every character of r is lower-cased and then mapped through transl, a
    dict of code point -> code point as accepted by `translate`. Characters
    that have no entry in transl are kept as they are after lower-casing.

    Returns the converted string; an empty r gives an empty string.
    '''
    # Reference only (kept from the original cell, never executed):
    #abc = list(u'abcdefghijklmnopqrstuvwxyz0123456789 _абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    #tr_abc_len = abc.index(u'а') - 1

    # Join once instead of growing the result with `+=` per character, which
    # re-copies the accumulated string on every step.
    return ''.join(c.lower().translate(transl) for c in r)


In [5]:
import datetime
# Wall-clock checkpoint: print the current time so cell timings can be compared.
print(datetime.datetime.now())

2017-01-18 17:55:05.774883


In [6]:


# Load the training sample from Hive. The following cell reads its `label`,
# `first_day` and `up_bow` columns.
train_sample = hc.sql('select * from user_kposminin.url_text_20161108_2')


In [7]:
# Turn every training row into (label, first_day, ngram_list): each entry of
# `up_bow` is lower-cased and transliterated by handle_row, cut into n-grams of
# length `n`, and the per-entry lists are merged into one flat list.
#
# The flattening is a nested comprehension rather than
# reduce(lambda a, b: a + b, ...): reduce without an initial value raises on an
# empty `up_bow`, and repeated list concatenation re-copies the accumulated
# list for every entry. `.rdd.map` is used instead of DataFrame.map so the cell
# matches the other `.rdd.map` usage in this notebook and does not depend on a
# method that newer Spark versions do not provide.
train_ngrams = (train_sample
               .rdd
               .map(lambda r: (
                     r.label,
                     r.first_day,
                     [g for e in r.up_bow for g in n_gram(handle_row(e, transl), n)]
                   ))
               .toDF(["label", "first_day", "ngram_list"]))

#train_ngrams.show()
import datetime
print(datetime.datetime.now())

2017-01-18 17:56:09.125445


In [9]:
# Hash every n-gram list into a sparse term-frequency vector with 2**20 buckets.
htf_method = MLHashingTF(
    inputCol="ngram_list",
    outputCol="tf",
    numFeatures=1 << 20,
)
train_tf = htf_method.transform(train_ngrams)
#train_tf.show(truncate=False)

In [10]:
# Mark the TF vectors for caching: they are used both to fit the IDF model and
# as the input the fitted model is applied to.
train_tf.cache()
idf_method = MLIDF(
    minDocFreq=1,
    inputCol="tf",
    outputCol="idf",
).fit(train_tf)
train_tfidf = idf_method.transform(train_tf)
print(datetime.datetime.now())

2017-01-18 19:04:28.249702


In [11]:
# Unpack the sparse `tf` / `idf` vectors into plain columns (vector size, index
# list, value lists) so the result can be stored as an ordinary table.
# Only the indices of `tf` are kept; `idf_values` is assumed to line up with
# them position by position - TODO confirm.
#
# `.rdd.map` replaces DataFrame.map to match the other `.rdd.map` usage in this
# notebook, and the column names are given to toDF directly instead of
# renaming `_1`..`_6` one at a time.
train_data = (train_tfidf
             .rdd
             .map(lambda r:
                 (r.label,
                  r.first_day,
                  r.tf.size,
                  r.tf.indices.tolist(),
                  r.tf.values.tolist(),
                  r.idf.values.tolist()
                 ))
             .toDF(["label", "first_day", "tf_size",
                    "tf_index", "tf_values", "idf_values"])
             )
#train_data.write.saveAsTable("url_text_feat_tst")

print(datetime.datetime.now())

2017-01-18 19:04:29.284590


In [12]:
# Persist the flattened training features as a table.
# NOTE(review): no save mode is given; presumably this fails when the table
# already exists - confirm before re-running the cell.
train_data.write.saveAsTable("user_kposminin.url_text_feat_20161108_6")

print(datetime.datetime.now())

2017-01-18 19:06:59.851414


In [13]:
# Shut down the Spark context.
# NOTE(review): the cells below still use `hc`, `htf_method` and `idf_method`;
# they presumably need a live context, so this cell likely has to run last or
# be skipped when the test set is processed - confirm.
sc.stop()

In [None]:
# Build the test-set features with the hashing transformer and the IDF model
# that were prepared on the training sample (`htf_method`, `idf_method`).
test_sample = hc.sql('select * from user_kposminin.url_text_20161115')

# Each row becomes (label, first_day, ngram_list). The n-gram lists of all
# `up_bow` entries are flattened with a nested comprehension rather than
# reduce(lambda a, b: a + b, ...), which raises on an empty `up_bow` and
# re-copies the accumulated list for every entry. `.rdd.map` is used instead of
# DataFrame.map to match the other `.rdd.map` usage in this notebook.
test_ngrams = (test_sample
               .rdd
               .map(lambda r: (
                     r.label,
                     r.first_day,
                     [g for e in r.up_bow for g in n_gram(handle_row(e, transl), n)]
                   ))
               .toDF(["label", "first_day", "ngram_list"]))

test_tf = htf_method.transform(test_ngrams)
test_tfidf = idf_method.transform(test_tf)

# Unpack the sparse vectors into plain columns. Only the indices of `tf` are
# kept; `idf_values` is assumed to line up with them - TODO confirm.
test_data = (test_tfidf
             .rdd
             .map(lambda r:
                 (r.label,
                  r.first_day,
                  r.tf.size,
                  r.tf.indices.tolist(),
                  r.tf.values.tolist(),
                  r.idf.values.tolist()
                 ))
             .toDF(["label", "first_day", "tf_size",
                    "tf_index", "tf_values", "idf_values"])
             )


In [None]:
# Persist the flattened test features as a table.
# NOTE(review): no save mode is given; presumably this fails when the table
# already exists - confirm before re-running the cell.
test_data.write.saveAsTable("user_kposminin.url_text_feat_20161115")

KeyboardInterrupt: 

In [None]:
# Small hand-made DataFrame used to try out HashingTF / IDF on readable input.
documents = hc.createDataFrame([
    ('0', 0, "hello spark hello", "data1"),
    ('1', 1, "this is example", "data2"),
    ('2', 0, "spark is fast","data3"),
    ('3', 0, "hello world","data4")], ["doc_id", "label", "doc_text", "another"])

documents.printSchema()
# Expected schema for the rows above (doc_id is given as a string and a
# `label` column is present):
# root
# |-- doc_id: string (nullable = true)
# |-- label: long (nullable = true)
# |-- doc_text: string (nullable = true)
# |-- another: string (nullable = true)

In [None]:
# Tokenize `doc_text` on single spaces, keeping only the document id and the
# token list. `.rdd.map` is used instead of DataFrame.map so this cell matches
# the `tfidf.rdd.map` call further down, and the column names are passed to
# toDF directly instead of renaming `_1` / `_2`.
df = (documents
  .rdd
  .map(lambda x: (x.doc_id, x.doc_text.split(" ")))
  .toDF(["doc_id", "features"]))

In [None]:
# Display the tokenized documents (doc_id, features).
df.show()

In [None]:
# Hash the token lists into term-frequency vectors (default number of buckets).
htf = MLHashingTF(inputCol="features", outputCol="tf")
tf = htf.transform(df)
tf.show(truncate=False)
# NOTE(review): the sample output below does not correspond to the `documents`
# defined in this notebook (row 0 there is "hello spark hello", which splits
# into three tokens) - treat it as illustrative only.
# +------+-------------------+------------------------------------------+
# |doc_id|features           |tf                                        |
# +------+-------------------+------------------------------------------+
# |0     |[hello, spark]     |(262144,[62173,71890],[1.0,1.0])          |
# |1     |[this, is, example]|(262144,[3370,69994,151198],[1.0,1.0,1.0])|
# |2     |[spark, is, fast]  |(262144,[3370,62173,251996],[1.0,1.0,1.0])|
# |3     |[hello, world]     |(262144,[71890,72594],[1.0,1.0])          |
# +------+-------------------+------------------------------------------+

In [None]:
# Fit an IDF model on the term-frequency vectors and apply it to the same data;
# the weighted vectors are written to the `idf` column.
idf = MLIDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(truncate=False)
# Sample output (illustrative; see the features column for the input it was
# produced from):
# +------+-------------------+------------------------------------------+---------------------------------------------------------------------------------------+
# |doc_id|features           |tf                                        |idf                                                                                    |
# +------+-------------------+------------------------------------------+---------------------------------------------------------------------------------------+
# |0     |[hello, spark]     |(262144,[62173,71890],[1.0,1.0])          |(262144,[62173,71890],[0.5108256237659907,0.5108256237659907])                         |
# |1     |[this, is, example]|(262144,[3370,69994,151198],[1.0,1.0,1.0])|(262144,[3370,69994,151198],[0.5108256237659907,0.9162907318741551,0.9162907318741551])|
# |2     |[spark, is, fast]  |(262144,[3370,62173,251996],[1.0,1.0,1.0])|(262144,[3370,62173,251996],[0.5108256237659907,0.5108256237659907,0.9162907318741551])|
# |3     |[hello, world]     |(262144,[71890,72594],[1.0,1.0])          |(262144,[71890,72594],[0.5108256237659907,0.9162907318741551])                         |
# +------+-------------------+------------------------------------------+---------------------------------------------------------------------------------------+

In [None]:
# For every document keep the id, tokens, tf and idf vectors, plus the sum of
# the stored idf values (None when the row has no idf vector).
res = tfidf.rdd.map(lambda x : (x.doc_id,x.features,x.tf,x.idf,(None if x.idf is None else x.idf.values.sum())))

# print(...) with a single argument behaves the same under Python 2 and 3 and
# matches the print(...) calls used in the rest of the notebook.
for r in res.take(10):
    print(r)

# (0, [u'hello', u'spark'], SparseVector(262144, {62173: 1.0, 71890: 1.0}), SparseVector(262144, {62173: 0.5108, 71890: 0.5108}), 1.0216512475319814)
# (1, [u'this', u'is', u'example'], SparseVector(262144, {3370: 1.0, 69994: 1.0, 151198: 1.0}), SparseVector(262144, {3370: 0.5108, 69994: 0.9163, 151198: 0.9163}), 2.3434070875143007)
# (2, [u'spark', u'is', u'fast'], SparseVector(262144, {3370: 1.0, 62173: 1.0, 251996: 1.0}), SparseVector(262144, {3370: 0.5108, 62173: 0.5108, 251996: 0.9163}), 1.9379419794061366)
# (3, [u'hello', u'world'], SparseVector(262144, {71890: 1.0, 72594: 1.0}), SparseVector(262144, {71890: 0.5108, 72594: 0.9163}), 1.4271163556401458)

In [None]:
from pyspark.sql.functions import udf

# Column-level alternative to the RDD version: a UDF that sums the stored
# values of a vector and returns them as a plain float (DoubleType column).
sum_ = udf(lambda v: float(v.values.sum()), DoubleType())
tfidf.withColumn("idf_sum", sum_("idf")).show()

## +------+-------------------+--------------------+--------------------+------------------+
## |doc_id|           features|                  tf|                 idf|           idf_sum|
## +------+-------------------+--------------------+--------------------+------------------+
## |     0|     [hello, spark]|(262144,[62173,71...|(262144,[62173,71...|1.0216512475319814|
## |     1|[this, is, example]|(262144,[3370,699...|(262144,[3370,699...|2.3434070875143007|
## |     2|  [spark, is, fast]|(262144,[3370,621...|(262144,[3370,621...|1.9379419794061366|
## |     3|     [hello, world]|(262144,[71890,72...|(262144,[71890,72...|1.4271163556401458|