###### Текстовый анализ URL в задаче lookalike

In [1]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Restart Spark with job-specific resources. `sc` must already exist when this
# cell runs (presumably created by the notebook kernel); it is stopped first so
# that the new configuration can take effect.
sc.stop()
conf = (SparkConf()
        .set("spark.executor.instances", 64)
        .set("spark.driver.maxResultSize", "64g")
        .set('spark.driver.memory','32g')
        .set("spark.executor.memory", '16g')
        .set("spark.yarn.executor.memoryOverhead", 6048)
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the Hive / MapReduce session settings one statement at a time.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        # split(';') leaves whitespace-only fragments (e.g. after the last
        # statement); there is nothing to send to Hive for those.
        continue
    try:
        hc.sql(q)
    except Exception as e:
        # Still best effort: a rejected setting must not abort the notebook,
        # but report it so a silently missing setting can be noticed.
        print('Hive setting skipped: {!r} ({})'.format(q, e))


In [11]:
#Constants
# n-gram length; passed to generate_ngram_stat and also read as a global by handle_str
n = 2
# size of the hashed feature space (passed on as numFeatures of the hashing TF stage)
tf_size = 2 ** 20

In [12]:
# I used alias to avoid confusion with the mllib library
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType
import urllib


In [13]:
# Translate ru to eng, Transform text to n_gram list, Get n_gram index

#abc = list(set(''.join([e[0] for e in hc.sql('select url from prod_raw_liveinternet.access_log v where ymd = "2017-01-10" limit 100000').collect()])))

def n_gram(s, n):
    '''Split sequence s into its consecutive, overlapping windows of length n.

    Returns a list of slices s[i:i+n], in order of their start position.
    The list is empty when s is shorter than n.
    '''
    window_count = len(s) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(s[start:start + n])
    return grams

def n_gram_index(ngr,abc):
    '''Encode n-gram ngr as one integer, treating each character as a digit.

    Input:
      ngr - the n-gram (sequence of characters);
      abc - list of known characters; a character's position in it is its digit.

    The character at position i contributes digit * (tr_abc_len ** i).
    A character whose position in abc is greater than tr_abc_len is first
    translated through the global transl table and looked up again.
    A character that cannot be found in abc contributes the digit
    tr_abc_len - 1.

    NOTE(review): tr_abc_len is read as a global, but its only assignment in
    the visible code is commented out - confirm it is defined before calling.
    '''
    base = tr_abc_len
    total = 0

    for pos, raw_ch in enumerate(ngr):
        weight = base ** pos
        ch = raw_ch.lower()
        try:
            digit = abc.index(ch)
            if digit > base:
                # beyond the first `base` positions: map the character through
                # transl and use the position of the translated character
                digit = abc.index(ch.translate(transl))
        except ValueError:
            # unknown character (before or after translation)
            digit = base - 1
        total += weight * digit
    return total

#abc = list(u'abcdefghijklmnopqrstuvwxyz0123456789 _абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
#tr_abc_len = abc.index(u'а') - 1

# Character translation table used with unicode.translate() by handle_str.
# First string: source characters (lowercase Cyrillic letters, then URL punctuation).
# Second string: their replacements (Latin letters or '_' for the Cyrillic letters,
# spaces for the punctuation). Characters are paired by position.
# NOTE(review): zip() stops at the end of the shorter string and the two strings do
# not appear to have the same length, so the last few punctuation characters may get
# no mapping here - confirm this is intended.
symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюя&-?%#!/\=_.-~$",
           u"abvgdeejzijklmnoprstufhzcss_y_eua           ")
# Maps code point -> code point, the form translate() expects for unicode strings.
transl = {ord(a):ord(b) for a, b in zip(*symbols)}

def handle_str(s,transl):
    '''Normalise URL text before it is split into n-grams.

    Input:
      s - unicode text (URLs);
      transl - translation table (code point -> code point) for translate().

    Steps: percent-decode s, lowercase it, map characters through transl,
    replace every character matched by the punctuation class with a space,
    then collapse each run of spaces into n-1 spaces (n is the global
    n-gram size).

    Returns the cleaned unicode string.
    '''
    # Width of the separator left between tokens; depends on the global n.
    gap = ' '*(n-1)
    # Percent-decoding is done on UTF-8 bytes; undecodable bytes are dropped.
    utf8_bytes = s.encode('UTF-8','ignore')
    decoded = urllib.unquote(utf8_bytes).decode('UTF-8','ignore')
    transliterated = decoded.lower().translate(transl)
    # NOTE(review): inside this character class the span from '%' to the escaped '['
    # is a range, so it also matches digits and other characters between them -
    # confirm that dropping digits is intended.
    spaced = re.sub('''[$#=\[\]_~+!&()*\./:;\?|'"%-\[\],]''',' ',transliterated)
    return re.sub('[ ]+',gap,spaced)


In [5]:
#a = train_sample.collect()
#for i in range(len(a)):
#    print i
#    b = handle_str(a[i].up,transl)
s= u'''ok.ru/feed glistof.net/board/statusy_pro_ulybku/32 ok.ru/dk?st.cmd=anonymMain mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki
mail.ru/?from=odnoklassniki ok.ru/ mail.ru/?from=odnoklassniki ok.ru/game/candyvalley2 ok.ru/dk?st.cmd=anonymMain
ok.ru/game/candyvalley2 ok.ru/gifts ok.ru/feed ok.ru/ ok.ru/feed ok.ru/game/candyvalley2 ok.ru/game/candyvalley
ok.ru/dk?st.cmd=anonymMain ok.ru/ my-hit.org/film/19945/ news.mail.ru/politics/27714036/?frommail=1 my-hit.org/film/19945/ news.mail.ru/politics/27714036/?frommail=1 filmogo.co/5902-odnazhdy-v-odesse-2-sezon-vse-seriyiiiiji.html 
ok.ru/game/candyvalley allserials.net/serial-3560-zapovednik-straha-1-sezon.html fast-torrent.ru/film/mezhdu-zhiznyu-i-smertyu.html ok.ru/feed bigcinema.to/series/severnyy-veter-serial.html
go.mail.ru/search?rf=1011&fm=1&q=%D1%87%D1%82%D0%BE%20%D1%82%D0%B0%D0%BA%D0%BE%D0%B5%D0%B1%D1%80%D1%8E%D0%BA%D0%B8%20%D0%BA%D1%8E%D0%BB%D0%BE%D1%82%D1%8B&sbmt=1478611919453 bigcinema.to/series/severnyy-veter-serial.html 
bolshoyvopros.ru/questions/1828070-chto-takoe-brjuki-kjuloty-kak-oni-vygljadjat-kto-ih-avtor.html go.mail.ru/search?fm=1&rf=1011&q=ex.ua ok.ru/?_erv=vaywlyirbwpynedplup ok.ru/game/vegamix ok.ru/game/piratetreasures mail.ru/?from=odnoklassniki
ok.ru/?_erv=vaywlyirbwpynedplup mail.ru/?from=odnoklassniki filmogo.co/5902-odnazhdy-v-odesse-2-sezon-vse-seriyiiiiji.html mail.ru/?from=odnoklassniki allserials.net/serial-3560-zapovednik-straha-1-sezon.html 
mail.ru/?from=odnoklassniki news.mail.ru/politics/27714037/?frommail=1 ok.ru/dk?st.cmd=anonymMain news.mail.ru/politics/27714037/?frommail=1 mail.ru/?from=odnoklassniki go.mail.ru/search?fm=1&rf=1011&q=cnfnecs ghj ek%2Cre 
mail.ru/?from=odnoklassniki statusas.ru/ulibka/33-ulibka.html mail.ru/?from=odnoklassniki go.mail.ru/search?fm=1&rf=1011&q=%D0%B1%D1%80%D1%8E%D0%BA%D0%B8 %D0%BA%D1%8E%D0%BB%D0%BE%D1%82%D1%8B mail.ru/?from=odnoklassniki
mail.ru/?from=odnoklassniki ok.ru/feed mail.ru/?from=odnoklassniki ok.ru/profile/330316254834 mail.ru/?from=odnoklassniki pogoda.mail.ru/prognoz/kiev/ mail.ru/?from=odnoklassniki pogoda.mail.ru/prognoz/kiev/
news.mail.ru/politics/27714036/?frommail=1 mail.ru/?from=odnoklassniki news.mail.ru/politics/27714036/?frommail=1 mail.ru/?from=odnoklassniki glistof.net/board/statusy_pro_ulybku/32 ok.ru/?_erv=viewlyirbwpynedra
mail.ru/?from=odnoklassniki ok.ru/ mail.ru/?from=odnoklassniki ok.ru/ bigcinema.to/series/podzemnyy-perehod-serial.html bigcinema.to/series/podzemnyy-perehod-serial.html fast-torrent.ru/film/mezhdu-zhiznyu-i-smertyu.html 
bigcinema.to/series/severnyy-veter-serial.html bigcinema.to/series/severnyy-veter-serial.html mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki pogoda.mail.ru/prognoz/kiev/
pogoda.mail.ru/prognoz/kiev/ mail.ru/?from=odnoklassniki mail.ru/?from=odnoklassniki news.mail.ru/incident/27707964/?frommail=1 news.mail.ru/incident/27707964/?frommail=1'''

#print(handle_str(s,transl))


In [15]:
def generate_ngram_stat(sdf,n,tf_size,idf_method = None, minDocFreq = 1):
    '''
    Builds hashed n-gram TF and TF-IDF features for the text column of sdf.

    Input:
      sdf - PySpark DataFrame whose last column holds the text to analyse (string);
      n - size of n-grams;
      tf_size - number of features of the hashing TF stage;
      idf_method - already fitted IDF model to apply; when not given, a new
                   one is fitted on sdf itself;
      minDocFreq - minDocFreq of the IDF estimator (used only when fitting).

    Returns a tuple (df_data, idf_method):
      df_data - DataFrame with all sdf columns except the last one, plus
          tf_index - list of feature indexes of the n-grams found in the text;
          tf_values - list of corresponding TF values (n-gram counts);
          idf_values - list of corresponding values of the IDF model output;
      idf_method - the IDF model that was applied (given or newly fitted).
    '''
    cols = sdf.columns
    n_cols = len(cols)

    def to_ngram_row(r):
        # keep the leading columns, replace the text column by its n-gram list
        return list(r[:-1]) + [n_gram(handle_str(r[-1],transl),n)]

    def to_feature_row(r):
        # drop the three trailing columns (ngram_list, tf, idf) and append the
        # sparse vectors unpacked into plain Python lists
        return list(r[:-3]) + [r.tf.indices.tolist(),
                               r.tf.values.tolist(),
                               r.idf.values.tolist()]

    # toDF() without a schema names the columns _1, _2, ...; the n-gram list
    # takes the position of the original text column.
    df_ngrams = sdf.rdd.map(to_ngram_row).toDF()
    df_ngrams = df_ngrams.withColumnRenamed("_{}".format(n_cols),"ngram_list")

    hasher = MLHashingTF(numFeatures = tf_size, inputCol="ngram_list", outputCol="tf")
    df_tf = hasher.transform(df_ngrams)

    if not idf_method:
        print('Fitting idf_method')
        idf_estimator = MLIDF(inputCol="tf", outputCol="idf", minDocFreq = minDocFreq)
        idf_method = idf_estimator.fit(df_tf)
    df_tfidf = idf_method.transform(df_tf)

    df_data = df_tfidf.rdd.map(to_feature_row).toDF()

    # Restore readable column names: first the three feature columns, then the
    # original ones. The text column's old position is now tf_index, so the
    # rename for the last original name finds no matching column.
    renames = [("_{}".format(n_cols + offset), name)
               for offset, name in enumerate(("tf_index", "tf_values", "idf_values"))]
    renames += [("_{}".format(position), name)
                for position, name in enumerate(cols, 1)]
    for old_name, new_name in renames:
        df_data = df_data.withColumnRenamed(old_name, new_name)
    return df_data, idf_method
    
    

In [None]:
# Training rows. The last selected column (up) is the text column that
# generate_ngram_stat turns into n-gram features.
train_sample = hc.sql('select phone_num,label,first_day,up from user_kposminin.url_text_20161108_2')
# Timestamp printed after the statement (presumably a manual timing marker).
print(datetime.datetime.now())

In [None]:
# No idf_method is passed, so generate_ngram_stat fits a new IDF model on the
# training sample and returns it together with the features.
train_data,idf_method = generate_ngram_stat(train_sample, n = n, tf_size = tf_size)
print(datetime.datetime.now())

In [None]:

# Persist the training features as a table.
train_data.write.saveAsTable("user_kposminin.url_text_feat_20161108_7")
print(datetime.datetime.now())

In [7]:
# Test rows: every row with label = 1, plus the rows whose md5(phone_num) hex
# digest starts with "0" or "1" (a hash-based subsample of the remaining rows).
test_sample = hc.sql('select phone_num,label,first_day,up from user_kposminin.url_text_20161115 where (substr(md5(phone_num),1,1) in ("0","1") or label = 1)')
print(datetime.datetime.now())

2017-01-19 16:07:26.249256


In [None]:
from pyspark.ml.feature import IDFModel
# Load a previously saved IDF model from the path "idf_model" (the path the
# "Save IDF model" cell writes to) and wrap the resulting Java object in the
# Python IDFModel class.
# NOTE(review): _new_java_obj is a private pyspark helper; presumably it resolves the
# dotted name on the JVM and calls it with the given argument - confirm for the
# Spark version in use.
idfm = MLIDF._new_java_obj("org.apache.spark.ml.feature.IDFModel.load", "idf_model")
idf_method = IDFModel(idfm)

# Reuse the loaded model so that no IDF is fitted on the test sample; the
# returned model is the same object and is discarded.
test_data, _ = generate_ngram_stat(test_sample, n = n, tf_size = tf_size, idf_method = idf_method)
print(datetime.datetime.now())

2017-01-19 16:38:30.312345


In [None]:
# Persist the test features as a table.
test_data.write.saveAsTable("user_kposminin.url_text_feat_20161115_12")
print(datetime.datetime.now())

In [None]:
#idf_method.save('idf_model.m')
# The earlier attempt here used Scala-only API (Seq(...) and RDD.saveAsObjectFile),
# neither of which exists in PySpark, so it failed with a NameError.
# Persist the fitted model through its JVM-side ML writer instead; _call_java is a
# private pyspark helper, used because idf_method.save is not relied on above.
idf_method._call_java("write").save("idf_model.model")

In [None]:
# Save IDF model
# Obtain the writer from the JVM-side model object and store the model under the
# path "idf_model".
# NOTE(review): _call_java is a private pyspark helper - confirm it is available in
# the Spark version in use.
writer = idf_method._call_java("write")
writer.save("idf_model")

In [5]:
import datetime
print(datetime.datetime.now())

2017-01-19 17:28:50.775353


In [83]:
# Tiny debugging sample: 5 rows, with the text cut to its first 20 characters and the
# literal "kkkkkk" appended (presumably so that a known n-gram occurs in every row).
tst = hc.sql('select phone_num,label,first_day,concat(substr(up,1,20),"kkkkkk") as up from user_kposminin.url_text_20161108_2  limit 5')
# Local pandas copy of the sample for inspection.
df_tst = tst.toPandas()
print(datetime.datetime.now())

2017-01-19 18:06:28.583026


In [84]:
# Run the feature pipeline on the debugging sample; a separate IDF model
# (idf_method2) is fitted on these rows, leaving idf_method untouched.
tst_data,idf_method2 = generate_ngram_stat(tst, n = n, tf_size = tf_size)
df_tst_data = tst_data.toPandas()

Fitting idf_method


In [90]:
df_tst_data

Unnamed: 0,phone_num,label,first_day,tf_index,tf_values,idf_values
0,70195908524,0,0,"[1097, 1102, 3139, 3143, 3240, 3241, 3245, 324...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.69314718056, 0.69314718056, 1.09861228867, ..."
1,70292460512,0,0,"[1091, 1094, 1110, 3180, 3241, 3276, 3349, 335...","[2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, ...","[1.38629436112, 0.69314718056, 1.09861228867, ..."
2,70292959915,0,0,"[1097, 1101, 1106, 3110, 3112, 3163, 3246, 329...","[1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, ...","[0.69314718056, 2.19722457734, 0.69314718056, ..."
3,70638960894,0,0,"[1091, 1096, 3114, 3180, 3238, 3335, 3380, 341...","[1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 5.0, ...","[0.69314718056, 1.09861228867, 1.09861228867, ..."
4,70639551572,0,0,"[1094, 1095, 1102, 1106, 3132, 3194, 3231, 323...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.69314718056, 1.09861228867, 0.69314718056, ..."


In [91]:
# Inline copy of the body of generate_ngram_stat, run at top level on the debugging
# sample so that the intermediate DataFrames (df_ngrams, df_tf, df_tfidf) stay
# available as globals for inspection in the following cells.
# NOTE: this rebinds the global idf_method - it is reset to None here and then
# replaced by a model fitted on the debugging sample only.
sdf = tst
idf_method = None
if True:
    cols = sdf.columns
    # replace the text column (last one) by its list of n-grams
    df_ngrams = (sdf
               .rdd
               .map(lambda r: list(r[:-1]) + [n_gram(handle_str(r[-1],transl),n)])
               .toDF()
               .withColumnRenamed("_{}".format(len(cols)),"ngram_list"))    
    htf_method = MLHashingTF(numFeatures = tf_size, inputCol="ngram_list", outputCol="tf")
    df_tf = htf_method.transform(df_ngrams)    
    if not idf_method:
        print('Fitting idf_method')
        idf_method = MLIDF(inputCol="tf", outputCol="idf", minDocFreq = 1).fit(df_tf)        
    df_tfidf = idf_method.transform(df_tf)
    # drop the three trailing columns (ngram_list, tf, idf) and append the sparse
    # vectors unpacked into plain lists
    df_data = (df_tfidf
             .rdd
             .map(lambda r:
                  list(r[:-3]) + 
                  [r.tf.indices.tolist(),
                  r.tf.values.tolist(),
                  r.idf.values.tolist()]
                 )
             .toDF()
             .withColumnRenamed("_{}".format(len(cols)),"tf_index")
             .withColumnRenamed("_{}".format(len(cols)+1),"tf_values")
             .withColumnRenamed("_{}".format(len(cols)+2),"idf_values")
             )
    # give the leading columns their original names back
    for i in range(len(cols)):
        df_data = df_data.withColumnRenamed("_{}".format(i+1),cols[i])

Fitting idf_method


In [87]:
#len([e for e in df.iloc[:,3] if 'k ' in e])
len([e for e in df.iloc[1:,3] if 'k ' in e])

2

In [97]:
df = df_tfidf.toPandas()

In [102]:
df.iloc[0,4]

SparseVector(1048576, {1097: 1.0, 1102: 1.0, 3139: 1.0, 3143: 1.0, 3240: 1.0, 3241: 1.0, 3245: 1.0, 3247: 1.0, 3362: 1.0, 3364: 1.0, 3424: 5.0, 3477: 1.0, 3480: 1.0, 3511: 1.0, 3520: 1.0, 3521: 1.0, 3556: 1.0, 3628: 2.0, 3635: 1.0, 3681: 1.0})

In [53]:
df.iloc[1,3],len(set(df.iloc[1,3])),len(df.iloc[1,3])

([u'vk',
  u'k ',
  u' c',
  u'co',
  u'om',
  u'm ',
  u' v',
  u'vk',
  u'k ',
  u' c',
  u'co',
  u'om',
  u'm ',
  u' f',
  u'fr',
  u'ri',
  u'ie',
  u'en'],
 12,
 18)

In [104]:
df.tf[1],df.idf[1]

(SparseVector(1048576, {1091: 2.0, 1094: 1.0, 1110: 1.0, 3180: 2.0, 3241: 1.0, 3276: 1.0, 3349: 2.0, 3356: 1.0, 3411: 2.0, 3424: 5.0, 3517: 1.0, 3550: 2.0, 3639: 1.0, 3765: 2.0}),
 SparseVector(1048576, {1091: 1.3863, 1094: 0.6931, 1110: 1.0986, 3180: 1.3863, 3241: 0.6931, 3276: 1.0986, 3349: 1.3863, 3356: 1.0986, 3411: 1.3863, 3424: 0.0, 3517: 0.6931, 3550: 1.3863, 3639: 1.0986, 3765: 2.1972}))

In [77]:
import numpy as np
# Hand-computed TF-IDF for the bigram "vk".
# NOTE(review): 12 and 18 presumably come from the n-gram list shown earlier (12 distinct
# bigrams, 18 in total). The Spark tf vectors above hold raw counts (e.g. 2.0), not a
# normalised frequency, and the Spark idf values above (0.6931, 1.0986, ...) are not
# reproduced by this formula - confirm the formula and the document count against
# the Spark ML IDF documentation.
tf = 2./12

idf = np.log((18.+1)/(1.+1))
# '{:3f}' sets a minimum field width of 3, not 3 decimals, so six decimals are printed.
print('"vk": tf {:3f}. idf {:3f}.  tfidf {:3f}'.format(tf,idf,tf*idf))

"vk": tf 0.166667. idf 2.251292.  tfidf 0.375215


In [106]:
# Hand-computed TF-IDF for the bigram "k " (same formula as in the "vk" check,
# with tf set to 1 and 2 in place of 1 in the idf denominator).
# NOTE(review): 18 is presumably the length of one n-gram list, not a number of
# documents - confirm against the Spark ML IDF documentation.
tf = 1.

idf = np.log((18.+1)/(2.+1))
# '{:3f}' sets a minimum field width of 3, not 3 decimals, so six decimals are printed.
print('"k ": tf {:3f}. idf {:3f}.  tfidf {:3f}'.format(tf,idf,tf*idf))

"k ": tf 1.000000. idf 1.845827.  tfidf 1.845827


In [95]:
df_tst_data


Unnamed: 0,phone_num,label,first_day,tf_index,tf_values,idf_values
0,70195908524,0,0,"[1097, 1102, 3139, 3143, 3240, 3241, 3245, 324...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.69314718056, 0.69314718056, 1.09861228867, ..."
1,70292460512,0,0,"[1091, 1094, 1110, 3180, 3241, 3276, 3349, 335...","[2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, ...","[1.38629436112, 0.69314718056, 1.09861228867, ..."
2,70292959915,0,0,"[1097, 1101, 1106, 3110, 3112, 3163, 3246, 329...","[1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, ...","[0.69314718056, 2.19722457734, 0.69314718056, ..."
3,70638960894,0,0,"[1091, 1096, 3114, 3180, 3238, 3335, 3380, 341...","[1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 5.0, ...","[0.69314718056, 1.09861228867, 1.09861228867, ..."
4,70639551572,0,0,"[1094, 1095, 1102, 1106, 3132, 3194, 3231, 323...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.69314718056, 1.09861228867, 0.69314718056, ..."
