## Текстовый анализ URL в задаче lookalike

In [82]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

sc.stop()
conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
        .set("spark.executor.memory", '4g')
        .set("spark.yarn.executor.memoryOverhead", 1048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass


In [57]:
#Constants
n_list = [1,2,3,4,5]
tf_size = 2 ** 20

In [58]:
# I used alias to avoid confusion with the mllib library
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType
import urllib

In [None]:

! hadoop fs -du -h hdfs://nameservice1/user/k.p.osminin/url_text_tf_train_dir

In [None]:
start_time = datetime.datetime.now()

train_select_query = '''
select ymd,phone_num,approve_label,full_app_label,full_app_first_day,sampled,strong_sampled,urls_str 
from user_kposminin.url_text_train

'''


df1 = hc.sql(train_select_query)
# df2, idf = generate_ngram_stat_manual(df1,n_list,tf_size) # Не проработал за выходные 
df2 = generate_ngram_tf_manual(df1, n_list, tf_size)



(df2
.saveAsTextFile('url_text_tf_train_dir')
 )

#.map(lambda (k,v):','.join([str(e) for e in k]) + ',' + ';'.join([' '.join([str(ee) for ee in e]) for e in v]) + '\n')



print('train was handled in {}.'.format(datetime.datetime.now()  - start_time))


In [None]:
#prepare libsvm file. All n-grams united

# url_text_tf_train_dir columns: ymd,phone_num, approve_label,full_app_label,full_app_first_day,sampled,strong_sampled

def cast_int(v):
    try:
        return int(v)
    except:
        return None

(sc.textFile('url_text_tf_train_dir')
          .map(lambda r: (r.split(',')[2], sorted(list(set([cast_int(re.sub('[()\[\]L]','',e)) for e in r.split(',')[7::2]])))))
          .map(lambda r: ' '.join([str(r[0])] + ['{}:1'.format(e) for e in r[1] if e]).strip())
          .saveAsTextFile('url_text_train_libsvm')
)


In [61]:
#a = sc.textFile('url_text_tf_train_dir').take(2)
import re
def cast_int(v):
    try:
        return int(v)
    except:
        return None


def process_row(r):
    l = [re.sub('[()\[\]L ]','',e) for e in r.split(',')]
    feat = sorted([(cast_int(l[2*i + 7]),l[2*i + 8]) for i in range((len(l)-7)/2)])
    feat = [feat[i] for i in range(len(feat)-1) if feat[i][0] != feat[i+1][0]] + feat[-1:] #remove duplicate indices
    return  ' '.join(l[2:3] + ['{}:{}'.format(*e) for e in feat])
#process_row(r)
! hadoop fs -rm -r url_text_train_tf_libsvm

17/02/02 17:13:57 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 1440 minutes, Emptier interval = 0 minutes.
Moved: 'hdfs://nameservice1/user/k.p.osminin/url_text_train_tf_libsvm' to trash at: hdfs://nameservice1/user/k.p.osminin/.Trash/Current


In [62]:
#prepare libsvm file with tf values. All n-grams united. Positive class is approved applic
import re
def cast_int(v):
    try:
        return int(v)
    except:
        return None

def process_row(r):
    l = [re.sub('[()\[\]L ]','',e) for e in r.split(',')]
    feat = sorted([(cast_int(l[2*i + 7]),l[2*i + 8]) for i in range((len(l)-7)/2)])
    feat = [feat[i] for i in range(len(feat)-1) if feat[i][0] != feat[i+1][0]] + feat[-1:] #remove duplicate indices
    return  ' '.join(l[2:3] + ['{}:{}'.format(*e) for e in feat])

    
(sc.textFile('url_text_tf_train_dir')
          .map(process_row)
          .saveAsTextFile('url_text_train_tf_libsvm')
)


In [70]:
! rm ./external_hdfs/url_text_train_train_tf_libsvm.txt

  File: `./external_hdfs/url_text_train_train_tf_libsvm.txt'
  Size: 0         	Blocks: 0          IO Block: 1048576 regular empty file
Device: 1bh/27d	Inode: 538258290   Links: 1
Access: (0664/-rw-rw-r--)  Uid: (   99/  nobody)   Gid: (   99/  nobody)
Access: 2017-02-02 18:02:34.707000000 +0300
Modify: 2017-02-02 18:12:38.988000000 +0300
Change: 2017-02-02 18:12:38.988000000 +0300


In [66]:
#!hadoop fs -ls | grep libsvm
os.popen('hadoop fs -cat url_text_train_tf_libsvm/* > ./external_hdfs/url_text_train_tf_libsvm.txt').read() # Скидываем в один файл и на локаль

#!hadoop fs -du -h | grep url_text_train_libsvm.txt
#! du -ah ./data | grep libsvm
#! wc -l ./data/url_text_train_libsvm.txt # 307840
#! head -250000 ./data/url_text_train_libsvm.txt > ./external_hdfs/url_text_train_train_libsvm.txt
#! tail -57000 ./data/url_text_train_libsvm.txt > ./external_hdfs/url_text_valid_libsvm.txt
! head -250000 ./external_hdfs/url_text_train_tf_libsvm.txt > ./external_hdfs/url_text_train_train_tf_libsvm.txt
! tail -57000 ./external_hdfs/url_text_train_tf_libsvm.txt > ./external_hdfs/url_text_valid_tf_libsvm.txt

/bin/sh: ./external_hdfs/url_text_train_train_tf_libsvm.txt: Permission denied
/bin/sh: ./external_hdfs/url_text_valid_tf_libsvm.txt: Permission denied


In [10]:
import xgboost as xgb
dtrain = xgb.DMatrix('./external_hdfs/url_text_train_train_libsvm.txt')
dvalid = xgb.DMatrix('./external_hdfs/url_text_valid_libsvm.txt')

In [18]:
param = {'bst:max_depth':5, 'silent':1, 'objective':'binary:logistic' }

param['eval_metric'] = 'auc'

evallist  = [(dvalid,'eval'), (dtrain,'train')]

In [19]:
num_round = 50
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-auc:0.563897	train-auc:0.558476
[1]	eval-auc:0.563762	train-auc:0.563430
[2]	eval-auc:0.575056	train-auc:0.579148
[3]	eval-auc:0.577535	train-auc:0.583780
[4]	eval-auc:0.577546	train-auc:0.590134
[5]	eval-auc:0.594114	train-auc:0.611501
[6]	eval-auc:0.593749	train-auc:0.619849
[7]	eval-auc:0.597691	train-auc:0.639583
[8]	eval-auc:0.602108	train-auc:0.653352
[9]	eval-auc:0.602476	train-auc:0.661798
[10]	eval-auc:0.609029	train-auc:0.672112
[11]	eval-auc:0.619646	train-auc:0.681447
[12]	eval-auc:0.623031	train-auc:0.691910
[13]	eval-auc:0.627075	train-auc:0.702705
[14]	eval-auc:0.627038	train-auc:0.707417
[15]	eval-auc:0.625863	train-auc:0.713642
[16]	eval-auc:0.627058	train-auc:0.717362
[17]	eval-auc:0.629855	train-auc:0.724428
[18]	eval-auc:0.628943	train-auc:0.729411
[19]	eval-auc:0.629905	train-auc:0.733581
[20]	eval-auc:0.624326	train-auc:0.739696
[21]	eval-auc:0.628357	train-auc:0.743814
[22]	eval-auc:0.628884	train-auc:0.746629
[23]	eval-auc:0.626559	train-auc:0.750020
[2

In [16]:
xgboost [35]	eval-auc:0.6407 на валидирующей выборке

In [6]:
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
#mem = Memory("./mycache")


def get_data(filename):
    data = load_svmlight_file(filename)
    return data[0], data[1]

X, y = get_data('./external_hdfs/url_text_train_train_libsvm.txt')

In [14]:
Xt, yt = get_data('./external_hdfs/url_text_valid_libsvm.txt')

In [13]:
import sklearn
from sklearn import naive_bayes
clfNB = sklearn.naive_bayes.BernoulliNB()
clfNB.fit(X,y)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [29]:
from sklearn.metrics import roc_auc_score,precision_recall_curve,auc

#pt = clfNB.predict_proba(Xt)

print('NaiveBayes Validation AUCROC {}. AUC PR {}.'.format(
        roc_auc_score(y_true = yt, y_score = pt[:,1]),
        auc(*precision_recall_curve(y_true = yt, probas_pred  = pt[:,1])[:2],reorder = True)
     ))

NaiveBayes Validation AUCROC 0.574530107059. AUC PR 0.0826299541565.


In [37]:
import sklearn
from sklearn.linear_model import LogisticRegression
clfLR = LogisticRegression(penalty='l1', C=0.4)
clfLR.fit(X,y)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
pt1 = clfLR.predict_proba(Xt)

print('LogisticRegression Validation AUCROC {}. AUC PR {}.'.format(
        roc_auc_score(y_true = yt, y_score = pt1[:,1]),
        auc(*precision_recall_curve(y_true = yt, probas_pred  = pt1[:,1])[:2],reorder = True)
     ))

LogisticRegression Validation AUCROC 0.564393834024. AUC PR 0.0733905747138.


In [None]:
print(datetime.datetime.now())

In [None]:
df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()
print(datetime.datetime.now())

In [None]:
#Build AUCROC metric and print results
import sklearn
AUCROC = {}
for c in df_test.columns:
    if c!= 'Label':
        AUCROC[c] = sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])
        
print('Methods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n'.format(len(df_test),df_test['Label'].sum()) +
     '\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC.items()]))
print(datetime.datetime.now())

In [None]:
train_conf_str = '''
# task type, support train and predict
task = train

# boosting type, support gbdt for now, alias: boosting, boost
boosting_type = gbdt

# application type, support following application
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# alias: application, app
objective = binary

# eval metrics, support multi metric, delimite by ',' , support following metrics
# l1 
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc 
# binary_logloss , default metric for binary
# binary_error
metric = auc,binary_logloss

# frequence for metric output
metric_freq = 1

# true if need output metric for training data, alias: tranining_metric, train_metric
is_training_metric = true

# number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
max_bin = 255

# training data
# if exsting weight file, should name to "binary.train.weight"
# alias: train_data, train
#data = binary.train

# validation data, support multi validation data, separated by ','
# if exsting weight file, should name to "binary.test.weight"
# alias: valid, test, test_data, 
#valid_data = binary.test

# number of trees(iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 100

# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.05

# number of leaves for one tree, alias: num_leaf
num_leaves = 63

# type of tree learner, support following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = data

# number of threads for multi-threading. One thread will use one CPU, defalut is setted to #cpu. 
# num_threads = 8

# feature sub-sample, will random select 80% feature to train on each iteration 
# alias: sub_feature
feature_fraction = 0.8

#classes are unbalanced
is_unbalance = true

# Support bagging (data sub-sample), will perform bagging every 5 iterations
bagging_freq = 5

# Bagging farction, will random select 80% data on bagging
# alias: sub_row
bagging_fraction = 0.8

# minimal number data for one leaf, use this to deal with over-fit
# alias : min_data_per_leaf, min_data
min_data_in_leaf = 5

# minimal sum hessians for one leaf, use this to deal with over-fit
#min_sum_hessian_in_leaf = 5.0

# save memory and faster speed for sparse feature, alias: is_sparse
is_enable_sparse = true

# when data is bigger than memory size, set this to true. otherwise set false will have faster speed
# alias: two_round_loading, two_round
use_two_round_loading = true

# true if need to save data to binary file and application will auto load data from binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false

# output model file
output_model = lgbm.model

'''

lgbm_dir = './lgbm'
import os
os.popen('mkdir -p '+lgbm_dir).read()
open(lgbm_dir + '/train.conf','w').write(train_conf_str)

In [None]:
#! ls /data1/share/kosm/data
train_file_path = '/data1/share/kosm/data/train_data.txt'
test_file_path = '/data1/share/kosm/data/test_data_all.txt'
test1_file_path = '/data1/share/kosm/data/url_text_test_data_sampled.txt' # Тест с исходной долей положительного класса
lgbm_exec_path = "/data1/share/LightGBM/lightgbm"

In [None]:
#os.popen(lgbm_exec_path +' config=' + lgbm_dir + '/train.conf data=' + train_file_path + ' valid=' + test_file_path + '> ' + lgbm_dir + '/log').read()
#print(os.popen('tail -200 ' + lgbm_dir + '/log').read())

In [90]:
#a = sc.textFile('url_text_tf_train_dir').map(lambda r:(r.split(',')[0],1)).reduceByKey(lambda v1,v2:v1+v2).collect()
#sorted(a)
re.sub("[u()\[\]L \']",'',a[0][0])

u'2016-05-13'

In [None]:
#prepare libsvm file with tf values. All n-grams united. Positive class is approved applic
import re
def cast_int(v):
    try:
        return int(v)
    except:
        return None

def process_row(r):
    l = [re.sub('[()\[\]L ]','',e) for e in r.split(',')]
    feat = sorted([(cast_int(l[2*i + 7]),l[2*i + 8]) for i in range((len(l)-7)/2)])
    feat = [feat[i] for i in range(len(feat)-1) if feat[i][0] != feat[i+1][0]] + feat[-1:] #remove duplicate indices
    return  ' '.join(l[2:3] + ['{}:{}'.format(*e) for e in feat])

    
(sc.textFile('url_text_tf_train_dir')
          .filter(lambda row: row.split(',')[3].strip() == '1')
          .filter(lambda row: re.sub("[u()\[\]L \']",'',row.split(',')[0]) < '2016-11-01')
          .map(process_row)
          .saveAsTextFile('url_text_train_tf_libsvm')
)

In [94]:
#z = sc.textFile('url_text_tf_train_dir').take(1)
# url_text_tf_train_dir columns: ymd,phone_num, approve_label,full_app_label,full_app_first_day,sampled,strong_sampled

###Подготовить выборку полных заявок против одобренных. Признаки - tf.

In [93]:
print(datetime.datetime.now())
start = datetime.datetime.now()
(sc.textFile('url_text_tf_train_dir')
          .filter(lambda row: row.split(',')[3].strip() == '1')
          .filter(lambda row: re.sub("[u()\[\]L \']",'',row.split(',')[0]) < '2016-11-01')
          .map(process_row)
          .saveAsTextFile('url_text_fullapp_train_tf_libsvm')
)
print(datetime.datetime.now() - start)
start = datetime.datetime.now()
(sc.textFile('url_text_tf_train_dir')
          .filter(lambda row: row.split(',')[3].strip() == '1')
          .filter(lambda row: re.sub("[u()\[\]L \']",'',row.split(',')[0]) >= '2016-11-01')
          .map(process_row)
          .saveAsTextFile('url_text_fullapp_test_tf_libsvm')
)
print(datetime.datetime.now() - start)


2017-02-07 12:20:15.434579
0:44:17.318954
0:24:22.569780


In [95]:
os.popen('hadoop fs -cat url_text_fullapp_train_tf_libsvm/* > ./external_hdfs/url_text_fullapp_train_tf_libsvm.txt').read()
os.popen('hadoop fs -cat url_text_fullapp_test_tf_libsvm/* > ./external_hdfs/url_text_fullapp_test_tf_libsvm.txt').read()

''

In [134]:
! ls ./external_hdfs | grep fullapp
#! rm -rf ./external_hdfs/url_text_fullapp_test_tf_libsvm

#! mkdir ./external_hdfs/tst
#! echo 213 > ./external_hdfs/tst/tst.txt
#! rm -rf ./external_hdfs/tst

#os.popen('rm -rf ./external_hdfs/url_text_fullapp_test_tf_libsvm').read()
#os.popen('hadoop fs -rm -r url_text_fullapp_train_tf_libsvm').read()


url_text_fullapp_test_tf_libsvm.txt
url_text_fullapp_train_tf_libsvm.txt


In [8]:
#! wc -l ./external_hdfs/url_text_fullapp_train_tf_libsvm.txt
os.popen('''perl -ne 'print substr($_,0,1) . "\n"' ./external_hdfs/url_text_fullapp_train_tf_libsvm.txt | grep 1 | wc -l''').read()


'12746\n'

In [1]:
import xgboost as xgb


In [2]:
dtrain = xgb.DMatrix('./external_hdfs/url_text_fullapp_train_tf_libsvm.txt')
dtest = xgb.DMatrix('./external_hdfs/url_text_fullapp_test_tf_libsvm.txt') 

In [15]:
param = {'bst:max_depth':6, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic', 'alpha':1, 'tree_method':'approx','scale_pos_weight':8}
#param['eval_metric'] = ['auc','logloss']
evallist  = [(dtest,'eval'), (dtrain,'train')]


In [31]:
num_round = 400
bst = xgb.train( param, dtrain, num_round, evallist )

[0]	eval-error:0.488034	train-error:0.438163
[1]	eval-error:0.516251	train-error:0.442425
[2]	eval-error:0.507632	train-error:0.434123
[3]	eval-error:0.485892	train-error:0.416090
[4]	eval-error:0.480619	train-error:0.408499
[5]	eval-error:0.467113	train-error:0.391105
[6]	eval-error:0.458012	train-error:0.385884
[7]	eval-error:0.452788	train-error:0.378035
[8]	eval-error:0.459601	train-error:0.370284
[9]	eval-error:0.443519	train-error:0.351319
[10]	eval-error:0.439522	train-error:0.345050
[11]	eval-error:0.431674	train-error:0.336056
[12]	eval-error:0.428448	train-error:0.332274
[13]	eval-error:0.418745	train-error:0.322560
[14]	eval-error:0.416265	train-error:0.316461
[15]	eval-error:0.413930	train-error:0.313300
[16]	eval-error:0.403265	train-error:0.305761
[17]	eval-error:0.400111	train-error:0.302361
[18]	eval-error:0.400039	train-error:0.299901
[19]	eval-error:0.391636	train-error:0.293890
[20]	eval-error:0.388193	train-error:0.288945
[21]	eval-error:0.385112	train-error:0.28350

In [30]:
#help(xgb.cv)
res = xgb.cv(param, dtrain,num_boost_round = 300, nfold = 5, metrics = ['auc'],show_progress = True,early_stopping_rounds = 5)
print(res)

Will train until cv error hasn't decreased in 5 rounds.
[0]	cv-test-auc:0.6444712+0.00329681727731	cv-train-auc:0.6605436+0.0045101052582
[1]	cv-test-auc:0.6826206+0.00590037139848	cv-train-auc:0.7083654+0.00686434304504
[2]	cv-test-auc:0.6961196+0.00776884801241	cv-train-auc:0.7282232+0.0103359824574
[3]	cv-test-auc:0.7117018+0.00651794327683	cv-train-auc:0.7468652+0.00830747175499
[4]	cv-test-auc:0.7231412+0.00676834932314	cv-train-auc:0.762578+0.00829820337181
[5]	cv-test-auc:0.732629+0.00609884585803	cv-train-auc:0.7743148+0.00838997872226
[6]	cv-test-auc:0.7403924+0.00685325127512	cv-train-auc:0.7852554+0.0112800877585
[7]	cv-test-auc:0.7452848+0.00598923158343	cv-train-auc:0.7932248+0.0114380644936
[8]	cv-test-auc:0.7503126+0.00544879265893	cv-train-auc:0.8006922+0.0105570273922
[9]	cv-test-auc:0.7565064+0.00473033159937	cv-train-auc:0.8097298+0.00944158695136
[10]	cv-test-auc:0.7619796+0.00331857473021	cv-train-auc:0.816801+0.00883543343589
[11]	cv-test-auc:0.7669514+0.002080889

     test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0         0.644471      0.003297        0.660544       0.004510
1         0.682621      0.005900        0.708365       0.006864
2         0.696120      0.007769        0.728223       0.010336
3         0.711702      0.006518        0.746865       0.008307
4         0.723141      0.006768        0.762578       0.008298
5         0.732629      0.006099        0.774315       0.008390
6         0.740392      0.006853        0.785255       0.011280
7         0.745285      0.005989        0.793225       0.011438
8         0.750313      0.005449        0.800692       0.010557
9         0.756506      0.004730        0.809730       0.009442
10        0.761980      0.003319        0.816801       0.008835
11        0.766951      0.002081        0.823977       0.007974
12        0.771901      0.003220        0.831645       0.007515
13        0.775988      0.002208        0.837175       0.007547
14        0.780109      0.002678        

[299]	cv-test-auc:0.9061806+0.0035495046753	cv-train-auc:0.9999774+1.85472369907e-06


In [23]:
res.shape

(40, 4)

In [25]:
sc.stop()