## Cred_scor_url_titles_bow
## Анализ заголовков страниц в задаче кред скоринга
DMP-3643

In [1]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except:
    pass
conf = (SparkConf()
        .set("spark.executor.instances", 16)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
        .set("spark.executor.memory", '16g')
        .set("spark.yarn.executor.memoryOverhead", 1048)
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass

### Слова как множество: с коэф. 1

In [30]:
hashingTF = HashingTF(numFeatures = 2 ** 17)
train_data = (hc.sql('select phone_num, call_ymd, approve, bow from user_kposminin.cred_scor_title_bow1 where not approve is Null')
                .filter('call_ymd < "2016-11-01"')
                .rdd
                .map(lambda row:LabeledPoint(row.approve,hashingTF.transform(row.bow)))
                .map(lambda lp:LabeledPoint(lp.label,SparseVector(lp.features.size,lp.features.indices,[1] * len(lp.features.indices))))
             )

test_data = (hc.sql('select phone_num, call_ymd, approve, bow from user_kposminin.cred_scor_title_bow1 where not approve is Null')
                .filter('call_ymd >= "2016-11-01"')
                .rdd
                .map(lambda row:LabeledPoint(row.approve,hashingTF.transform(row.bow)))
                .map(lambda lp:LabeledPoint(lp.label,SparseVector(lp.features.size,lp.features.indices,[1] * len(lp.features.indices))))
             )
train_data.cache()

PythonRDD[525] at RDD at PythonRDD.scala:48

In [20]:
#a = train_data.take(5)
#lp = a[0]
#SparseVector(lp.features.size,lp.features.indices,[1] * len(lp.features.indices))

In [31]:
#Train NaiveBayes model

modelNB = NaiveBayes.train(train_data)

def predict_proba_NB(f,model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability. f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Function selects winning class with it probability.
    Output: tuple with model selected class number as first element (type int) and it probability as second (type float).
    '''
    logp = [[i,f.dot(model.theta[i]) + model.pi[i]] for i in range(len(model.theta))] # classes with log probabilities
    wi = sorted(logp, key = lambda e:  - e[1])[0][0] #winning index
    prob = 1./sum([np.exp(e[1] - logp[wi][1]) for e in logp]) #winning class probability
    return wi, prob

def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))


In [32]:
#LogisticRegression model
modelLR = LogisticRegressionWithSGD.train(train_data)
modelLR.clearThreshold()

#LogisticRegression model
#modelLR2 = LogisticRegressionWithSGD.train(train_data,regType = 'l1',regParam = 0.05)
#modelLR2.clearThreshold()

In [23]:
'''
ML approach
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="bow", outputCol="Features", numFeatures=2**17)
df_train_tf = hashingTF.transform(df_train)
'''

'\nML approach\nfrom pyspark.ml.feature import HashingTF, IDF, Tokenizer\nhashingTF = HashingTF(inputCol="bow", outputCol="Features", numFeatures=2**17)\ndf_train_tf = hashingTF.transform(df_train)\n'

In [33]:
df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()
df_test.shape

(72719, 3)

In [34]:
#Build AUCROC metric and print results
import sklearn,sklearn.metrics
AUCROC = {}
for c in df_test.columns:
    if c!= 'Label':
        AUCROC[c] = sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])
        
print('Methods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n'.format(df_test.size,df_test['Label'].sum()) +
     '\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC.items()]))

Methods AUCROC performance on test sample (218157 samples with 28991 positives):
LogisticRegression            0.58303
NaiveBayes                    0.62817


### 1 лучше чем tf. NaiveBayes         AUC ROC           0.62817

In [37]:
from pyspark.mllib.tree import RandomForest, RandomForestModel


# Load and parse the data file into an RDD of LabeledPoint.

clfRF = RandomForest.trainClassifier(train_data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=200, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
#clfRF.clearThreshold()

# Evaluate model on test instances and compute test error
#predictions = model.predict(testData.map(lambda x: x.features))
pred_labels = test_data.map(lambda lp: lp.label, float(clfRF.predict(lp.features))).collect()
print('Random Forest classifier AUC ROC {}'.format(
    sklearn.metrics.roc_aur_score(
        y_true  = [e[1] for e in pred_labels],
        y_score = [e[0] for e in pred_labels]
        )
    )
)

#testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
#print('Test Error = ' + str(testErr))

AttributeError: 'RandomForestModel' object has no attribute 'clearThreshold'

In [53]:
import pyspark.mllib.tree
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel. 
    '''
    
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = pyspark.mllib.tree.DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1,ntrees):
        dtm = pyspark.mllib.tree.DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees
    return scores.map(lambda x: x/ntrees)

def testError(lap):
    testErr = lap.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))

pred_labels = predict_proba(clfRF,test_data).collect()

'''
print('Random Forest classifier AUC ROC {}'.format(
    sklearn.metrics.roc_auc_score(
        y_true  = [e[1] for e in pred_labels],
        y_score = [e[0] for e in pred_labels]
        )
    ))
'''

"\nprint('Random Forest classifier AUC ROC {}'.format(\n    sklearn.metrics.roc_auc_score(\n        y_true  = [e[1] for e in pred_labels],\n        y_score = [e[0] for e in pred_labels]\n        )\n    ))\n"

In [56]:
len(pred_labels)

TypeError: object of type 'PipelinedRDD' has no len()

### TFIDF

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
#hashingTF = HashingTF(numFeatures = 2 ** 17)
train_data = (hc.sql('''select phone_num, call_ymd, approve, bow 
                        from user_kposminin.cred_scor_title_bow1 
                        where (not approve is Null)
                        and call_ymd < "2016-10-01"
                        ''')                
             )

test_data =  (hc.sql('''select phone_num, call_ymd, approve, bow 
                        from user_kposminin.cred_scor_title_bow1 
                        where (not approve is Null)
                        and call_ymd >= "2016-10-01"
                        ''')                
             )

hashingTF = HashingTF(inputCol="bow", outputCol="TF", numFeatures=2 ** 16)
train_featurizedData = hashingTF.transform(train_data)
test_featurizedData = hashingTF.transform(test_data)

idf = IDF(inputCol="TF", outputCol="TFIDF")
idfModel = idf.fit(train_featurizedData)
train_rescaledData = idfModel.transform(train_featurizedData)
test_rescaledData = idfModel.transform(test_featurizedData)

train_rescaledData.cache()
test_rescaledData.cache()



DataFrame[phone_num: string, call_ymd: string, approve: int, bow: array<string>, TF: vector, TFIDF: vector]

In [12]:

#train_rescaledData.show()
#a = train_rescaledData.toPandas()
#sv = a.iloc[0,4]
#type(sv)
#train_rescaledData["TF"].getItem('indices')
#train_rescaledData.schema
#pyspark.ml.linalg.SparseVector(sv.size,sv.indices,[1]*len(sv.indices))
#train_rescaledData.select("TF").rdd.map(lambda r: pyspark.sql.Row(ones=pyspark.ml.linalg.SparseVector(r[0].size,r[0].indices,[1]*len(r[0].indices)))).toDF().show()

In [None]:
import pyspark.sql.functions
import pyspark.sql.types
import pyspark.sql
#pyspark.sql.types
#pyspark.sql.types.VectorUDT()
#train_rescaledData.show()
#toDoublefunc = UserDefinedFunction(lambda x: x,DoubleType())
#changedTypedf = joindf.withColumn("label",toDoublefunc(joindf['show']))

#to_ones_func = pyspark.sql.functions.udf(lambda sv: pyspark.ml.linalg.SparseVector(sv.size,sv.indices,[1]*sv.size),pyspark.ml.linalg.SparseVector)

train_rescaledData.withColumn(
    "Ones", 
    train_rescaledData.select("TF").rdd.map(lambda r: pyspark.sql.Row(ones=pyspark.ml.linalg.SparseVector(r[0].size,r[0].indices,[1]*len(r[0].indices)))).toDF()['ones']
   ).show()

#train_data.withColumn("Ones", train_data["approve"]+1).show()

In [3]:
import pyspark.ml.classification 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

fout = open('results_20170706.txt','w')

classifiers = {
    'Logistic Regression': pyspark.ml.classification.LogisticRegression(maxIter=50, regParam=0.3),
    'Naive Bayes': pyspark.ml.classification.NaiveBayes(smoothing=1.0, modelType="multinomial"),
    'Random Forest': pyspark.ml.classification.RandomForestClassifier(numTrees=100, maxDepth=5, labelCol="label", seed=42)    
}

features_list = ['TFIDF','TF']

for feats in features_list:
    print('-'*70 + '\n' + feats)
    for clfname,clf in classifiers.items():
        model = clf.fit(train_rescaledData.selectExpr('approve as label',feats + ' as features'))
        test_predict = model.transform(test_rescaledData.selectExpr('approve as label',feats + ' as features'))
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
        string = '\n' + clfname + ' metrics on test'
        print(string)
        fout.write(string)
        for metric in ("areaUnderROC","areaUnderPR"):
            string = '{:>20}\t{:<10}'.format(metric, evaluator.evaluate(test_predict.selectExpr('label','rawPrediction'), {evaluator.metricName: metric}))
            print(string)
            fout.write(string)

fout.close()

----------------------------------------------------------------------
TFIDF

Naive Bayes metrics on test
        areaUnderROC	0.490816983125
         areaUnderPR	0.402173464803

Random Forest metrics on test
        areaUnderROC	0.636960561114
         areaUnderPR	0.525347154553

Logistic Regression metrics on test
        areaUnderROC	0.631053664803
         areaUnderPR	0.516976789034
----------------------------------------------------------------------
TF

Naive Bayes metrics on test
        areaUnderROC	0.49830793005
         areaUnderPR	0.406598254595

Random Forest metrics on test
        areaUnderROC	0.636960561114
         areaUnderPR	0.525347154553

Logistic Regression metrics on test
        areaUnderROC	0.631053664803
         areaUnderPR	0.516976789034


In [24]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Prepare training and test data.
#train, cv = train_rescaledData.randomSplit([0.8, 0.2], seed=12345)

lr = LinearRegression(maxIter=100)
feats = 'TFIDF'

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.5,0.1, 0.01,0.001]) \
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train_rescaledData.selectExpr('approve as label',feats + ' as features'))

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
#model.transform(test_rescaledData)\
#    .select("features", "label", "prediction")\
#    .show()
    
test_predict = model.transform(test_rescaledData.selectExpr('approve as label',feats + ' as features'))

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
#string = '\n' + clfname + ' metrics on test'
#print(string)
#        fout.write(string)

for metric in ("areaUnderROC","areaUnderPR"):
    string = '{:>20}\t{:<10}'.format(metric, evaluator.evaluate(test_predict.selectExpr('label','prediction'), {evaluator.metricName: metric}))
    print(string)
    #fout.write(string)

        areaUnderROC	0.619979838736
         areaUnderPR	0.50721843726


###Подбор параметра по кросс-валидации как-то не помог



### Добавляем факторы из боевой модели

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
create_query = '''
set hive.support.quoted.identifiers=none;
create table user_kposminin.cred_score_oldfeats_and_bow as
select a.*,
       b.`(phone_num|call_ymd|approve)?+.+`
  from user_kposminin.cred_scor_title_bow1 a
  full join user_kposminin.id_feat_ccall b on substr(a.phone_num,3,20) = substr(b.phone_mobile,2,20) and b.call_ymd = a.call_ymd
;
set hive.support.quoted.identifiers=column;
'''

#hc.sql(create_query)
#double_feats = hc.sql('select * from user_kposminin.cred_score_oldfeats_and_bow where (not phone_num is Null) and (not phone_mobile is Null)')

#hashingTF = HashingTF(numFeatures = 2 ** 17)
train_ddata = (hc.sql('''select * 
                        from user_kposminin.cred_score_oldfeats_and_bow 
                        where (not phone_num is Null) 
                        and (not phone_mobile is Null) 
                        and (not approve is Null)
                        and call_ymd < "2016-10-01"
                        ''')                
             )

test_ddata =  (hc.sql('''select * 
                        from user_kposminin.cred_score_oldfeats_and_bow 
                        where (not phone_num is Null) 
                        and (not phone_mobile is Null) 
                        and (not approve is Null)
                        and call_ymd < "2016-10-01"
                        ''')                
              )

hashingTF = HashingTF(inputCol="bow", outputCol="TF", numFeatures = 2 ** 16)
train_featurizedData = hashingTF.transform(train_ddata)
test_featurizedData = hashingTF.transform(test_ddata)

idf = IDF(inputCol="TF", outputCol="TFIDF")
idfModel = idf.fit(train_featurizedData)
train_ddata = idfModel.transform(train_featurizedData)
test_ddata = idfModel.transform(test_featurizedData)

#train_ddata.cache()
#test_ddata.cache()



In [83]:
import pyspark
import pyspark.ml.linalg as pmlla

train_w_feat = (
    train_ddata
          .rdd
          .map(lambda r:pyspark.sql.Row(label=r.approve,features = pmlla.SparseVector(
                 len(r.TFIDF) + len(r) - 9,
                 range(len(r) - 9) + [i + len(r) - 9 for i in r.TFIDF.indices],
                 r[4:5] + r[6:41] + r[42:-5] + r[-4:-2] +  tuple(r.TFIDF.values)
              )))
              )

test_w_feat = (
    test_ddata
          .rdd
          .map(lambda r:pyspark.sql.Row(label=r.approve,features = pmlla.SparseVector(
                 len(r.TFIDF) + len(r) - 9,
                 range(len(r) - 9) + [i + len(r) - 9 for i in r.TFIDF.indices],
                 r[4:5] + r[6:41] + r[42:-5] + r[-4:-2] +  tuple(r.TFIDF.values)
              )))
              )

In [86]:
import pyspark.ml.classification 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

fout = open('results_20170810.txt','w')

classifiers = {
    'Logistic Regression': pyspark.ml.classification.LogisticRegression(maxIter=50, regParam=0.3),
    'Random Forest': pyspark.ml.classification.RandomForestClassifier(numTrees=100, maxDepth=5, labelCol="label", seed=42)    
}



for clfname,clf in classifiers.items():
    model = clf.fit(train_w_feat.toDF())
    test_predict = model.transform(test_w_feat.toDF())
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    string = '\n' + clfname + ' metrics on test'
    print(string)
    fout.write(string)
    for metric in ("areaUnderROC","areaUnderPR"):
        string = '{:>20}\t{:<10}'.format(metric, evaluator.evaluate(test_predict.selectExpr('label','rawPrediction'), {evaluator.metricName: metric}))
        print(string)
        fout.write(string)
fout.close()


Random Forest metrics on test
        areaUnderROC	0.651822654377
         areaUnderPR	0.488557043134

Logistic Regression metrics on test
        areaUnderROC	0.5       
         areaUnderPR	0.677756963519


### Явно недообучается, т.к. боевая модель имеет AUC ROC ~0.69.

In [90]:
train_w_feat.toDF().toPandas().to_csv('./data/train_w_feat.csv')
test_w_feat.toDF().toPandas().to_csv('./data/test_w_feat.csv')

Py4JJavaError: An error occurred while calling o1371.collectToPython.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:258)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:254)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.foreach(SparkPlan.scala:254)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:276)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:275)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2768)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2765)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2765)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2765)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
sc.stop()