## Кредитный скоринг
### Разметка выборки

In [None]:
import datetime
# Record when the notebook was run.
_now = datetime.datetime.now()
print('Now is {}'.format(_now))

In [1]:
#### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

# Hive / MapReduce session settings. Each entry becomes one "set <entry>;" line;
# the resulting script starts with a newline and has one statement per line, so
# it can be split on ';' and sent to Hive statement by statement.
_hive_settings = (
    'hive.vectorized.execution.enabled=true',
    'hive.vectorized.execution.reduce.enabled = true',
    'mapreduce.map.memory.mb=4096',
    'mapreduce.map.child.java.opts=-Xmx4g',
    'mapreduce.task.io.sort.mb=1024',
    'mapreduce.reduce.child.java.opts=-Xmx4g',
    'mapreduce.reduce.memory.mb=7000',
    'mapreduce.reduce.shuffle.input.buffer.percent=0.5',
    'mapreduce.input.fileinputformat.split.minsize=536870912',
    'mapreduce.input.fileinputformat.split.maxsize=1073741824',
    'hive.optimize.ppd=true',
    'hive.merge.smallfiles.avgsize=536870912',
    'hive.merge.mapredfiles=true',
    'hive.merge.mapfiles=true',
    'hive.hadoop.supports.splittable.combineinputformat=true',
    'hive.exec.reducers.bytes.per.reducer=536870912',
    'hive.exec.parallel=true',
    'hive.exec.max.created.files=10000000',
    'hive.exec.compress.output=true',
    'hive.exec.dynamic.partition.mode=nonstrict',
    'hive.exec.max.dynamic.partitions=1000000',
    'hive.exec.max.dynamic.partitions.pernode=100000',
    'io.seqfile.compression.type=BLOCK',
    'mapreduce.map.failures.maxpercent=5',
)
hive_config_query = '\n' + ''.join('set {};\n'.format(_s) for _s in _hive_settings)
# Stop a SparkContext left over from an earlier run of this cell. On the first
# run `sc` does not exist yet, so the resulting NameError is ignored.
try:
    sc.stop()
except NameError:
    pass

# Settings are applied one at a time through SparkConf.set.
# NOTE(review): spark.driver.memory is set here at runtime; confirm that it
# actually takes effect in this deployment mode.
_spark_settings = [
    ("spark.executor.instances", 10),
    ("spark.driver.maxResultSize", "26g"),
    ('spark.driver.memory', '26g'),
    ("spark.executor.memory", '6g'),
    ("spark.yarn.executor.memoryOverhead", 1048),
    ('spark.akka.frameSize', 2040),
]
conf = SparkConf()
for _key, _value in _spark_settings:
    conf.set(_key, _value)

# Fresh contexts used by every later cell.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the Hive session settings one statement at a time. This stays
# best-effort (a rejected setting must not abort the notebook), but failures
# are reported instead of being silently swallowed, and the blank fragments
# produced by split(';') are no longer sent to Hive.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        continue
    try:
        hc.sql(q)
    except Exception as e:
        print('Hive setting skipped ({}): {}'.format(q, e))

In [None]:
# If the SparkContext above fails to start, use the following instead (remove the --driver-memory 4g option if it is present):
#import os
#os.environ['PYSPARK_SUBMIT_ARGS'] = '''
#--master yarn --deploy-mode client --num-executors 2 --executor-memory 8g --executor-cores 1 --conf spark.yarn.queue=kposminin
#  pyspark-shell
#'''

#### Создаем таблицу визитов

#### Обсчитываем признаки

In [None]:
# HiveQL script that builds the feature tables, one `create table` per step:
#   _1        per (phone_mobile, call_ymd, urlfr) visit aggregates over visits
#             dated before call_ymd and less than 180 days before it
#   _2        _1 joined with three per-urlfr scores and the yaca section prefix
#   _3, _4    visit counts per (phone, call, yaca_ind) and per (phone, call)
#   _5        per (phone, call) aggregates, including three 5-element
#             percentile arrays q_arr_score1..3
#   _5_part2  per (phone, call) share / ratio features
#   _6        yaca_str: space-separated "yaca_ind:share" pairs
#   _scoring  final join of _5, _6 and _5_part2
#
# Fixes relative to the previous version (feature definitions are unchanged):
#   * the free-text note before table _5 is now a `--` SQL comment; it used to
#     start with `#`, which is not a HiveQL comment marker;
#   * the joins in the final statement are qualified with `user_kposminin.`,
#     the schema those tables are created in.
#
# NOTE(review): vk_share uses rlike with a trailing '%'. In a regular
#   expression '%' is a literal character, not a wildcard, so this pattern
#   probably matches almost nothing. Left as is because changing it changes the
#   feature; confirm against how the model was trained.
# NOTE(review): work_hours_hits_share divides a sum of cnt by the row count
#   (sum(1)), so it is not bounded by 1 - confirm that this is intended.
gen_features_query = '''

-- -- user_kposminin.cred_app_visits -- --

-- select ymd,count(*) from cred_app_visits group by ymd order by ymd;
-- user_kposminin.ccall_visits_aza_test_20170309

create table user_kposminin.ccall_sc_aza_20170309_1 as
select 
  phone_mobile, 
  call_ymd,
  (unix_timestamp(max(ymd), 'yyyy-MM-dd') - unix_timestamp(min(ymd), 'yyyy-MM-dd'))/60/60/24 as ymd_range,
  stddev(unix_timestamp(ymd, 'yyyy-MM-dd')/60/60 + avg_hour) as time_std,
  count(distinct ymd) as ymd_cnt,
  count(distinct id) as id_cnt,
  avg(avg_hour) as avg_hour,
  percentile_approx(avg_hour,0.1) as avg_hour_q10,
  percentile_approx(avg_hour,0.9) as avg_hour_q90,
  urlfr,
  count(*) as cnt,
  sum(cnt) as hits,
  avg(duration) as avg_duration
from 
  user_kposminin.ccall_visits_aza_test_20170309  v
where
  call_ymd > ymd and call_ymd < date_add(ymd,180)
group by
  phone_mobile, 
  call_ymd,
  urlfr
;

create table user_kposminin.ccall_sc_aza_20170309_2 as 
  select 
     v.phone_mobile,     
     v.call_ymd,
     v.urlfr,
     log((t1.cnt_positive + 1)/(t1.cnt_total - t1.cnt_positive + 1)) as score1,
     t2.score as score2,
     t3.score as score3,
     v.cnt,
     v.hits,
     v.avg_duration,
     v.time_std, 
     v.ymd_range, 
     v.avg_hour,
     v.avg_hour_q10, 
     v.avg_hour_q90, 
     v.ymd_cnt,
     substr(y.section_ind, 0, 6) as yaca_ind   
  from
     user_kposminin.ccall_sc_aza_20170309_1 v
     left join user_kposminin.urlfr_tgt_cnt_ccall_20161201 t1 on t1.urlfr = v.urlfr
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_approved_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t2 on t2.urlfr = v.urlfr
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_complete_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t3 on t3.urlfr = v.urlfr
     left join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr

;


create table user_kposminin.ccall_sc_aza_20170309_3 as 
select
  phone_mobile                   as phone_mobile,
  call_ymd                       as call_ymd, 
  yaca_ind                       as yaca_ind,
  sum(cnt)                       as visits_cnt
from user_kposminin.ccall_sc_aza_20170309_2 a
group by
  phone_mobile, call_ymd, yaca_ind 
;


create table user_kposminin.ccall_sc_aza_20170309_4 as 
select
  phone_mobile                   as phone_mobile,
  call_ymd                       as call_ymd, 
  sum(cnt)                       as visits_cnt
from user_kposminin.ccall_sc_aza_20170309_2 a
group by
  phone_mobile, call_ymd
;


-- Здесь не хочет считать. Помогает, если таблицу user_kposminin.cc_sc_tr_2_t  хранить в формате bzip2 и выставить настройки :


create table user_kposminin.ccall_sc_aza_20170309_5 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  count(*) as cnt, 
  sum(cnt) as visits_cnt, 
  sum(hits) as hits, 
  avg(avg_duration) as avg_duration, 
  avg(time_std) as avg_time_std, 
  avg(ymd_range) as avg_ymd_range, 
  avg(ymd_cnt) as avg_ymd_cnt, 
  avg(avg_hour) as avg_hour, 
  avg(avg_hour_q10) as avg_hour_q10, 
  avg(avg_hour_q90) as avg_hour_q90, 
  max(score1) as max_score1, 
  avg(score1) as avg_score1, 
  percentile_approx(score1,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score1, 
  max(score2) as max_score2, 
  avg(score2) as avg_score2, 
  percentile_approx(score2,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score2, 
  max(score3) as max_score3,
  avg(score3) as avg_score3, 
  percentile_approx(score3,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score3 
from user_kposminin.ccall_sc_aza_20170309_2 a 
group by a.phone_mobile, a.call_ymd
;

create table user_kposminin.ccall_sc_aza_20170309_5_part2 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  sum(if(urlfr like 'e.mail.ru%',1,0)) as emailru,
  sum(if(urlfr like 'm.%',1,0))/sum(1) as mobile_share,
  sum(if(urlfr rlike '^(m\\.)?vk.com%', 1, 0))/sum(1) as vk_share,
  sum(if(urlfr like 'vk.com%' or urlfr rlike '^(m\\.)?ok\\.ru' or urlfr like 'm.odnoklassniki.ru%' or urlfr rlike '^(m\\.)?my.mail.ru',1,0))/sum(1) as social_share,

  sum(if(avg_hour >= 9 and avg_hour <= 20,cnt,0))/sum(1) as work_hours_hits_share,
  stddev(avg_hour) as hour_std,  
  count( if(score1 > 1, urlfr,Null))/sum(1) as good_urlfr_share_score1,
  count( if(score2 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score2,
  count( if(score3 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score3,
  avg( if(score1 > 1, time_std ,Null)) as good_urlfr_timestd_score1,
  max(
             named_struct(
             'score1', score1,
             'time_std', time_std
             )           
     ).time_std as max_urlfr_time_std_1
from user_kposminin.ccall_sc_aza_20170309_2 a 
group by a.phone_mobile, a.call_ymd
;


create table user_kposminin.ccall_sc_aza_20170309_6 as 
select
  b.phone_mobile                 as phone_mobile,
  b.call_ymd                     as call_ymd, 
  concat_ws(" ",sort_array(collect_list(concat(b.yaca_ind,":",format_number(b.visits_cnt/greatest(c.visits_cnt,cast(1 as bigint)),5))))) as yaca_str
  
from user_kposminin.ccall_sc_aza_20170309_3 b 
  left join user_kposminin.ccall_sc_aza_20170309_5 c on c.phone_mobile = b.phone_mobile and c.call_ymd = b.call_ymd
group by
  b.phone_mobile, b.call_ymd ;
 

create table user_kposminin.ccall_sc_aza_20170309_scoring as
select
  a.*,
  c.emailru, 
  c.mobile_share, 
  c.vk_share, 
  c.social_share,
  c.work_hours_hits_share, 
  c.hour_std, 
  c.good_urlfr_share_score1, 
  c.good_urlfr_share_score2, 
  c.good_urlfr_share_score3, 
  c.good_urlfr_timestd_score1, 
  c.max_urlfr_time_std_1, 
  b.yaca_str
from
  user_kposminin.ccall_sc_aza_20170309_5 a
  left join user_kposminin.ccall_sc_aza_20170309_6 b on b.phone_mobile = a.phone_mobile and b.call_ymd = a.call_ymd
  left join user_kposminin.ccall_sc_aza_20170309_5_part2 c on c.phone_mobile = a.phone_mobile and c.call_ymd = a.call_ymd
;

'''

#### Загрузить и обработать таблицу с признаками

In [None]:
# Column names of the flattened feature table, in order: key and aggregate
# columns, then max / avg / five quantiles for each of the three scores,
# then the share features and the yaca string.
_leading_cols = [
    u'phone_mobile', u'call_ymd', u'cnt', u'visits_cnt', u'hits',
    u'avg_duration', u'avg_time_std', u'avg_ymd_range', u'avg_ymd_cnt',
    u'avg_hour', u'avg_hour_q10', u'avg_hour_q90',
]
_score_stats = (u'max', u'avg', u'q95', u'q90', u'q70', u'q50', u'q30')
_score_cols = list(
    u'{}_score{}'.format(_stat, _n)
    for _n in (1, 2, 3)
    for _stat in _score_stats
)
_trailing_cols = [
    u'emailru', u'mobile_share', u'vk_share', u'social_share',
    u'work_hours_hits_share', u'hour_std',
    u'good_urlfr_share_score1', u'good_urlfr_share_score2',
    u'good_urlfr_share_score3', u'good_urlfr_timestd_score1',
    u'max_urlfr_time_std_1', u'yaca_str',
]
cols = _leading_cols + _score_cols + _trailing_cols

In [None]:
#### ND таблица считается так сложно, т.к. поле default_flg зачастую нулевое и выбрасывает ошибку, что не может определить тип поля по первым 100 строкам

In [7]:

#raw_cols  = hc.sql('select * from  user_kposminin.cred_app_scoring_2 limit 1').collect()[0].__fields__
#hc.sql('set hive.support.quoted.identifiers=none')
def _flatten_scoring_row(r, n_quantiles=5):
    """Flatten one row of cred_app_scoring_2 into a fixed-width list.

    Fields 16, 19 and 22 are arrays that are expanded in place (presumably the
    q_arr_score1..3 percentile arrays, five quantiles each - confirm against
    the table schema). A missing or empty array is replaced by ``n_quantiles``
    NaNs; previously it contributed no fields at all, so such rows came out
    shorter than the others and every later column shifted. Fields 23-25 and
    35 are dropped. A falsy first field becomes None.

    NOTE(review): a numeric label of 0 in the first field is falsy as well and
    is therefore turned into None - confirm the type of default_flg.
    """
    def expand(arr):
        return list(arr) if arr else [float('nan')] * n_quantiles

    return ([r[0] if r[0] else None] + list(r[1:16]) + expand(r[16])
            + list(r[17:19]) + expand(r[19])
            + list(r[20:22]) + expand(r[22])
            + list(r[26:35]) + list(r[36:]))


# default_flg is filled with -1 before the round trip through an RDD: the field
# is often null, and toDF() cannot infer a column type when the first 100 rows
# hold only nulls.
df_all = (hc.sql('select * from user_kposminin.cred_app_scoring_2 a')
        .repartition(20)
        .fillna(-1, subset = ['default_flg'])
        .rdd
        .map(_flatten_scoring_row)
        .toDF()
        .toPandas()
         )
#hc.sql('set hive.support.quoted.identifiers=column')


Py4JJavaError: An error occurred while calling o258.collectToPython.
: org.apache.spark.SparkException: Job 8 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:806)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:804)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:804)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1581)
	at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1731)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1229)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1730)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:147)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply$mcI$sp(DataFrame.scala:1778)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply(DataFrame.scala:1778)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply(DataFrame.scala:1778)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
	at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2125)
	at org.apache.spark.sql.DataFrame.collectToPython(DataFrame.scala:1777)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
def try_float(v):
    """Convert *v* to float, returning None when it is not a valid number.

    Only conversion failures are swallowed (bad text such as '' or '\\N', None
    or other non-numeric types, integers too large for a float). Anything else,
    e.g. KeyboardInterrupt, propagates instead of being hidden by a bare except.
    """
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        return None

def _parse_scoring_line(line, n_quantiles=5):
    """Parse one raw text line of cred_app_scoring_2 into a fixed-width list.

    Fields are separated by '\\x01'. Fields 16, 19 and 22 hold '\\x02'-separated
    arrays that are expanded in place. A missing array (empty field, or the
    '\\N' marker that Hive's default text serialization writes for NULL -
    confirm for this table) is replaced by ``n_quantiles`` Nones; previously it
    contributed zero or one field, so such rows came out shorter and every
    later column shifted. Fields 23-25 and 35 are dropped. The first four and
    the last output fields stay strings; all others go through try_float.
    """
    r = line.split('\x01')

    def expand(field):
        if not field or field == '\\N':
            return [None] * n_quantiles
        return field.split('\x02')

    flat = ([r[0] if r[0] else None] + list(r[1:16]) + expand(r[16])
            + list(r[17:19]) + expand(r[19])
            + list(r[20:22]) + expand(r[22])
            + list(r[26:35]) + list(r[36:]))
    return list(flat[:4]) + [try_float(e) for e in flat[4:-1]] + [flat[-1]]


# Read the table's files directly and collect the parsed rows on the driver
# (a plain list of lists, not a DataFrame).
df_all = (sc.textFile('/user/hive/warehouse/user_kposminin.db/cred_app_scoring_2')
        .map(_parse_scoring_line)
        .collect()
         )
len(df_all)

In [None]:
def _flatten_scoring_row(r, n_quantiles=5):
    """Flatten one row of cred_app_scoring_2 into a fixed-width list.

    Fields 16, 19 and 22 are arrays that are expanded in place. A missing or
    empty array is replaced by ``n_quantiles`` NaNs; previously it contributed
    no fields, so such rows came out shorter than the column list assigned
    below. Fields 23-25 and 35 are dropped. A falsy first field becomes None.

    NOTE(review): a numeric label of 0 in the first field is falsy as well and
    is therefore turned into None - confirm the type of default_flg.
    """
    def expand(arr):
        return list(arr) if arr else [float('nan')] * n_quantiles

    return ([r[0] if r[0] else None] + list(r[1:16]) + expand(r[16])
            + list(r[17:19]) + expand(r[19])
            + list(r[20:22]) + expand(r[22])
            + list(r[26:35]) + list(r[36:]))


# Field names of the raw table, taken from a single sample row.
raw_cols  = hc.sql('select * from  user_kposminin.cred_app_scoring_2 limit 1').collect()[0].__fields__

# Map over the underlying RDD explicitly, as the other loading cell of this
# notebook does, instead of relying on DataFrame.map.
df_all = (hc.sql('select * from user_kposminin.cred_app_scoring_2')
        .rdd
        .map(_flatten_scoring_row)
        .toDF()
        .toPandas()
         )
df_all.columns = ['default_flg','financial_product_type_cd'] + cols 
# Model features: everything after the label, product code, phone and call
# date, up to but excluding the trailing yaca_str.
feat_cols = df_all.columns[4:-1]

In [None]:
# Import data types
from pyspark.sql.types import *

# NOTE(review): this rebinds df_all to a raw RDD. Cells that expect the pandas
# frame (df_all.columns, df_all['yaca_str'], ...) will fail if this cell runs
# after the loading cell - confirm whether the assignment is still wanted.
df_all = (hc.sql('select * from user_kposminin.cred_app_scoring_2').rdd)

# Scratch example of applying an explicit schema to an RDD. It now goes through
# the HiveContext `hc` that this notebook creates; the previous version used
# `spark`, a name that is never defined here.

# Load a text file and split each line into fields.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda line: line.split(","))
# Each line becomes a (name, age) tuple of strings.
people = parts.map(lambda p: (p[0], p[1].strip()))

# The schema is encoded in a string: one nullable string field per name.
schemaString = "name age"

fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)

# Apply the schema to the RDD.
schemaPeople = hc.createDataFrame(people, schema)

# Register the DataFrame as the temporary table "people".
hc.registerDataFrameAsTable(schemaPeople, "people")

In [None]:
# Feature columns: skip the first four columns and the last one.
feat_cols = df_all.columns[slice(4, -1)]
#label     = 'approve'


### Факторы Я.каталога

In [None]:
import cPickle
from sklearn.feature_extraction import DictVectorizer
# The vectorizer was saved with:
#   cPickle.dump(v, open('data/ccall_scoring_dict_vectorizer', 'w'))
# Load it through a context manager so the file handle is closed right away.
# The text mode 'r' is kept on purpose to mirror the 'w' used for the dump.
# NOTE: unpickling executes code stored in the file; only load trusted files.
with open('data/ccall_scoring_dict_vectorizer', 'r') as _vectorizer_file:
    v1 = cPickle.load(_vectorizer_file)
type(v1)

In [None]:
def _parse_yaca_str(s):
    """Turn a 'key:value key:value ...' string into a {key: float(value)} dict.

    An empty or missing string yields an empty dict.
    """
    if not s:
        return {}
    weights = {}
    for pair in s.split(' '):
        pieces = pair.split(':')
        weights[pieces[0]] = float(pieces[1])
    return weights


# Vectorize yaca_str with the already fitted vectorizer (transform only; the
# training notebook used fit_transform on df_train_all instead).
df_all_yaca_dense = v1.transform(df_all['yaca_str'].map(_parse_yaca_str))

In [None]:
# One DataFrame column per vectorizer output column, named yaca_0, yaca_1, ...
_n_yaca = df_all_yaca_dense.shape[1]
yaca_cols = list(map('yaca_{}'.format, range(_n_yaca)))
for _col_idx, _col_name in enumerate(yaca_cols):
    df_all.loc[:, _col_name] = df_all_yaca_dense[:, _col_idx]

In [None]:
import xgboost as xgb
import numpy as np
# Full feature list: the base features followed by the yaca_* columns.
feat_cols_w_yaca = feat_cols.tolist()
feat_cols_w_yaca.extend(yaca_cols)
# Unlabelled DMatrix for scoring; NaN marks missing values.
dall_yaca = xgb.DMatrix(data=df_all[feat_cols_w_yaca], missing=np.nan)

In [None]:
import cPickle
# Load the pickled model through a context manager so the file handle is
# closed right away.
# NOTE: unpickling executes code stored in the file; only load trusted files.
with open('data/ccall_scoring_xgb.model', 'r') as _model_file:
    bst1 = cPickle.load(_model_file)
type(bst1)


In [None]:
# Score every row with the loaded model and keep the result in column 'pred'.
_pred = bst1.predict(dall_yaca)
df_all.loc[:, 'pred'] = _pred

In [None]:
# Import the submodule explicitly: a plain `import sklearn` does not guarantee
# that sklearn.metrics is loaded.
import sklearn.metrics

# Evaluation slices: all rows, calls before 2016-01-01, calls from 2016-01-01 on.
# The label 'all ' carries a trailing space so the printed lines stay aligned.
_periods = [
    ('all ', lambda df: df),
    ('2015', lambda df: df[df['call_ymd'] < '2016-01-01']),
    ('2016', lambda df: df[df['call_ymd'] >= '2016-01-01']),
]
# (printed name, metric function). The negated prediction is used as the score
# for default_flg - presumably because `pred` is an approval score; confirm.
_metrics = [
    ('AUC ROC', sklearn.metrics.roc_auc_score),
    ('PR ROC', sklearn.metrics.average_precision_score),
]

print('Prediction of default. All products')
for _metric_name, _metric in _metrics:
    for _period, _select in _periods:
        _rows = _select(df_all)
        print('{} on {} data: {:.5f}'.format(
            _metric_name, _period,
            _metric(y_true = _rows['default_flg'], y_score = - _rows['pred'])
        ))

for _period, _select in _periods:
    print('Avg default rate on {} data: {:.5f}'.format(
        _period, _select(df_all)['default_flg'].mean()
    ))

In [None]:
# Same three numbers per product code (CCR, then LON), over all periods.
# The negated prediction is used as the score for default_flg.
print('Prediction of default. Per product. All periods.')
for _product in ('CCR', 'LON'):
    _rows = df_all[df_all['financial_product_type_cd'] == _product]
    _labels = _rows['default_flg']
    _scores = - _rows['pred']
    print('AUC ROC on {} all years: {:.5f}'.format(
        _product, sklearn.metrics.roc_auc_score(y_true = _labels, y_score = _scores)
    ))
    print('PR ROC on {} all years: {:.5f}'.format(
        _product, sklearn.metrics.average_precision_score(y_true = _labels, y_score = _scores)
    ))
    print('Avg default rate on {} all years: {:.5f}'.format(_product, _labels.mean()))

In [None]:
# Show the distinct label values present in the data.
df_all['default_flg'].unique()

In [None]:
# Convert the pandas frame to a Spark DataFrame and register it as the
# temporary table 'df_all' so it can be queried with hc.sql.
_spark_df_all = hc.createDataFrame(df_all)
hc.registerDataFrameAsTable(_spark_df_all, 'df_all')
print(1)

In [None]:
# Bare string expression: it has no effect beyond being echoed by the notebook.
# NOTE(review): the write cell targets big_data_science.default_az_cred_scor_result,
# not this user_kposminin name - confirm which one is current.
'user_kposminin.default_az_cred_scor_result'

In [None]:
#hc.registerDataFrameAsTable(hc.createDataFrame(df_all), 'df_all')
# Recreate the result table from the registered temporary table df_all.
_result_table = 'big_data_science.default_az_cred_scor_result'
hc.sql('drop table if exists ' + _result_table)
hc.sql('create table ' + _result_table + ' as select * from df_all')
print('done')

In [None]:
#df_all.to_csv('data/default_az_cred_scor_result.csv',index = False,columns = False,header=False)
# Show the distinct label values present in the data.
df_all['default_flg'].unique()

#### Почему-то спарк неограниченно долго работает, но по факту таблица записывается.

In [None]:
# Show the (rows, columns) size of the frame.
df_all.shape