In [14]:
# -*- coding: utf-8 -*-
from datetime import date, timedelta
# HiveServer2 endpoint and the account name the session is opened with.
HIVE_HOST = 'ds-hadoop-cs01p'
HIVE_PORT = 10000
HIVE_USER = 'kposminin'

# Session-level Hive / MapReduce overrides; the dict is handed to the Hive
# connection as its `configuration` argument. All values are strings.
CONF = {
    'hive.vectorized.execution.enabled': 'true',
    # MapReduce task memory and input split sizing.
    'mapreduce.map.memory.mb': '4096',
    'mapreduce.map.child.java.opts': '-Xmx4g',
    'mapreduce.task.io.sort.mb': '1024',
    'mapreduce.reduce.child.java.opts': '-Xmx4g',
    'mapreduce.reduce.memory.mb': '7000',
    'mapreduce.reduce.shuffle.input.buffer.percent': '0.5',
    'mapreduce.input.fileinputformat.split.minsize': '536870912',
    'mapreduce.input.fileinputformat.split.maxsize': '1073741824',
    # Hive optimizer and small-file merging.
    'hive.optimize.ppd': 'true',
    'hive.merge.smallfiles.avgsize': '536870912',
    'hive.merge.mapredfiles': 'true',
    'hive.merge.mapfiles': 'true',
    'hive.hadoop.supports.splittable.combineinputformat': 'true',
    # Hive execution, output compression and dynamic partitioning limits.
    'hive.exec.reducers.bytes.per.reducer': '536870912',
    'hive.exec.parallel': 'true',
    'hive.exec.max.created.files': '10000000',
    'hive.exec.compress.output': 'true',
    'hive.exec.dynamic.partition.mode': 'nonstrict',
    'hive.exec.max.dynamic.partitions': '1000000',
    'hive.exec.max.dynamic.partitions.pernode': '100000',
    'io.seqfile.compression.type': 'BLOCK',
}

In [15]:
from pyhive import hive
# Open one Hive session with the settings above; `cursor` is the shared
# handle used by the query helpers further down in this notebook.
conn = hive.Connection(
    host=HIVE_HOST,
    port=HIVE_PORT,
    username=HIVE_USER,
    configuration=CONF,
)
cursor = conn.cursor()

In [50]:
   
def prepare_tables(ymd):
    """Build the HQL script that (re)creates the two per-day url-part tables.

    For the day `ymd` (a 'YYYY-MM-DD' string) the script creates

    * user_kposminin.la_<yyyymmdd>_0 -- one row per access-log hit of that day
      with the url (everything from the first '?' cut off) split into
      `domain`, `lev0`, `lev1` and `lev2`;
    * user_kposminin.la_<yyyymmdd>_1 -- distinct (id, up) pairs, where `up` is
      the bare domain or `domain[k]levk` for every non-empty path level k.

    Returns the statements joined with ';' (the last one is terminated as
    well), so the result can be executed via `script.split(';')[:-1]`.
    """
    ind = ymd.replace('-', '')
    statements = [
        'drop table if exists user_kposminin.la_#ind_0',
        # Raw string: the regexes contain `\?`, which must reach Hive unchanged
        # and is not a valid Python escape sequence.
        r"""
    create table user_kposminin.la_#ind_0 as
    select
     id
     ,regexp_extract(regexp_extract(url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
     ,regexp_extract(regexp_extract(url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
     ,regexp_extract(regexp_extract(url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
     ,regexp_extract(regexp_extract(url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
    from
     prod_raw_liveinternet.access_log
    where
     ymd = "#ymd"
     and url not in ('vk.com', 'm.vk.com', 'ok.ru', 'm.ok.ru','mail.ru', 'new.vk.com','new.vk.com/groups','vk.com/groups','vk.com/im'
                    ,'vk.com/friends', 'm.vk.com/mail')
        """,
        'drop table if exists user_kposminin.la_#ind_1',
        # Every branch reads the la_<ind>_0 table created just above. The
        # domain branch used to point at prod_lookalike.la_<ind>_0, a different
        # table than the one this script builds.
        """
    create table user_kposminin.la_#ind_1 as
    select distinct
     *
     from
     ( 
     select
       id
      ,domain as up
     from
      user_kposminin.la_#ind_0
     where
      length(domain) > 0
     union all
     select
      id
      ,concat(domain,'[0]',lev0) as up
     from
      user_kposminin.la_#ind_0
     where
      length(lev0) > 0
    union all
     select
      id
      ,concat(domain,'[1]',lev1) as up
     from
      user_kposminin.la_#ind_0
     where
      length(lev1) > 0
     union all
     select
      id
      ,concat(domain,'[2]',lev2) as up
     from
      user_kposminin.la_#ind_0
     where
      length(lev2) > 0
       ) t
        """,
    ]
    script = ';\n'.join(statements + [''])
    # Explicit '#'-prefixed tokens (as in the other query builders of this
    # notebook) instead of replacing the bare substring 'ind', which would
    # silently corrupt any identifier that happens to contain it.
    return script.replace('#ind', ind).replace('#ymd', ymd)

In [51]:
# Init segments

# Segments based on the source table with the full url.
# Layout: [segment_name, [[source_label, [SQL LIKE patterns]], ...]].
# pos_id_query_seg uses segment_name as the `seg` partition value and ORs
# together every pattern of every source as `url like '<pattern>'`;
# source_label is not read there.
segments1 = [
['banks',[
    ['rsb_cc',['anketa.rsb.ru%']],
    ['raiff_cc',['raiffeisen.ru/retail/cards/credit%']],
    ['creeur_cc',['crediteurope.ru/privately/kreditnye_karty_card_credit%','crediteurope.ru/privately/credit_cards%']],
#    ['sber_cc',['%sberbank.ru/ru/person/bank_cards/credit%']],
    ['trust_c',['trust.ru/retail/cash%']],
]],
['aggregators',[
    ['bankcreditcard',['bankcreditcard.ru%']],
    ['bankiros',['bankiros.ru%']],
    ['sravni_c',['sravni.ru/kredity%', 'sravni.ru/zaimy%', 'sravni.ru/kreditnyj-skoring%']],
    ['allcredits',['allcredits.ru%']],
    ['moneymatika_c',['moneymatika.ru/bank/kreditnie-karty%', 'moneymatika.ru/bank/potrebitelskie-kredity%' , 'moneymatika.ru/bank/mikrokredity%']],
    ['kompaskreditov',['kompaskreditov.ru%']],
    ['infapronet.ru',['infapronet.ru%']],
]],
['mfo', [
    ['cashalot',['cashalot.su%']],
    ['filkos',['filkos.com%']],
    ['zaymiprosto',['zaymiprosto.ru%']],
    ['vkarmane-online',['vkarmane-online.ru%']],
    ['burocreditov',['burocreditov.ru%']],
    ['mos-zaim',['mos-zaim.ru%']],
]],
]

# Segments based on the (id, up) table user_kposminin.la_<ind>_1.
# Same layout as segments1: [segment_name, [[label, [up prefixes]], ...]].
# pos_id_query_seg appends '%' to every entry and matches it as
# `up like '<entry>%'`, i.e. entries are prefixes of the `up` value
# (`domain` or `domain[k]level`).
# NOTE(review): the segment is named 'tcs_cp_app_top100_la' here, while the
# `segments` list and the threshold script use 'tcs_cp_app_la' -- confirm
# which name is intended.
segments2 = [
['tcs_cp_app_top100_la',[
    ['la1',['rubanks.in[1]26.html']],
    ['la2',['bank-advisor.ru[1]tinkoffbank']],
    ['la3',['zaimitut.ru[0]banks']],
    ['la4',['zanimaem-online.ru[1]credit-cards']],
    ['la5',['creditstories.ru[0]app']],
    ['la6',['creditstories.ru[1]credit']],
    ['la7',['allbanci.ru']],
    ['la8',['bankcreditcard.ru[0]na-dom']],
    ['la9',['bankcreditcard.ru[0]tinkoff-platinum']],
    ['la10',['rubanks.in[1]27.html']],
    ['la11',['cfy.su[0]kreditnye-karty']],
    ['la12',['bankcreditcard.ru[0]18']],
    ['la13',['anketa-v-bank.ru[0]kreditnaia-karta.php']],
    ['la14',['bankcreditcard.ru[0]online-zayavka-odobrenye']],
    ['la15',['zanimaem-online.ru[0]products']],
    ['la16',['credityt.ru[0]kreditnaja-karta-onlaine-anketa-i-oformlenie.html']],
    ['la17',['onlinezayavkanacredit.ru[0]onlajn-zayavka-na-kreditnuyu-kartu-vostochnyj-ekspress-bank']],
    ['la18',['bankcreditcard.ru[0]passport']],
    ['la19',['всемикрозаймы.рф[1]кредитные-карты-по-паспорту-с-моментальным-решением']],
    ['la20',['burocreditov.ru[0]kreditnye-karty']],
    ['la21',['direct.binbank.ru[0]index_anketa.php']],
    ['la22',['kredit-nsk.com[0]kreditnyye-karty']],
    ['la23',['bankcreditcard.ru[0]kreditnaya-karta-bez-propiski']],
    ['la24',['rubanks.in[1]185.html']],
    ['la25',['bankcreditcard.ru[0]express']],
    ['la26',['infapronet.ru[0]rusfinans-bank']],
    ['la27',['rubanks.in[1]13.html']],
    ['la28',['bankcreditcard.ru[0]renessans']],
    ['la29',['kredist.ru[0]vydacha-kreditnyh-kart-s-plohoj-ki']],
    ['la30',['bankcreditcard.ru[0]bez-spravok']],
    ['la31',['krasnojarsk.vbr.ru[1]kredity']],
    ['la32',['masterdohodov.ru[1]kak-vzyat-kredit-esli-oficialno-ne-rabotaesh.html']],
    ['la33',['kreditonomika.ru[0]kreditnye_karty']],
    ['la34',['sravni.ru[0]karty-bez-spravok-o-dokhodakh']],
    ['la35',['rubanks.in[1]krasnodar']],
    ['la36',['anketa-v-bank.ru']],
    ['la37',['bankcreditcard.ru[0]evroset-kukuruza-s-kreditnym-limitom']],
    ['la38',['creditstories.ru[1]kredit18let']],
    ['la39',['masterdohodov.ru[0]kredity']],
    ['la40',['bankcreditcard.ru[0]karta-bezrabotnym']],
    ['la41',['онлайн-кредитка.рф[0]kredit-na-kartu-onlajn-srochno-ne-vyxodya-iz-doma-bez-otkaza']],
    ['la42',['zaimexpert.ru[0]mikrozajmy']],
    ['la43',['webcredit.biz[0]gorod']],
    ['la44',['k-f-b.ru[2]zayavka_na_kreditnuyu_kartu_onlayn']],
    ['la45',['k-f-b.ru[1]creditcards']],
    ['la46',['rubanks.in[1]chelyabinsk']],
    ['la47',['kreditka-onlain.ru']],
    ['la48',['damoney.ru[2]banki-dayut-kredit-18-let.php']],
    ['la49',['bankcreditcard.ru[0]tinkoff']],
    ['la50',['rubanks.in[1]76.html']],
    ['la51',['bankcreditcard.ru[0]s-plohoy-kreditnoj-istoriej']],
    ['la52',['bankcreditcard.ru[0]post']],
    ['la53',['creditstories.ru[0]money']],
    ['la54',['rubanks.in[1]16.html']],
    ['la55',['zaem.info[1]kakie-banki-dayut-kredit-s-20-let.html']],
    ['la56',['sravni.ru[2]pochemu-bank-mojet-otkazat-v-kredite']],
    ['la57',['krednall.ru[0]kredit-nalichnymi']],
    ['la58',['rubanks.in[1]57.html']],
    ['la59',['rubanks.in[1]53.html']],
    ['la60',['m.centrzaimov.ru[0]thankyou']],
    ['la61',['moscowkredit.ru[0]kredit-s-ploxoj-kreditnoj-istoriej']],
    ['la62',['bank-rf.ru[1]articles']],
    ['la63',['rubanks.in[1]68.html']],
    ['la64',['plohaya-kreditnaya-istoriya.ru[0]banki']],
    ['la65',['cfy.su']],
    ['la66',['kreditnyi-calculator.ru[1]tinkoff']],
    ['la67',['creditstories.ru']],
    ['la68',['rubanks.in[1]168.html']],
    ['la69',['privatbankrf.ru[0]kartyi']],
    ['la70',['kredit-ex.ru']],
    ['la71',['kredit-onlinezayavka.ru[2]0-223']],
    ['la72',['kredit-onlinezayavka.ru[1]onlajn_kredit_bez_spravok_i_poruchitelej_100_odobrenie_s_18_let']],
    ['la73',['direct.binbank.ru']],
    ['la74',['gocredit.ru[2]s-plokhoy-kreditnoy-istoriey']],
    ['la75',['creditzzz.ru[1]kredit-bez-oficialnogo-trudoustrojstva.html']],
    ['la76',['ok.ru[0]tinkoffbank']],
    ['la77',['onlinekredit-zayavka.ru[0]go']],
    ['la78',['cryptopilot.ru[0]kreditnye-karty-po-pasportu-s-momentalnym-resheniem.html']],
    ['la79',['creditbery.ru[2]kredit-studentu.html']],
    ['la80',['anketa.rsb.ru[2]6344']],
    ['la81',['kreditstock.ru[2]potrebitelskij-kredit-s-20-let.html']],
    ['la82',['anketa.rsb.ru[2]6342']],
    ['la83',['infapronet.ru[0]svyaznoy']],
    ['la84',['vk.com[0]idfghftrutyuty']],
    ['la85',['filkos.com[0]credit-card.html']],
    ['la86',['rubanks.in[1]117.html']],
    ['la87',['onlinekredit-zayavka.ru[0]zayavka_potrbitelskij_kredit']],
    ['la88',['rubanks.in[0]online-kredit']],
    ['la89',['creditsrf.com[2]kredit-bez-spravok-o-dohodah']],
    ['la90',['infapronet.ru[1]voprosy-po-kreditnim-kartam']],
    ['la91',['sravni.ru[1]na-20000-rublej']],
    ['la92',['dengi-vzaimy.ru[0]loan_trust_credit.html']],
    ['la93',['infapronet.ru[1]164-poluchaem-kredit-nalichnymi-v-sberbanke.html']],
    ['la94',['sravni.ru[1]sevastopol']],
    ['la95',['sredstva.ru[1]6513.html']],
    ['la96',['webcredit.biz']],
    ['la97',['rubanks.in[1]34.html']],
    ['la98',['zaym-onlayn.ru']],
    ['la99',['creditradar.ru[0]s20let']],
    ['la100',['bankcreditcard.ru[0]russkiy-standart-otzyvy']],
]],
]

#+tinkoff_applications + ta+mailru


In [52]:

def _segment_like_condition(column, seg, suffix):
    '''OR together `column like '<pattern><suffix>'` for every pattern of a segment entry.'''
    # seg is [segment_name, [[label, [patterns]], ...]]; flatten all pattern lists.
    patterns = [pattern for source in seg[1] for pattern in source[1]]
    if not patterns:
        # An empty list would otherwise render as the invalid HQL `and ()`.
        raise ValueError('segment %r has no patterns' % (seg[0],))
    return '\n        or '.join(
        column + " like '" + pattern + suffix + "'" for pattern in patterns)


def pos_id_query_seg(ymd, url_segments=None, up_segments=None):
    '''Build the HQL script that fills the per-day "positive ids" table.

    The script (re)creates user_kposminin.la_id_posit_seg_<yyyymmdd>
    (partitioned by ymd, seg) and appends one `insert` per segment:

    * url_segments -- matched as `url like '<pattern>'` against
      prod_raw_liveinternet.access_log for the day `ymd`;
    * up_segments  -- matched as `up like '<pattern>%'` (prefix match) against
      user_kposminin.la_<yyyymmdd>_1.

    Parameters:
        ymd: day as a 'YYYY-MM-DD' string.
        url_segments: list shaped like `segments1`; defaults to `segments1`.
        up_segments: list shaped like `segments2`; defaults to `segments2`.

    Returns the script as one string of ';'-terminated statements.
    Raises ValueError if a segment has no patterns.
    '''
    # Defaults are resolved at call time so that re-running the segments cell
    # is picked up without redefining this function.
    if url_segments is None:
        url_segments = segments1
    if up_segments is None:
        up_segments = segments2

    ind = ymd.replace('-','')
    q = '''
    drop table if exists user_kposminin.la_id_posit_seg_#ind;
    create table user_kposminin.la_id_posit_seg_#ind
    (id String)
    partitioned by(ymd string, seg string)
    stored as sequencefile;
    '''.replace('#ind',ind)

    for seg in url_segments:
        q += ('''
    insert into table user_kposminin.la_id_posit_seg_#ind partition (ymd, seg)
    select distinct
         id, ymd, '#seg' as seg
    from prod_raw_liveinternet.access_log
    where ymd = "#ymd"
        and (''' + \
        _segment_like_condition('url', seg, '') + \
        ''');
        ''').replace('#ind', ind).replace('#ymd', ymd).replace('#seg',seg[0])

    for seg in up_segments:
        q += ('''
    insert into table user_kposminin.la_id_posit_seg_#ind partition (ymd, seg)
    select distinct
         id, '#ymd' as ymd, '#seg' as seg
    from user_kposminin.la_#ind_1
    where  (''' + \
        _segment_like_condition('up', seg, '%') + \
        ''');
        ''').replace('#ind', ind).replace('#ymd', ymd).replace('#seg',seg[0])
    return q
        

In [53]:
#tinkoff applications
def cc_wuid_li_query(start_day, end_day):
    '''Build the HQL script linking credit-card applications to liveinternet ids.

    start_day and end_day are 'YYYY-MM-DD' strings; both bounds are compared
    inclusively against dt_created. Three tables suffixed with
    <startday>_<endday> (dashes removed) are dropped and recreated:

    * cc_wuid_<ind>           -- distinct applications from
      prod_dds.portal_application for product cc_platinum;
    * cc_wuid_li_<ind>        -- the same rows with dmp_id and liveinternet id
      attached through prod_emart.datamind_matching_table;
    * cc_wuid_li_unique_<ind> -- per (ymd, li_id) aggregates, restricted to
      wuids that map to exactly one liveinternet id.

    Returns the script as a single string of ';'-terminated statements.
    '''
    period_tag = '_'.join((start_day, end_day)).replace('-','')
    hql = '''
        drop table if exists user_kposminin.cc_wuid_#ind;
        create table user_kposminin.cc_wuid_#ind as
        select distinct 
            dt_created,
            ymd, 
            wuid, 
            (case is_processed when 3 then 1 else 0 end) as completed_flag,
            0 as revisited
        from prod_dds.portal_application 
        where dt_created >= '#start_day' and dt_created <= '#end_day'
        and wuid is not null 
        and product_name = 'cc_platinum' 
        and lower(lp) not like '%agent%'
        -- and is_processed in (3,21)
        and linked_id is Null
        ;

        drop table if exists user_kposminin.cc_wuid_li_#ind;
        create table user_kposminin.cc_wuid_li_#ind as
        select
             a.ymd
            ,a.dt_created
            ,a.wuid
            ,a.completed_flag
            ,a.revisited
            ,b.dmp_id
            ,c.source_id as li_id
        from
         user_kposminin.cc_wuid_#ind a
         inner join (select distinct source_id, dmp_id from prod_emart.datamind_matching_table where source_type = 'tcs') b on a.wuid = b.source_id
         inner join (select distinct source_id, dmp_id from prod_emart.datamind_matching_table where source_type = 'liveinternet') c on b.dmp_id = c.dmp_id
        ;


        drop table if exists user_kposminin.cc_wuid_li_unique_#ind;
        create table user_kposminin.cc_wuid_li_unique_#ind as
        select 
          a.ymd
         ,a.li_id
         ,max(a.dt_created) as dt_created
         ,min(a.revisited) as revisited
         ,max(a.completed_flag) as completed_flag 
        from 
         user_kposminin.cc_wuid_li_#ind a
         inner join 
        (
        select
         wuid
         ,count(distinct li_id) li_cnt
        from
         user_kposminin.cc_wuid_li_#ind 
        group by
         wuid
        having
         li_cnt = 1
        ) t on a.wuid = t.wuid
        where not li_id is NULL
        group by a.ymd, a.li_id
        ;
    '''
    # Substitution order matters only in that it mirrors the template tokens;
    # none of the tokens is a prefix of another.
    for token, value in (('#start_day', start_day),
                         ('#end_day', end_day),
                         ('#ind', period_tag)):
        hql = hql.replace(token, value)
    return hql
           
def tcs_cc_plat_app_started_pos_id_query(ymd_list):
    '''Build the script adding the 'tcs_cc_appl_started' segment for each day.

    The script starts with the cc_wuid_li_* build for the period
    min(ymd_list)..max(ymd_list) and then, per day of ymd_list, inserts that
    day's li_id rows from cc_wuid_li_unique_<period> into
    user_kposminin.la_id_posit_seg_<yyyymmdd>.
    '''
    first_day, last_day = min(ymd_list), max(ymd_list)
    period_tag = '_'.join((first_day, last_day)).replace('-','')
    insert_tpl = '''    
        insert into table user_kposminin.la_id_posit_seg_#ind partition (ymd, seg)
        select 
            li_id as id, ymd, 'tcs_cc_appl_started' as seg
        from user_kposminin.cc_wuid_li_unique_#ind_period where ymd = '#day';
        '''
    parts = [cc_wuid_li_query(first_day, last_day)]
    for day in ymd_list:
        day_tag = day.replace('-','')
        # '#ind_period' has to be substituted before its prefix '#ind'.
        parts.append(
            insert_tpl.replace('#day', day)
                      .replace('#ind_period', period_tag)
                      .replace('#ind', day_tag))
    return ''.join(parts)

def tcs_cc_plat_app_started_w_mail_pos_id_query(ymd_list):
    '''Build the script adding the 'tcs_cc_appl_started_w_mail' segment per day.

    For each day of ymd_list the ids from cc_wuid_li_unique_<period> are kept
    only if they also have up = 'e.mail.ru[1]inbox' in
    user_kposminin.la_<yyyymmdd>_1, and are inserted into
    user_kposminin.la_id_posit_seg_<yyyymmdd>.

    Unlike tcs_cc_plat_app_started_pos_id_query, the cc_wuid_li_* build
    statements are not included here, so those tables must already exist.
    '''
    first_day, last_day = min(ymd_list), max(ymd_list)
    period_tag = '_'.join((first_day, last_day)).replace('-','')
    insert_tpl = '''    
        insert into table user_kposminin.la_id_posit_seg_#ind partition (ymd, seg)
        select distinct a.id, a.ymd, 'tcs_cc_appl_started_w_mail' as seg
        from
            (select li_id as id, ymd from user_kposminin.cc_wuid_li_unique_#ind_period where ymd = '#day') a
            inner join user_kposminin.la_#ind_1 b on a.id = b.id and b.up = 'e.mail.ru[1]inbox';
        '''
    parts = []
    for day in ymd_list:
        day_tag = day.replace('-','')
        # '#ind_period' has to be substituted before its prefix '#ind'.
        parts.append(
            insert_tpl.replace('#day', day)
                      .replace('#ind_period', period_tag)
                      .replace('#ind', day_tag))
    return ''.join(parts)

In [54]:
#print(tcs_cc_plat_app_started_pos_id_query(ymd_list))
#print(tcs_cc_plat_app_started_w_mail_pos_id_query(ymd_list))

In [65]:
def up_scores_for_day_query(ymd, segments = None):
    '''Build the script that scores every url part (`up`) for one day.

    The script recreates user_kposminin.up_scores_<yyyymmdd> (partitioned by
    seg) and inserts, per segment, the rows with
    score = log((positive + 0.1) / (total - positive + 0.1)), where `total`
    and `positive` are distinct id counts from la_<yyyymmdd>_1 and its left
    join to la_id_posit_seg_<yyyymmdd>. Only rows with total > 30000 or
    positive > 1 are kept.

    When `segments` is empty or None the segment names are read from
    la_id_posit_seg_<yyyymmdd> through the module-level `cursor`.
    '''
    ind = ymd.replace('-','')

    pieces = ['''
    drop table if exists user_kposminin.up_scores_#ind;
    create table user_kposminin.up_scores_#ind
    (up String, score double, total int, positive int,  upd_ymd String)
    partitioned by (seg String)
    stored as sequencefile;
    ''']
    if not segments:
        cursor.execute('select distinct seg from user_kposminin.la_id_posit_seg_#ind'.replace('#ind', ind))
        segments = [row[0] for row in cursor.fetchall()]

    insert_tpl = '''
        insert into user_kposminin.up_scores_#ind partition (seg)
        select
            up,
            log((positive + 0.1)/(total - positive + 0.1)) as score,
            total,
            positive,
            '#ymd' as upd_ymd,
            '#seg' as seg
        from
           (select a.up, count(distinct a.id) as total, count(distinct b.id) as positive
           from user_kposminin.la_#ind_1 a
           left join user_kposminin.la_id_posit_seg_#ind b on a.id = b.id and b.ymd = '#ymd' and seg = '#seg'
           group by a.up) c
        where total > 30000 or positive > 1
        ;
        '''
    # The segment name goes in first; day tokens are filled in once at the end.
    pieces.extend(insert_tpl.replace('#seg', seg) for seg in segments)
    return ''.join(pieces).replace('#ymd', ymd).replace('#ind', ind)
    
def up_scores_query(ymd_list, segments = None):
    '''Build the per-day url-part scoring scripts for all days of ymd_list.

    Parameters:
        ymd_list: days as 'YYYY-MM-DD' strings.
        segments: segment names to score. When empty or None, the names are
            read through the module-level `cursor` as the distinct `seg`
            values of the la_id_posit_seg_<yyyymmdd> tables of all given days.

    Returns the concatenated output of up_scores_for_day_query for every day,
    or an empty string when ymd_list is empty.
    '''
    ymd_list = list(ymd_list)
    if not ymd_list:
        # Nothing to score; also avoids emitting `select ... from () a`.
        return ''
    if not segments:
        seg_sources = '\n    union all '.join(
            'select distinct seg from user_kposminin.la_id_posit_seg_#ind'.replace('#ind', ymd.replace('-',''))
            for ymd in ymd_list)
        cursor.execute('select distinct seg from (' + seg_sources + ') a')
        segments = [row[0] for row in cursor.fetchall()]
    return ''.join(up_scores_for_day_query(ymd, segments) for ymd in ymd_list)

In [56]:
# Segment names that can be passed as the `segments` argument of the
# up-scores query builders, so they do not have to query Hive for them.
# NOTE(review): 'tcs_cp_app_la' differs from the segment name used in
# `segments2` ('tcs_cp_app_top100_la') -- confirm which one is intended.
segments = [
    u'aggregators',
    u'banks',
    u'mfo',
    u'tcs_cc_appl_started',
    u'tcs_cc_appl_started_w_mail',
    u'tcs_cp_app_la',
]

#print(up_scores_query(ymd_list, segments))

In [57]:

# HQL script (only defined in this cell, not executed) that rebuilds
# user_kposminin.urlfr_top_scores from prod_features_liveinternet.urlfr_scores
# for the fixed window 2016-08-01 .. 2016-08-28:
#   * only rows with score > -5 are considered;
#   * a urlfr is kept when it has more than 18 such rows in the window (dcnt);
#   * score is recomputed from the summed positive/total counts;
#   * grp is 'group1' for score > -2.5, 'group2' for -3.5 < score <= -2.5,
#     and 'group3' otherwise.
create_top_up_query = '''
drop table if exists user_kposminin.urlfr_top_scores;
create table user_kposminin.urlfr_top_scores as
select *, concat('group',3 - if(score > -2.5,1,0) - if(score > -3.5,1,0)) as grp from 
(
    select 
        urlfr,
        log((sum(positive) + 0.1)/(sum(total) - sum(positive) + 0.1)) as score,
        cast(avg(total) as int) as total,
        cast(avg(positive) as int) as positive,
        max(ymd) as upd_ymd 
    from
        (select ymd, urlfr, total, positive, count(ymd) over (partition by urlfr) as dcnt from prod_features_liveinternet.urlfr_scores
         where ymd between '2016-08-01' and '2016-08-28' and score > -5) a 
    where dcnt > 18 
    group by urlfr order by score desc
) b;
'''
    
# HQL script (a plain string, not a function) with the dates hard-coded to
# 2016-09-19 .. 2016-09-23: it recreates
# user_kposminin.up_scores_cumul_20160919_23 and fills it with per-(up, seg)
# scores recomputed from the summed counts of the five daily up_scores_*
# tables, keeping rows with score > -6 that occur on at least 2 of the days.
# average_up_scores_query builds the same kind of script for any day list.
average_up_scores = '''
drop table if exists user_kposminin.up_scores_cumul_20160919_23;

CREATE TABLE `user_kposminin.up_scores_cumul_20160919_23`(
	  `up` string, 
	  `score` double, 
	  `total` int, 
	  `positive` int, 
	  `upd_ymd` string)
	PARTITIONED BY ( 
	  `seg` string)
	ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.mapred.SequenceFileInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
	LOCATION
	  'hdfs://nameservice1/user/hive/warehouse/user_kposminin.db/up_scores_cumul_20160919_23';


insert into user_kposminin.up_scores_cumul_20160919_23 partition (seg)
    select 
        up,
        log((sum(positive) + 0.1)/(sum(total) - sum(positive) + 0.1)) as score,
        cast(sum(total) as int) as total,
        cast(sum(positive) as int) as positive,
        max(upd_ymd) as upd_ymd,
        seg
    from
        (select  up, total, positive, seg, upd_ymd, count(upd_ymd) over (partition by up,seg) as dcnt from 
            (select * from user_kposminin.up_scores_20160919 union all
            select * from user_kposminin.up_scores_20160920 union all
            select * from user_kposminin.up_scores_20160921 union all
            select * from user_kposminin.up_scores_20160922 union all
            select * from user_kposminin.up_scores_20160923) u          
         where score > -6) a 
    where dcnt >= 2 
    group by up,seg order by score desc, seg
;


'''
    
# HQL script (only defined here, not executed) that drops and recreates
# prod_lookalike.threshold (segment_nm, threshold) and seeds it with a
# threshold of -5 for each of the six listed segments.
# calc_id_corpus joins this table on segment_nm to filter scores.
create_update_threshold = '''
drop table if exists prod_lookalike.threshold;
create table prod_lookalike.threshold
(segment_nm String, threshold double)
ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.mapred.SequenceFileInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';

insert into prod_lookalike.threshold 
select 'aggregators' as segment_nm, -5 as threshold union all
select 'banks' as segment_nm, -5 as threshold union all
select 'mfo' as segment_nm, -5 as threshold union all
select 'tcs_cc_appl_started' as segment_nm, -5 as threshold union all
select 'tcs_cc_appl_started_w_mail' as segment_nm, -5 as threshold union all
select 'tcs_cp_app_la' as segment_nm, -5 as threshold;

'''

In [85]:
def average_up_scores_query(ymdlist):
    '''Build the script that merges daily url-part scores into one table.

    The script recreates user_kposminin.up_scores_cumul_<first>_<last>
    (first/last = min/max of ymdlist, dashes removed) and fills it with
    per-(up, seg) scores recomputed from the summed `positive` / `total`
    counts of the daily user_kposminin.up_scores_<yyyymmdd> tables. Only
    daily rows with score > -6 are used, and an (up, seg) pair is kept when
    it appears on at least len(ymdlist) // 2 days.

    Parameters:
        ymdlist: non-empty sequence of 'YYYY-MM-DD' strings.

    Returns the script as one string of ';'-terminated statements.
    '''
    ind = (min(ymdlist) + '_' + max(ymdlist)).replace('-','')
    daily_union = (' union all\n'+' '*13).join(
        'select * from user_kposminin.up_scores_{0}'.format(ymd.replace('-','')) for ymd in ymdlist)
    # Floor division: plain `/` yields a float on Python 3 and would render
    # the threshold as e.g. `dcnt >= 1.5` instead of `dcnt >= 1`.
    min_days = len(ymdlist) // 2
    q = '''
    drop table if exists user_kposminin.up_scores_cumul_#ind;
    
    CREATE TABLE `user_kposminin.up_scores_cumul_#ind`(
      `up` string, 
      `score` double, 
      `total` int, 
      `positive` int, 
      `upd_ymd` string)
    PARTITIONED BY ( 
      `seg` string)
    ROW FORMAT SERDE 
      'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
    STORED AS INPUTFORMAT 
      'org.apache.hadoop.mapred.SequenceFileInputFormat' 
    OUTPUTFORMAT 
      'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
    LOCATION
      'hdfs://nameservice1/user/hive/warehouse/user_kposminin.db/up_scores_cumul_#ind';

    insert into user_kposminin.up_scores_cumul_#ind partition (seg)
    select 
        up,
        log((sum(positive) + 0.1)/(sum(total) - sum(positive) + 0.1)) as score,
        cast(sum(total) as int) as total,
        cast(sum(positive) as int) as positive,
        max(upd_ymd) as upd_ymd,
        seg
    from
        (select  up, total, positive, seg, upd_ymd, count(upd_ymd) over (partition by up,seg) as dcnt from 
            (''' + daily_union + \
    ''') u          
         where score > -6) a 
    where dcnt >= ''' + str(min_days) + ''' 
    group by up,seg order by score desc, seg
    ;
    '''
    return q.replace('#ind',ind)

def update_prod_lookalike_coef_table_query(tablename):
    '''Build the script that swaps new coefficients into prod_lookalike.lookalike_coeff.

    The current lookalike_coeff table is renamed to lookalike_coeff_backup
    (dropping any previous backup), an empty table with the same layout is
    created in its place, and (up, score, seg) rows of `tablename` are
    inserted with `seg` as the segment_nm partition.
    '''
    template = '''
    drop table if exists prod_lookalike.lookalike_coeff_backup;
    alter table prod_lookalike.lookalike_coeff rename to prod_lookalike.lookalike_coeff_backup;
    create table prod_lookalike.lookalike_coeff like prod_lookalike.lookalike_coeff_backup;
    insert into prod_lookalike.lookalike_coeff partition (segment_nm)
    select up,score,seg as segment_nm from `#tablename`;
    '''
    return template.replace('#tablename', tablename)

In [87]:
#print(average_up_scores_query(['2016-08-17','2016-08-16']))

In [76]:
def calc_id_corpus(ymd):
    '''Build the HQL statement that scores user ids for the day `ymd`.

    The statement creates prod_lookalike.la_<yyyymmdd>_2 with one row per
    (h_uid_rk, segment_nm): the maximum lookalike coefficient over all url
    parts the id has in prod_lookalike.la_<yyyymmdd>_1, kept only when that
    maximum exceeds the segment's threshold from prod_lookalike.threshold.
    Segments without a threshold row are dropped by that comparison.

    Parameters:
        ymd: day as a 'YYYY-MM-DD' string.

    Returns the statement terminated with ';', so that it can be run with
    the same `split(';')[:-1]` loop as the other scripts of this notebook.
    '''
    ind = ymd.replace('-','')
    # The score filter is on an aggregate, so it has to live in HAVING (after
    # GROUP BY); an aggregate inside WHERE is rejected by Hive. The threshold
    # is constant within a segment, hence max(t.threshold).
    q = """
    create table prod_lookalike.la_#ind_2 as
    select
     c.h_uid_rk
     ,max(b.score) score
     ,b.segment_nm
     ,max("#ymd") as ymd
    from
     prod_lookalike.la_#ind_1 a
     inner join prod_lookalike.lookalike_coeff b on a.up = b.up 
     inner join prod_dds.h_uid c on c.load_src = 'LI.02' and a.id = c.uid_str
     left join prod_lookalike.threshold t on t.segment_nm = b.segment_nm
    group by
      c.h_uid_rk
     ,b.segment_nm
    having max(b.score) > max(t.threshold)
    ;
    """
    return q.replace('#ind', ind).replace('#ymd', ymd)



In [63]:
def _execute_script(cursor, script):
    '''Execute every non-blank ';'-separated statement of `script`, in order.'''
    for statement in script.split(';'):
        if statement.strip():
            cursor.execute(statement)


def update_add_segments(cursor, ymdlist, update_prod = False):
    '''Rebuild segment tables and url-part scores for the given days.

    Steps, each executed on `cursor`:
      1. per day: the url-part tables (prepare_tables) and the positive-id
         segment table (pos_id_query_seg);
      2. the 'tcs_cc_appl_started' and 'tcs_cc_appl_started_w_mail' segments
         for the whole period;
      3. the per-day up_scores_* tables (up_scores_query);
      4. only if update_prod is true: the cumulative scores table for the
         period and the swap of prod_lookalike.lookalike_coeff.

    Thresholds in prod_lookalike.threshold are NOT updated here; update them
    manually.

    Parameters:
        cursor: open Hive cursor the statements are executed on.
        ymdlist: days as 'YYYY-MM-DD' strings.
        update_prod: whether to also publish the coefficients to prod.
    '''
    script = ''
    for ymd in ymdlist:
        script += prepare_tables(ymd)
        script += pos_id_query_seg(ymd)
    # The parameter is `ymdlist`; the former references to `ymd_list` only
    # worked when a global of that name happened to exist in the kernel.
    script += tcs_cc_plat_app_started_pos_id_query(ymdlist)
    script += tcs_cc_plat_app_started_w_mail_pos_id_query(ymdlist)
    _execute_script(cursor, script)

    # Must run after the segment tables exist: up_scores_query reads the
    # segment names from them. It takes (ymd_list, segments) -- no cursor --
    # and uses the module-level `cursor` for that lookup.
    _execute_script(cursor, up_scores_query(ymdlist))

    if update_prod:
        _execute_script(cursor, average_up_scores_query(ymdlist))
        tabname = 'user_kposminin.up_scores_cumul_' + (min(ymdlist) + '_' + max(ymdlist)).replace('-','')
        _execute_script(cursor, update_prod_lookalike_coef_table_query(tabname))
        # Update thresholds manually!
        # Update thresholds manually!

## Собственно, функции к использованию:

In [None]:
# Days passed to update_add_segments in the example below.
train_ymdlist = ['2016-09-26','2016-09-27','2016-09-28']
# Example - rebuild segments and scores without touching prod tables:
#   update_add_segments(cursor, train_ymdlist, update_prod = False)
# Example - compute ids for a mailing on a given day (blank fragments are
# skipped, so it does not matter whether the script ends with ';'):
#   for q in calc_id_corpus('2016-09-29').split(';'):
#       if q.strip(): cursor.execute(q)