In [1]:
import pandas as pd
from datetime import date
import teradatasql
import numpy as np
import os

In [2]:
path = r'P:\\CP_PLM\\Reporting\\Report_Data\\Revenue & Subs\\'
for file in enumerate(os.listdir(path)):
    print(file)
file_index = int(input('Какой номер нужен? '))
shortname = None
for i, file in enumerate(os.listdir(path)):
    if i == file_index:
        print(f'Выбран {file}')
        shortname = file
        filename = path + shortname
        break
if not shortname:
    print('Файл не выбран')

(0, 'SAP_data_for_BO1 old.xlsb')
(1, 'SAP_data_for_BO1.xlsb')
(2, 'SAP_data_for_BO1_BU.xlsb')
(3, 'SAP_data_for_BO1_BU21.xlsb')
Какой номер нужен? 3
Выбран SAP_data_for_BO1_BU21.xlsb


### Загрузка данных SAP

In [3]:
phd = pd.read_excel(filename, sheet_name = 'phd', engine='pyxlsb',
                 names = ['version','month','year','region','base_type','tariff','account','param_value'],
                 usecols = ['version','month','year','region','base_type','tariff','account','param_value'],
                 dtype = {'version': str,
                        'month': str,
                        'year': str,
                        'region': str,
                        'base_type': str,
                        'tariff': str,
                        'account': str,
                        'param_value': float
                       }, skiprows = 1)
phd = phd.dropna()
phd.head()

Unnamed: 0,version,month,year,region,base_type,tariff,account,param_value
0,BU_2021_01,1,2021,203000,NEW,1100,Churn,-5068.0
1,BU_2021_01,1,2021,203000,NEW,1100,DATA traffic,1201475000.0
2,BU_2021_01,1,2021,203000,NEW,1100,Gross Intake,9569.0
3,BU_2021_01,1,2021,203000,NEW,1100,International MtL Revenue,6878.894
4,BU_2021_01,1,2021,203000,NEW,1100,International MtL VC,-29280.67


In [4]:
rev = pd.read_excel(filename, sheet_name = 'rev', engine='pyxlsb',
                 names = ['version','month','year','region','base_type','tariff','account','param_value'],
                 usecols = ['version','month','year','region','base_type','tariff','account','param_value'],
                 dtype = {'version': str,
                        'month': str,
                        'year': str,
                        'region': str,
                        'base_type': str,
                        'tariff': str,
                        'account': str,
                        'param_value': float
                       }, skiprows = 1)
rev = rev.dropna()
rev.head()

Unnamed: 0,version,month,year,region,base_type,tariff,account,param_value
0,BU_2021_01,1,2021,200000,#,#,Gross Margin 1,2397322.0
1,BU_2021_01,1,2021,203000,NEW,1100,Revenue,23022970.0
2,BU_2021_01,1,2021,203000,NEW,1100,Recurring Revenue,23022970.0
3,BU_2021_01,1,2021,203000,NEW,1100,Service revenue,23022970.0
4,BU_2021_01,1,2021,203000,NEW,1100,Service revenue (w/o interconnect),18882410.0


In [5]:
# phd_rev_union = phd
phd_rev_union = pd.concat([phd,rev])
phd_rev = phd_rev_union.copy() # версия датафрейма для отката, если потребуется

### Трансформация данных SAP

In [6]:
phd_rev['report_month'] = phd_rev.apply(lambda row: date(int(row.year), int(row.month), 1), axis = 1)
tariff_dict = {'1100' : 'Bundle', '2000' : 'PAYG'}
phd_rev.tariff = phd_rev.tariff.replace(tariff_dict)
phd_rev.param_value = phd_rev.param_value.apply(lambda x: round(x,4))
phd_rev.base_type = phd_rev.base_type.replace('#','NOT A')
phd_rev.tariff = phd_rev.tariff.replace('#','NO SUBS')
phd_rev.version = phd_rev.version.apply(lambda x: 'BU' if 'BU' in x else x)
versions_dict = {'AC':'AC', 'F3_2020_01':'F3_2020_01'}
versions = phd_rev.version.value_counts().index.to_list()
for ver in versions:
    if ver in versions_dict:
        phd_rev.version = phd_rev.version.replace(ver, versions_dict[ver])
#     elif ver not in versions_dict.values():
#         new_ver = input(f'Как переназовем {ver}? ')
#         phd_rev.version = phd_rev.version.replace(ver, new_ver)

In [7]:
with teradatasql.connect() as session:
    old_query = '''sel *
            from UAT_PRODUCT.DIC_SAP_CODES
            '''
    query = '''
    sel
        branch_id,
        sap_code,
        sap_name_ru
    FROM PRD2_DIC_V.BRANCH
    where SAP_CODE is not null
        '''
    dic_sap_codes = pd.read_sql(query, session)
# dic_sap_codes.head()
phd_rev_branch = pd.merge(phd_rev, dic_sap_codes, how='inner', left_on='region', right_on='SAP_CODE')
phd_rev_branch.head(3)

Unnamed: 0,version,month,year,region,base_type,tariff,account,param_value,report_month,BRANCH_ID,SAP_CODE,SAP_NAME_RU
0,BU,1,2021,203000,NEW,Bundle,Churn,-5068.0,2021-01-01,66.0,203000,Tele2-Бурятия
1,BU,1,2021,203000,NEW,Bundle,DATA traffic,1201475000.0,2021-01-01,66.0,203000,Tele2-Бурятия
2,BU,1,2021,203000,NEW,Bundle,Gross Intake,9569.0,2021-01-01,66.0,203000,Tele2-Бурятия


### Определение данных, подлежащих загрузке

In [8]:
#по комбинации версии, месяца и счета
columns = ['version','report_month','account']

#забираем все, что лежит в PRODUCT_PARAMETERS
with teradatasql.connect() as session:
    query = '''
            sel
                PARAM_1 as "version",
                REPORT_DATE as "report_month",
                PARAM_2 as "account"
            from UAT_PRODUCT.PRODUCT_PARAMETERS
            WHERE (param_1 in ('BU','AC')
                OR param_1 LIKE '%R%' OR param_1 LIKE '%F%') 
            GROUP BY 1,2,3
            '''
    ver_month_kpi_old = pd.read_sql(query, session)
ver_month_kpi_old = ver_month_kpi_old.groupby(by=columns, as_index=False).size().reset_index()
ver_month_kpi_old = ver_month_kpi_old[columns]
ver_month_kpi_new = phd_rev_branch.groupby(by=columns, as_index=False).size().reset_index()
ver_month_kpi_new = ver_month_kpi_new[columns]

In [21]:
def dataframe_difference(df1, df2, which='right_only'):
    """Find rows which are different between two DataFrames."""
    comparison_df = df1.merge(df2,
                              indicator=True,
                              how='outer')
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    return diff_df

#находим разницу между имеющимися в PRODUCT_PARAMETERS и подготовленными данными по комбинации версии, месяца и счета (right_only)
ver_month_kpi_to_insert = dataframe_difference(ver_month_kpi_old, ver_month_kpi_new)

#отрезаем phd_rev_branch по комбинации версии, месяца и счета
phd_rev_branch_ready = pd.merge(phd_rev_branch, ver_month_kpi_to_insert,  how='inner', on = columns, indicator = False)

#переформатируем таблицу по макету PRODUCT_PARAMETERS в терадате
cols = ['report_month', 'version', 'account',
        'NULL', 'NULL1', 'BRANCH_ID', 'base_type',
        'NULL2', 'NULL3', 'tariff', 'NULL4', 'param_value']
df = phd_rev_branch_ready.reindex(columns = cols,  fill_value = np.NaN).where((pd.notnull(phd_rev_branch_ready)), None)
df.head()

In [9]:
cols = ['report_month', 'version', 'account',
        'NULL', 'NULL1', 'BRANCH_ID', 'base_type',
        'NULL2', 'NULL3', 'tariff', 'NULL4', 'param_value']
df = phd_rev_branch.reindex(columns = cols,  fill_value = np.NaN).where((pd.notnull(phd_rev_branch)), None)
df.head()

Unnamed: 0,report_month,version,account,NULL,NULL1,BRANCH_ID,base_type,NULL2,NULL3,tariff,NULL4,param_value
0,2021-01-01,BU,Churn,,,66,NEW,,,Bundle,,-5068.0
1,2021-01-01,BU,DATA traffic,,,66,NEW,,,Bundle,,1201480000.0
2,2021-01-01,BU,Gross Intake,,,66,NEW,,,Bundle,,9569.0
3,2021-01-01,BU,International MtL Revenue,,,66,NEW,,,Bundle,,6878.89
4,2021-01-01,BU,International MtL VC,,,66,NEW,,,Bundle,,-29280.7


In [10]:
df.version.unique()

array(['BU'], dtype=object)

In [12]:
df.report_month.unique()

array([datetime.date(2021, 1, 1), datetime.date(2021, 2, 1),
       datetime.date(2021, 3, 1), datetime.date(2021, 4, 1),
       datetime.date(2021, 5, 1), datetime.date(2021, 6, 1),
       datetime.date(2021, 7, 1), datetime.date(2021, 8, 1),
       datetime.date(2021, 9, 1), datetime.date(2021, 10, 1),
       datetime.date(2021, 11, 1), datetime.date(2021, 12, 1)],
      dtype=object)

### BACKUP данных PRODUCT_PARAMETERS

In [11]:
# чистим PRODUCT_PARAMETERS_BACKUP и заполняем данными
with teradatasql.connect() as con:
    with con.cursor() as cur:
        cur.execute('''
            delete from UAT_PRODUCT.PRODUCT_PARAMETERS_BACKUP; 
            INSERT INTO UAT_PRODUCT.PRODUCT_PARAMETERS_BACKUP
            select *
            from uat_product.product_parameters
            WHERE (param_1 in ('BU','AC')
                OR param_1 LIKE '%R%' OR param_1 LIKE '%F%')
        ''')
print(f'{cur.rowcount} rows inserted into UAT_PRODUCT.PRODUCT_PARAMETERS_BACKUP.')

2600469 rows inserted into UAT_PRODUCT.PRODUCT_PARAMETERS_BACKUP.


### Очистка и заполнение промежуточной таблицы PRODUCT_PARAMETERS_TEST_FOR_SAP

In [13]:
with teradatasql.connect() as con:
    with con.cursor() as cur:

        print('deleting from uat_product.product_parameters_test_for_sap...')
        cur.execute('''
            delete from uat_product.product_parameters_test_for_sap;
        ''')
        print(f'{cur.rowcount} rows deleted from uat_product.product_parameters_test_for_sap')
        
        batchsize = 100000
        print('inserting into uat_product.product_parameters_test_for_sap...')
        for num in range(0, len(df), batchsize):
            cur.executemany(f'''
             INSERT into uat_product.product_parameters_test_for_sap ({','.join('?'*len(df.columns))})
            ''',
                [tuple(row) for row in df.iloc[num:num+batchsize,:].itertuples(index=False)]
                )
        print(f'{len(df)} rows inserted into uat_product.product_parameters_test_for_sap.')

deleting from uat_product.product_parameters_test_for_sap...
178571 rows deleted from uat_product.product_parameters_test_for_sap
inserting into uat_product.product_parameters_test_for_sap...
89508 rows inserted into uat_product.product_parameters_test_for_sap.


### Очистка PRODUCT_PARAMETERS (при необходимости)

In [14]:
with teradatasql.connect() as con:
    with con.cursor() as cur:
        cur.execute('''
            delete from UAT_PRODUCT.PRODUCT_PARAMETERS
            WHERE param_1 in ('BU')
                AND report_date between date'2021-01-01' and date'2021-12-31'
        ''')      
        print(f'{cur.rowcount} rows deleted from uat_product.product_parameters')

2630421 rows deleted from uat_product.product_parameters


### Заполнение PRODUCT_PARAMETERS

In [15]:
with teradatasql.connect() as con:
    with con.cursor() as cur:
        cur.execute('''
            insert into UAT_PRODUCT.PRODUCT_PARAMETERS
                sel
                    cal.calendar_date as REPORT_DATE,
                    product_parameters_test_for_sap.PARAM_1,
                    product_parameters_test_for_sap.PARAM_2,
                    product_parameters_test_for_sap.PARAM_3,
                    product_parameters_test_for_sap.PARAM_4,
                    product_parameters_test_for_sap.BRANCH_ID,
                    product_parameters_test_for_sap.BASE_TYPE,
                    product_parameters_test_for_sap.TP_ID_1,
                    product_parameters_test_for_sap.TP_ID_2,
                    product_parameters_test_for_sap.TARIFF_1,
                    product_parameters_test_for_sap.TARIFF_2,
                    PARAM_VALUE/EXTRACT(DAY FROM LAST_DAY(REPORT_DATE)) as PARAM_VALUE
                 from uat_product.product_parameters_test_for_sap
                 left join Sys_Calendar.BusinessCalendar cal
                    on product_parameters_test_for_sap.REPORT_DATE = trunc(cal.calendar_date,'mon')
                 where product_parameters_test_for_sap.PARAM_1 <> 'AC'

                 union all

                 sel *
                 from uat_product.product_parameters_test_for_sap
                  where product_parameters_test_for_sap.PARAM_1 = 'AC'
        ''')      
        print(f'{cur.rowcount} rows inserted into UAT_PRODUCT.PRODUCT_PARAMETERS.')

2722554 rows inserted into UAT_PRODUCT.PRODUCT_PARAMETERS.


In [None]:
### Очистка PRODUCT_PARAMETERS при необходимости заменить устаревшие данные
# rows_counter = 0
# with teradatasql.connect() as con:
#     with con.cursor() as cur:
#         for row in ver_month_kpi_new.itertuples(index=False):
#             version = row[0]
#             report_month = row[1]
#             kpi = row[2]
#             cur.execute(
#                   f'''
#                         delete from uat_product.product_parameters
#                         WHERE PARAM_1='{version}'
#                               AND REPORT_DATE = DATE'{report_month}'
#                               AND PARAM_2 in {kpi}
#                         ;
#                     '''
#                  )
#             rows_counter = rows_counter + cur.rowcount
#         print(f'{rows_counter} rows deleted on {tuple(kpis)} between {start_date} and {end_date}')

In [None]:
# newfilename = r'*.xlsx'
# phd_rev_branch.to_excel(newfilename, index = False)