### Модел 2: Моделирање со помош на Beta распределба
#### Подготовка на податоците за обработка

In [1]:
import numpy as np
import os
import pandas as pd

#### Читање на влезните податоци

In [2]:
df = pd.read_csv('../data/dataset/Spenser_1_8_new_format_on_PE_markers.tsv', sep='\t')

# Keep only the rows that have at least one of the three coefficient values.
# A boolean mask is used rather than passing index *labels* to the positional
# `.iloc` indexer, which is only correct while the frame still carries the
# default RangeIndex produced by `read_csv`.
coefficient_columns = ['SAPECoeficient', 'APECoeficient', 'FPECoeficient']
df = df[df[coefficient_columns].notna().any(axis=1)]

# Preview: roughly ten evenly spaced rows. The step is clamped to 1 so that a
# frame with fewer than ten rows does not raise "slice step cannot be zero".
df[::max(len(df) // 10, 1)]

Unnamed: 0,CustomerID,Calendarday,month,PerformanceEvaluationPlanned,PerformanceEvaluationExecuted,ExternalEvaluation,TERRID,SAPECCSalesGroup,SAPECoeficient,APECoeficient,FPECoeficient,SAMEDAYVISIT,OVERDUE,UNDERDUE,OK,NOTOK,STATUS,OVERDUEDAYS
1,100000136,2017-05-04,5,1,1,0,Employee 44,Team 1,90.987,,90.987,,,,,,,
13979,100001198,2017-06-24,6,0,0,1,Employee 6,Team 8,,65.05,73.783,,,,,,,
28224,100002467,2017-06-21,6,1,1,0,Employee 5,Team 8,24.185,,24.185,,,,ok,,ok,
41553,100003788,2017-12-22,12,1,1,0,Employee 60,Team 7,93.463,,94.066,,,,,,,
55455,100005167,2017-11-24,11,1,1,0,Employee 47,Team 4,66.746,,66.746,,,,,,,
69401,100006770,2017-08-01,8,1,1,0,Employee 24,Team 11,24.575,,24.575,,,,,,,
83346,100008926,2017-06-15,6,1,1,0,Employee 65,Team 12,68.858,,73.662,,,,,,,
97263,100014629,2017-08-17,8,0,0,1,Employee 33,Team 10,,50.603,43.846,,,,,,,
111324,100021612,2017-10-11,10,0,0,1,Employee 34,Team 6,,81.15,86.05,,,,,,,
125322,100023141,2017-12-15,12,1,1,0,Employee 56,Team 11,75.455,,75.455,,,,,,,


In [3]:
# Report how many rows are left in `df` at this point (the message is in Macedonian).
print(f'Остануваат {len(df)} валидни ставки за работа.')

Остануваат 96837 валидни ставки за работа.


#### Проверка дали има подмножество ('CustomerID', 'month') за кое е извршена повеќе од 1 контролна посета

#### <font style="color: #FF0000">Спора операција!!!</font>

In [4]:
# Count the control (external) visits of every ('CustomerID', 'month') subset
# in one grouped pass, instead of re-filtering the whole frame for each pair.
external_counts = (
    df[df['ExternalEvaluation'] == 1]
    .groupby(['CustomerID', 'month'])
    .size()
)

# Display every subset for which more than one control visit was recorded.
for cid, month in external_counts[external_counts > 1].index:
    display(df[(df['CustomerID'] == cid) & (df['month'] == month)])

Не е извршена повеќе од 1 контролна посета во подмножествата ('CustomerID', 'month').

#### Собирање на примероци

In [5]:
ad_hoc_pre = []    # ad-hoc visits before the ExternalEvaluation (or with none in that month)
ad_hoc_pst = []    # ad-hoc visits after the ExternalEvaluation
planned_pre = []   # planned visits before the ExternalEvaluation (or with none in that month)
planned_pst = []   # planned visits after the ExternalEvaluation
external_val = []  # values of the ExternalEvaluation itself

# One grouped pass over ('CustomerID', 'month') replaces re-filtering the whole
# frame for every pair. The same values are collected; only the order in which
# they are appended differs from a set-driven traversal.
for _, sub_df in df.groupby(['CustomerID', 'month']):
    executed = sub_df['PerformanceEvaluationExecuted'] == 1

    # All executed ad-hoc (unplanned) visits of this subset
    ad_hoc_df = sub_df[(sub_df['PerformanceEvaluationPlanned'] == 0) & executed]

    # All executed planned visits of this subset
    planned_df = sub_df[(sub_df['PerformanceEvaluationPlanned'] == 1) & executed]

    # All control (external) visits of this subset
    external_df = sub_df[sub_df['ExternalEvaluation'] == 1]

    if external_df.empty:  # no control visit: everything counts as "pre"
        ad_hoc_pre.extend(ad_hoc_df['SAPECoeficient'].tolist())
        planned_pre.extend(planned_df['SAPECoeficient'].tolist())
        continue

    external_val.extend(external_df['APECoeficient'].tolist())

    # Only the first control visit is used as the reference day; the check
    # cell earlier in the notebook reports at most one per subset.
    external_day = external_df['Calendarday'].iloc[0]

    # Visits on the same day as the control visit match neither mask and are
    # therefore left out of both the "pre" and the "pst" samples.
    for visits, pre, pst in ((planned_df, planned_pre, planned_pst),
                             (ad_hoc_df, ad_hoc_pre, ad_hoc_pst)):
        before = visits['Calendarday'] < external_day
        after = visits['Calendarday'] > external_day
        pre.extend(visits.loc[before, 'SAPECoeficient'].tolist())
        pst.extend(visits.loc[after, 'SAPECoeficient'].tolist())

#### Запишување на излезните податоци во npy формат

In [6]:
data_out = '../data/dataset/03_m3_beta_distr_01a_data_extraction'

os.makedirs(data_out, exist_ok=True)


def _drop_nan(values):
    """Return `values` as a float array with the NaN entries removed."""
    arr = np.array(values, dtype=float)
    return arr[~np.isnan(arr)]


def _save_if_missing(name, arr):
    """Save `arr` as '<data_out>/<name>.npy' unless that file already exists.

    `np.save` appends the '.npy' suffix to the file name, so the existence
    check has to look for the suffixed path; checking the bare name never
    matches the file that was actually written.
    """
    target = f'{data_out}/{name}.npy'
    if not os.path.exists(target):
        np.save(target, arr)


ad_hoc_pre_arr = _drop_nan(ad_hoc_pre)
ad_hoc_pst_arr = _drop_nan(ad_hoc_pst)
ad_hoc_all_arr = np.concatenate((ad_hoc_pre_arr, ad_hoc_pst_arr))

planned_pre_arr = _drop_nan(planned_pre)
planned_pst_arr = _drop_nan(planned_pst)
planned_all_arr = np.concatenate((planned_pre_arr, planned_pst_arr))

external_arr = _drop_nan(external_val)

for name, arr in (('ad_hoc_pre', ad_hoc_pre_arr),
                  ('ad_hoc_pst', ad_hoc_pst_arr),
                  ('ad_hoc_all', ad_hoc_all_arr),
                  ('planned_pre', planned_pre_arr),
                  ('planned_pst', planned_pst_arr),
                  ('planned_all', planned_all_arr),
                  ('external', external_arr)):
    _save_if_missing(name, arr)