In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import pickle
import datetime

# load data

In [2]:
# small dataset
dataset = pd.read_pickle('data\simulations_for_ucp_procced_head_100.pickle')
dataset.head(5)

prediction_date,2023-02-02,2023-02-03,2023-02-04,2023-02-05,2023-02-06,2023-02-07,2023-02-08,2023-02-09,2023-02-10,2023-02-11,...,2023-07-22,2023-07-23,2023-07-24,2023-07-25,2023-07-26,2023-07-27,2023-07-28,2023-07-29,2023-07-30,2023-07-31
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
00007e1b-ae0a-3989-b3f1-759732362837,0,1,1,1,1,0,1,0,1,1,...,1,1,1,1,0,1,1,0,1,1
0000ca52-4ef7-303a-a2ca-688a36e79620,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0001f41c-cda8-4ea9-a349-65fd3441581e,1,0,1,1,0,1,1,0,1,1,...,1,1,1,1,1,1,1,0,1,1
0001f720-64bc-361a-ba9f-5a21cbfbcbcb,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


In [3]:
#X = pickle.load(open("data\simulation_for_ucp_has_txn.pickle", "rb"))
#X = pd.read_pickle('data\simulation_for_ucp_has_txn.pickle')
provider_ids = pickle.load(open("data\provider_ids.pickle", "rb"))

In [4]:
N_USERS = dataset.shape[0]
N_DAYS = dataset.shape[1]

# function

In [5]:
def create_is_effective_crawl_df(data, crawl):
    is_effective_crawl = np.empty((N_USERS, N_DAYS))
    is_effective_crawl[:] = np.nan
    for i in range(N_USERS):
        one_slice_data = data.iloc[i]
        one_slice_crawl = crawl[i]
        buga = one_slice_data.cumsum()[one_slice_crawl]
        is_effective_crawl[i, one_slice_crawl] = np.diff(buga, prepend=0) >0
    
    return pd.DataFrame(is_effective_crawl, columns=data.columns)

In [6]:
def calc_days_with_updates(data, time_slot, exclusion_window=7):
    # data contain the relevant history data for certain date
    columns_in_slot = data.columns[-time_slot:- exclusion_window]
    return pd.DataFrame(data[columns_in_slot].sum(axis=1)/(time_slot-exclusion_window), columns=[f'n_updates_{time_slot}d'])

In [7]:
def calc_time_between_updates(data, time_slot, exclusion_window=7):
    # data contain the relevant history data for certain date
    columns_in_slot = data.columns[-time_slot:- exclusion_window]
    index_df = data[columns_in_slot].apply(lambda row: row[row == 1].index, axis=1)
    mean_list, max_list, min_list = [],[],[]
    for updates_dates in index_df:
        updates_dates_indexes = [data.columns.get_loc(date) for date in updates_dates]
        xdiff = [updates_dates_indexes[n]-updates_dates_indexes[n-1] for n in range(1,len(updates_dates_indexes))]        
        if xdiff == []:
            mean_list.append(0)
            max_list.append(0)
            min_list.append(0)
        else:
            mean_list.append(np.mean(xdiff))
            max_list.append(np.max(xdiff))
            min_list.append(np.min(xdiff))
            
    return pd.DataFrame({f'mean_time_{time_slot}d': mean_list, f'max_time_{time_slot}d': max_list, f'min_time_{time_slot}d': min_list}, index=data.index)
        

In [8]:
def calc_days_since_last_crawl(row, current_date):
    if len(row[row>=0]) == 0:
        return np.nan
    date_str = row[row >= 0].index[-1]
    last_crawl_date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    delta = current_date - last_crawl_date
    return delta.days

In [9]:
def calc_n_updates_days(data):
    data = data.to_numpy()
    n_updates = data.sum()
    n_days = data.shape[0]*data.shape[1]
    return n_updates/n_days
    

In [10]:
start_date_column = 120  # Adjust this to your desired starting column
# Get the column names (dates) from the dataset
date_columns = dataset.columns[start_date_column:]
# create fetures dataset
features_df = pd.melt(dataset[date_columns], ignore_index=False, value_name='has_txn')

time_slot = [30, 60, 90, 120]
for slot in time_slot:
    n_update_col = f'n_updates_{slot}d'
    mean_col = f'mean_time_{slot}d'
    max_col = f'max_time_{slot}d'
    min_col = f'min_time_{slot}d'
    cols_list = [n_update_col, mean_col, max_col, min_col]
    for col_name in cols_list:
        features_df[col_name] = 0
    for date in date_columns:
        end_index = dataset.columns.get_loc(date)
        start_index = end_index - start_date_column 
        columns_in_slot = dataset.columns[start_index:end_index]
        date_data = dataset[columns_in_slot]
        features_df.loc[features_df['prediction_date'] == date,n_update_col] = calc_days_with_updates(date_data, slot)
        features_df.loc[features_df['prediction_date'] == date, [mean_col, max_col, min_col]] = calc_time_between_updates(date_data, slot)
features_df

Unnamed: 0_level_0,prediction_date,has_txn,n_updates_30d,mean_time_30d,max_time_30d,min_time_30d,n_updates_60d,mean_time_60d,max_time_60d,min_time_60d,n_updates_90d,mean_time_90d,max_time_90d,min_time_90d,n_updates_120d,mean_time_120d,max_time_120d,min_time_120d
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-02,0,0.260870,4.200000,8,2,0.301887,3.000000,8,1,0.253012,3.750000,12,1,0.221239,4.208333,14,1
00007e1b-ae0a-3989-b3f1-759732362837,2023-06-02,1,0.826087,1.222222,2,1,0.792453,1.243902,3,1,0.819277,1.223881,3,1,0.805310,1.233333,3,1
0000ca52-4ef7-303a-a2ca-688a36e79620,2023-06-02,0,0.000000,0.000000,0,0,0.037736,6.000000,6,6,0.036145,10.000000,14,6,0.044248,16.500000,42,4
0001f41c-cda8-4ea9-a349-65fd3441581e,2023-06-02,1,0.434783,2.333333,6,1,0.509434,2.000000,7,1,0.542169,1.840909,7,1,0.522124,1.931034,7,1
0001f720-64bc-361a-ba9f-5a21cbfbcbcb,2023-06-02,0,0.000000,0.000000,0,0,0.037736,12.000000,12,12,0.024096,12.000000,12,12,0.026549,37.500000,63,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00187410-3f06-30db-a9e6-97c43ab763d0,2023-07-31,0,0.043478,0.000000,0,0,0.075472,9.333333,18,3,0.060241,12.250000,21,3,0.070796,12.571429,26,1
00189d2c-d177-4ace-83bb-6e5807b73ef0,2023-07-31,0,0.478261,2.000000,4,1,0.509434,1.923077,6,1,0.542169,1.818182,6,1,0.548673,1.770492,6,1
0018cd3e-e488-4548-8a95-f251b89a9b80,2023-07-31,0,0.391304,2.250000,5,1,0.396226,2.500000,6,1,0.421687,2.352941,8,1,0.380531,2.547619,8,1
0018d6bb-5ee1-4c82-9f84-4975ab66d1e9,2023-07-31,0,0.000000,0.000000,0,0,0.000000,0.000000,0,0,0.000000,0.000000,0,0,0.000000,0.000000,0,0


# Provider based features

In [11]:
dataset_with_provider = dataset.join(provider_ids)

In [12]:
weekday0 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 0]
weekday1 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 1]
weekday2 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 2]
weekday3 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 3]
weekday4 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 4]
weekday5 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 5]
weekday6 = [date for date in dataset.columns if datetime.datetime.strptime(date, '%Y-%m-%d').weekday() == 6]
weekday_list = [weekday0, weekday1, weekday2, weekday3, weekday4, weekday5, weekday6]

In [13]:
weekday_dict = {}
for i, weekday in enumerate(weekday_list):
    weekday_data = dataset[weekday]
    weekday_dict[i] = calc_n_updates_days(weekday_data)
weekday_dict

{0: 0.2592307692307692,
 1: 0.3468,
 2: 0.3132,
 3: 0.3180769230769231,
 4: 0.3515384615384615,
 5: 0.10346153846153847,
 6: 0.17153846153846153}

In [14]:
provider_feature_df = pd.DataFrame(index=dataset_with_provider['provider_id'].unique())
provider_feature_df

867c4a38-0d75-46ff-a4c2-f44cb63be2c9
280a19b6-a767-4e7f-b6b4-e422d6876897
""
89e6e230-1333-448b-9800-dc2d09a85e99
63260486-7618-4660-9445-7e4885e99042
50761787-31da-4b17-8048-c6063b614b80
50b5a2d8-06d1-4d8d-b8b3-2b945e7c35d6
a12f6956-9aca-4550-8caa-2e2f9532674c
9b9f5ceb-dea2-49db-8476-4530c1b0c92d
22f309a8-1746-49a7-bf6c-678729834573
f10c1b75-2c77-499f-953a-d98eb5302f3d


In [15]:
features_df_with_provider = features_df.join(provider_ids)
features_df_with_provider.head()

Unnamed: 0_level_0,prediction_date,has_txn,n_updates_30d,mean_time_30d,max_time_30d,min_time_30d,n_updates_60d,mean_time_60d,max_time_60d,min_time_60d,n_updates_90d,mean_time_90d,max_time_90d,min_time_90d,n_updates_120d,mean_time_120d,max_time_120d,min_time_120d,provider_id
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-02,0,0.26087,4.2,8,2,0.301887,3.0,8,1,0.253012,3.75,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-03,0,0.217391,4.75,8,3,0.301887,3.0,8,1,0.253012,3.75,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-04,0,0.217391,4.75,8,3,0.301887,3.0,8,1,0.253012,3.75,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-05,0,0.173913,5.333333,8,3,0.301887,3.0,8,1,0.253012,3.75,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-06,0,0.173913,5.333333,8,3,0.301887,3.0,8,1,0.253012,3.75,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9


In [16]:
provider_feature_df = pd.DataFrame(index=dataset_with_provider['provider_id'].unique())

for provider in dataset_with_provider['provider_id'].unique():
    mask_provider = dataset_with_provider['provider_id'] == provider
    provider_data = dataset_with_provider[mask_provider].drop('provider_id', axis=1)
    n_update_provider = calc_n_updates_days(provider_data)
    provider_feature_df.loc[provider, 'provider_n_updates'] = n_update_provider
    features_df_with_provider.loc[mask_provider, 'provider_n_updates'] = n_update_provider
    for i, weekday in enumerate(weekday_list):
        weekday_data = provider_data[weekday]
        mask_weekday = features_df_with_provider['prediction_date'].isin(weekday)
        n_update_weekday = calc_n_updates_days(weekday_data)
        provider_feature_df.loc[provider, f'provider_n_updates_day{i}'] = n_update_weekday
        features_df_with_provider.loc[mask_weekday & mask_provider, 'provider_n_updates_weekday'] = n_update_weekday
        features_df_with_provider.loc[mask_weekday, 'n_updates_weekday'] = n_update_weekday
features_df_with_provider

  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days
  return n_updates/n_days


Unnamed: 0_level_0,prediction_date,has_txn,n_updates_30d,mean_time_30d,max_time_30d,min_time_30d,n_updates_60d,mean_time_60d,max_time_60d,min_time_60d,...,max_time_90d,min_time_90d,n_updates_120d,mean_time_120d,max_time_120d,min_time_120d,provider_id,provider_n_updates,provider_n_updates_weekday,n_updates_weekday
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-02,0,0.260870,4.200000,8,2,0.301887,3.000000,8,1,...,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9,0.349653,0.459135,0.730769
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-03,0,0.217391,4.750000,8,3,0.301887,3.000000,8,1,...,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9,0.349653,0.000000,0.000000
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-04,0,0.217391,4.750000,8,3,0.301887,3.000000,8,1,...,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9,0.349653,0.314904,0.115385
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-05,0,0.173913,5.333333,8,3,0.301887,3.000000,8,1,...,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9,0.349653,0.375000,0.576923
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-06,0,0.173913,5.333333,8,3,0.301887,3.000000,8,1,...,12,1,0.221239,4.208333,14,1,867c4a38-0d75-46ff-a4c2-f44cb63be2c9,0.349653,0.472500,0.560000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-27,0,0.304348,2.166667,5,1,0.396226,2.400000,9,1,...,9,1,0.460177,2.176471,9,1,88355d10-e310-465b-a173-953dacf0b96d,0.444444,0.576923,0.576923
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-28,0,0.304348,2.166667,5,1,0.396226,2.400000,9,1,...,9,1,0.451327,2.200000,9,1,88355d10-e310-465b-a173-953dacf0b96d,0.444444,0.730769,0.730769
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-29,0,0.347826,2.285714,5,1,0.415094,2.428571,9,1,...,9,1,0.451327,2.240000,9,1,88355d10-e310-465b-a173-953dacf0b96d,0.444444,0.000000,0.000000
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-30,0,0.347826,2.285714,5,1,0.415094,2.428571,9,1,...,9,1,0.442478,2.224490,9,1,88355d10-e310-465b-a173-953dacf0b96d,0.444444,0.115385,0.115385


# Crawling based features: 

In [17]:
np.random.seed(33)
crawl = np.random.binomial(1, 0.5, size=(N_USERS, N_DAYS)).astype(bool)
crawl_df = create_is_effective_crawl_df(dataset, crawl)
crawl_df = crawl_df.set_index(dataset.index)

In [18]:
date_columns = dataset.columns[start_date_column:]

for date in date_columns:
    end_index = dataset.columns.get_loc(date)
    start_index = end_index - 7 
    columns_in_slot = dataset.columns[start_index:end_index]
    last_days_df = crawl_df[columns_in_slot]
    features_df_with_provider.loc[features_df_with_provider['prediction_date'] == date, 
                             'n_successful_crawls'] = pd.DataFrame(last_days_df[last_days_df == 1].count(axis=1),
                                                                   columns=['n_successful_crawls'])
    features_df_with_provider.loc[features_df_with_provider['prediction_date'] == date, 
                             'n_unsuccessful_crawls'] = pd.DataFrame(last_days_df[last_days_df == 0].count(axis=1),
                                                                     columns=['n_unsuccessful_crawls'])
    features_df_with_provider.loc[features_df_with_provider['prediction_date'] == date, 
                             'is_last_crawl_succed'] = pd.DataFrame(last_days_df.apply(
        lambda row: row[row>=0][-1] if len(row[row>=0])>0 else np.nan , axis=1), columns=['is_last_crawl_succed'])
    features_df_with_provider.loc[features_df_with_provider['prediction_date'] == date, 
                             'n_days_since_last_crawl'] = pd.DataFrame(last_days_df.apply(calc_days_since_last_crawl, axis=1, current_date = date), 
                                                                        columns=['n_days_since_last_crawl'])
features_df_with_provider = features_df_with_provider.drop(['has_txn', 'provider_id'], axis=1)
features_df_with_provider


Unnamed: 0_level_0,prediction_date,n_updates_30d,mean_time_30d,max_time_30d,min_time_30d,n_updates_60d,mean_time_60d,max_time_60d,min_time_60d,n_updates_90d,...,mean_time_120d,max_time_120d,min_time_120d,provider_n_updates,provider_n_updates_weekday,n_updates_weekday,n_successful_crawls,n_unsuccessful_crawls,is_last_crawl_succed,n_days_since_last_crawl
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-02,0.260870,4.200000,8,2,0.301887,3.000000,8,1,0.253012,...,4.208333,14,1,0.349653,0.459135,0.730769,3.0,3.0,0.0,1.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-03,0.217391,4.750000,8,3,0.301887,3.000000,8,1,0.253012,...,4.208333,14,1,0.349653,0.000000,0.000000,3.0,4.0,0.0,1.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-04,0.217391,4.750000,8,3,0.301887,3.000000,8,1,0.253012,...,4.208333,14,1,0.349653,0.314904,0.115385,2.0,5.0,0.0,1.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-05,0.173913,5.333333,8,3,0.301887,3.000000,8,1,0.253012,...,4.208333,14,1,0.349653,0.375000,0.576923,2.0,5.0,0.0,1.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-06,0.173913,5.333333,8,3,0.301887,3.000000,8,1,0.253012,...,4.208333,14,1,0.349653,0.472500,0.560000,2.0,5.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-27,0.304348,2.166667,5,1,0.396226,2.400000,9,1,0.397590,...,2.176471,9,1,0.444444,0.576923,0.576923,1.0,0.0,1.0,2.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-28,0.304348,2.166667,5,1,0.396226,2.400000,9,1,0.385542,...,2.200000,9,1,0.444444,0.730769,0.730769,2.0,0.0,1.0,1.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-29,0.347826,2.285714,5,1,0.415094,2.428571,9,1,0.397590,...,2.240000,9,1,0.444444,0.000000,0.000000,2.0,1.0,0.0,1.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-30,0.347826,2.285714,5,1,0.415094,2.428571,9,1,0.397590,...,2.224490,9,1,0.444444,0.115385,0.115385,2.0,1.0,0.0,2.0


# Target - is a crawl 'today' will succed

In [19]:
date_columns = dataset.columns[start_date_column:]

for date in date_columns:
    date_index = dataset.columns.get_loc(date)
    fixed_crawl = crawl.copy()
    for i in range(len(fixed_crawl)):
        fixed_crawl[i][date_index] = True 
    crawl_df = create_is_effective_crawl_df(dataset, fixed_crawl)
    crawl_df = crawl_df.set_index(dataset.index)
    features_df_with_provider.loc[features_df_with_provider['prediction_date'] == date, 'target'] = crawl_df[date]
features_df_with_provider

Unnamed: 0_level_0,prediction_date,n_updates_30d,mean_time_30d,max_time_30d,min_time_30d,n_updates_60d,mean_time_60d,max_time_60d,min_time_60d,n_updates_90d,...,max_time_120d,min_time_120d,provider_n_updates,provider_n_updates_weekday,n_updates_weekday,n_successful_crawls,n_unsuccessful_crawls,is_last_crawl_succed,n_days_since_last_crawl,target
credentialset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-02,0.260870,4.200000,8,2,0.301887,3.000000,8,1,0.253012,...,14,1,0.349653,0.459135,0.730769,3.0,3.0,0.0,1.0,0.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-03,0.217391,4.750000,8,3,0.301887,3.000000,8,1,0.253012,...,14,1,0.349653,0.000000,0.000000,3.0,4.0,0.0,1.0,0.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-04,0.217391,4.750000,8,3,0.301887,3.000000,8,1,0.253012,...,14,1,0.349653,0.314904,0.115385,2.0,5.0,0.0,1.0,0.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-05,0.173913,5.333333,8,3,0.301887,3.000000,8,1,0.253012,...,14,1,0.349653,0.375000,0.576923,2.0,5.0,0.0,1.0,0.0
0000349e-0a0f-3373-a4f1-05bce8b528b3,2023-06-06,0.173913,5.333333,8,3,0.301887,3.000000,8,1,0.253012,...,14,1,0.349653,0.472500,0.560000,2.0,5.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-27,0.304348,2.166667,5,1,0.396226,2.400000,9,1,0.397590,...,9,1,0.444444,0.576923,0.576923,1.0,0.0,1.0,2.0,1.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-28,0.304348,2.166667,5,1,0.396226,2.400000,9,1,0.385542,...,9,1,0.444444,0.730769,0.730769,2.0,0.0,1.0,1.0,0.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-29,0.347826,2.285714,5,1,0.415094,2.428571,9,1,0.397590,...,9,1,0.444444,0.000000,0.000000,2.0,1.0,0.0,1.0,0.0
0018df90-340f-4926-ba6c-ff3b0d3fe325,2023-07-30,0.347826,2.285714,5,1,0.415094,2.428571,9,1,0.397590,...,9,1,0.444444,0.115385,0.115385,2.0,1.0,0.0,2.0,0.0


In [20]:
features_df_with_provider.to_csv('feature_dataset.csv')