In [0]:
# In this notebook the functions used for creating the customer base are defined:

# * `get_month_customer_base` takes the tenant history table and a given date as input, and returns the contracts that are active on that date (it does not consider whether a unit churned or not; this has to be done outside of the function)
# * `get_customer_base` uses the function above to return the customer base for a list of months

In [0]:
def get_month_customer_base(config, tenant_history, id_date, n_months_to_end_for_predicting):
  """Auxiliary function, used by get_customer_base, that calculates the customer base for one month.

  A contract is part of the customer base of `id_date` when it is active on that date
  (dtleasefrom <= id_date <= dtleaseto) and its lease ends in the calendar month that
  starts `n_months_to_end_for_predicting` months after `id_date`.

  Parameters:
  config: configuration dictionary, as defined in get_config. Not used by this function; kept for a uniform signature.
  tenant_history: tenant history data source, only considering customers that should be included in the customer base (e.g., individuals). Must contain the columns htent, hunit, dtleasefrom, dtleaseto, dtoccurred, crent and cdeposit0. It is not modified.
  id_date: Specific month for which to calculate the customer base. Must be the first day of a month, otherwise the consistency assertion below fails.
  n_months_to_end_for_predicting: Number of months ahead of the contract end the customer is to be included in the customer base.

  Returns:
  Data Frame with one row per contract (htent, hunit, dtleasefrom, dtleaseto), the latest crent and cdeposit0 recorded up to id_date, and the id_date column. It only contains one id_date; the concatenation of all the data frames returned by this function is the customer base.

  Raises:
  AssertionError: if the computed contract-end window spans more than one calendar month.

  """
  cols_group_by = ['htent', 'hunit', 'dtleasefrom', 'dtleaseto']
  feature_cols = ['crent', 'cdeposit0']
  # keep only the history records of contracts active on id_date that were already recorded by then
  is_active_and_known = (
    (tenant_history.dtleasefrom <= id_date)
    & (tenant_history.dtleaseto >= id_date)
    & (tenant_history.dtoccurred <= id_date)
  )
  # sort_values returns a new frame here; sorting the filtered slice in place would
  # raise a SettingWithCopyWarning on pandas versions without copy-on-write
  filtered_tenant_history = tenant_history.loc[is_active_and_known].sort_values(cols_group_by + ['dtoccurred'])
  # take the latest value before the scoring date
  # NOTE: GroupBy.last picks the last non-null entry of each column within the group
  latest_situation = filtered_tenant_history.groupby(cols_group_by)[feature_cols].last().reset_index()
  latest_situation['id_date'] = id_date
  # compute the window of contract end dates to be considered for this month
  min_end_date_for_contract = id_date + pd.DateOffset(months = n_months_to_end_for_predicting)
  max_end_date_for_contract = id_date + pd.DateOffset(months = n_months_to_end_for_predicting + 1) - pd.DateOffset(days = 1)
  # the window must cover exactly one calendar month, which only holds when id_date is a month start
  assert min_end_date_for_contract.strftime('%Y%m') == max_end_date_for_contract.strftime('%Y%m'), 'The dates for the customers base are inconsistent'
  # filter contracts whose end date falls inside the previously computed window
  ends_in_target_month = (
    (latest_situation.dtleaseto >= min_end_date_for_contract)
    & (latest_situation.dtleaseto <= max_end_date_for_contract)
  )
  return latest_situation.loc[ends_in_target_month]

In [0]:
def get_customer_base(config, tenant_history = None, resi_detail = None, from_date = None, to_date = None, n_months_to_end_for_predicting = 3):
  """Creates the customer base of the model.

  For every month start between from_date and to_date, the contracts of individual
  tenants returned by get_month_customer_base are collected and stacked into a single
  data frame.

  Parameters:
  config: configuration dictionary, as defined in get_config.
  tenant_history: tenant history data source. If None, the function reads it with get_raw_tenant_history_data.
  resi_detail: residential detail data source. If None, the function reads it with get_raw_resi_detail_data.
  from_date: first date to include in the customer base. Must be provided.
  to_date: last date to include in the customer base. Must be provided.
  n_months_to_end_for_predicting: Number of months ahead of the contract end the customer is to be included in the customer base.

  Returns:
  Data Frame with the ID cols (htent, hunit, id_date), as well as some variables coming from the tenant_history table. An empty Data Frame is returned when no month start falls between from_date and to_date.

  Raises:
  AssertionError: if from_date or to_date is missing.

  """
  assert from_date is not None, 'The from_date is missing'
  assert to_date is not None, 'The to_date is missing'
  if tenant_history is None:
    tenant_history = get_raw_tenant_history_data(config)
  if resi_detail is None:
    resi_detail = get_raw_resi_detail_data(config)
  # only tenants flagged as individuals in the residential detail belong to the customer base
  ind_acc = resi_detail.loc[resi_detail['type'] == 'individual', 'hcode'].values
  tenant_history = tenant_history.loc[tenant_history.htent.isin(ind_acc)]
  # one scoring date per month start within [from_date, to_date]
  date_range = pd.date_range(from_date, to_date, freq = 'MS')
  # collect the monthly frames and concatenate once: growing a Data Frame with
  # pd.concat inside the loop copies all previous rows on every iteration
  monthly_customer_bases = []
  for i in date_range:
    print('--- Getting customer base for {} ---'.format(i))
    monthly_customer_bases.append(
      get_month_customer_base(config, tenant_history, id_date = i, n_months_to_end_for_predicting = n_months_to_end_for_predicting)
    )
  if not monthly_customer_bases:
    # pd.concat raises on an empty list; keep returning an empty frame for an empty range
    return pd.DataFrame({})
  return pd.concat(monthly_customer_bases, axis = 0, sort = False)