In [0]:
from geopy.distance import distance

In [0]:
def measure_distance_to_loc(lat, lon, lat_origin = 25.276987, lon_origin = 55.296249):
  """Given a point defined by its latitude and longitude, returns the distance to a certain origin, by default specified to be Dubai. This function is used to create variables based on the nationality of the customer.

  Parameters:
  lat: Latitude of the point.
  lon: Longitude of the point.
  lat_origin: Latitude of the origin, by default set to Dubai.
  lon_origin: Longitude of the origin, by default set to Dubai.

  Returns:
  Distance in kilometers from the specified point to the origin, as computed by geopy's `distance`. NaN is returned when either coordinate of the point is NaN.

  """
  # Adding the coordinates first lets a single check catch a NaN in either one.
  if np.isnan(lat + lon):
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan
  return distance((lat, lon), (lat_origin, lon_origin)).km


In [0]:
def add_demo_feats(config, mt, resi_detail = None, coords = None):
  """Adds demographic features to master table. The number of rows is always unaltered. Some nationalities from resi detail are changed to match the format from the coords file.

  Side effects: when `resi_detail` or `coords` are passed in, they are modified in place (`coords` gains a 'distance_to_dubai' column and the 'nationality' values of `resi_detail` are corrected).

  Parameters:
  config: configuration dictionary, as defined in get_config. The origin is read from config['model_params']['coords_dubai'] ('lat' and 'lon' keys).
  mt: master table to which demographic features are added. It is joined through its 'htent' column.
  resi_detail: residential detail data source. If None, it is read from the appropriate file.
  coords: Dataset specifying the coordinates of each country. If None, it is read from the appropriate file.

  Returns:
  Data frame with new columns related to demographic features: 'hcode', 'latitude', 'longitude', 'distance_to_dubai' and one dummy column per value of 'individual_categry'.

  Raises:
  AssertionError: if joining the demographic features changes the number of rows of the master table.

  """

  if resi_detail is None:
    resi_detail = get_raw_df(config, 'residential_detail')
  if coords is None:
    coords = get_raw_df(config, 'coords')

  origin = config['model_params']['coords_dubai']
  coords.loc[:, 'distance_to_dubai'] = coords.apply(
    lambda row: measure_distance_to_loc(row.latitude, row.longitude, origin['lat'], origin['lon']),
    axis = 1)

  # Correcting some country names so that they match the names used in coords.
  # The original author noted that pd.replace did not work properly with the
  # installed pandas version, hence the explicit .loc assignments.
  # NOTE(review): 'palestine' is mapped to 'lebanon' -- presumably a deliberate
  # proxy for a country missing from the coords file; confirm.
  nationality_fixes = {
    'palestine': 'lebanon',
    'serbia_and_montenegro': 'serbia',
    'philippine': 'philippines',
    'ethopia': 'ethiopia',
    'korea_south': 'south_korea',
    'korea_north': 'north_korea',
    'congo_democratic_republic_of_the': 'congo_drc',
    'macedonia': 'macedonia_fyrom',
    'burma': 'myanmar_burma',
    'united_states_of_america': 'united_states',
    'emirates': 'united_arab_emirates',
    'kyrgistan': 'kyrgyzstan',
    'mianmaar': 'myanmar_burma',
    'malysia': 'malaysia',
    'saudia': 'saudi_arabia',
    'trinidad_tobago': 'trinidad_and_tobago',
    'ukranain': 'ukraine',
    'tunis': 'tunisia',
  }
  for raw_name, coords_name in nationality_fixes.items():
    resi_detail.loc[resi_detail.nationality == raw_name, 'nationality'] = coords_name

  demo_features = pd.merge(
    resi_detail[['hcode', 'nationality', 'individual_categry']],
    coords[['name', 'latitude', 'longitude', 'distance_to_dubai']],
    left_on = 'nationality', right_on = 'name', how = 'left')
  demo_features = demo_features.drop(columns = ['nationality', 'name'])
  new_mt = pd.merge(mt, demo_features, left_on = 'htent', right_on = 'hcode', how = 'left')

  # Customers whose country could not be identified are tolerated here; their
  # coordinates are filled with fallback values below instead of failing.
  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the new demographic features is not correct'

  new_mt = pd.get_dummies(new_mt, dummy_na=False, columns=['individual_categry'])

  # replacing nas. Assigning the result (instead of calling fillna in place on
  # a column) keeps this working when pandas Copy-on-Write is enabled.
  new_mt = new_mt.fillna({'latitude': 24, 'longitude': 69, 'distance_to_dubai': 2500})

  return new_mt
  

In [0]:
def add_unit_feats(config, mt, unit = None, prop = None, unit_type = None):
  """Adds unit features to master table. The number of rows is always unaltered.

  The features added are the columns of the property table, the unit size ('dsqft'), the unit type description, beds and baths ('sdesc', 'ibeds', 'ibaths'), the 'is_*_unit' flags derived from the description, the 'community' column and one 'saddr2_*' dummy column per community.

  Parameters:
  config: configuration dictionary, as defined in get_config.
  mt: master table to which unit features are added. It is joined through its 'hunit' column.
  unit: unit data source. If None, it is read from the appropriate file.
  prop: property data source. If None, it is read from the appropriate file.
  unit_type: unit type data source. If None, it is read from the appropriate file.

  Returns:
  Data frame with new columns related to unit features.

  Raises:
  AssertionError: if joining the unit features changes the number of rows of the master table.

  """

  if unit is None:
    unit = get_raw_df(config, 'unit')
  if prop is None:
    prop = get_raw_df(config, 'property')
  if unit_type is None:
    unit_type = get_raw_df(config, 'unit_type')

  # 'hmy' is the key of every source table; in the unit table it identifies the unit.
  unit_mt = unit[['hproperty', 'hmy', 'hunittype', 'dsqft']].rename(columns = {'hmy': 'hunit'})
  unit_mt = pd.merge(unit_mt, prop, left_on = 'hproperty', right_on = 'hmy', how = 'left').drop(columns = 'hmy')
  unit_mt = pd.merge(unit_mt, unit_type[['hmy', 'sdesc', 'ibeds', 'ibaths']], left_on = 'hunittype', right_on = 'hmy', how = 'left').drop(columns = 'hmy')

  unit_mt['sdesc'] = unit_mt['sdesc'].fillna('unknown')

  # A unit gets a flag when its description contains the corresponding token.
  type_tokens = {
    'is_luxury_unit': 'lux',
    'is_standard_unit': 'sta',
    'is_economy_unit': 'economy',
    'is_basic_unit': 'basic',
  }
  descriptions = unit_mt['sdesc'].astype(str)
  for flag_col, token in type_tokens.items():
    unit_mt[flag_col] = descriptions.str.contains(token, regex = False)
  unit_mt['is_other_unit'] = ~unit_mt[list(type_tokens)].any(axis = 1)

  # Every community outside this list (including missing ones) is grouped as 'other'.
  dam_units = ['al_khail_gate', 'shorooq', 'ghoroob', 'remraam', 'layan']
  unit_mt.loc[~unit_mt['saddr2'].isin(dam_units), 'saddr2'] = 'other'
  unit_mt['community'] = unit_mt['saddr2']

  unit_mt = pd.get_dummies(unit_mt, dummy_na=False, columns=['saddr2'])

  # get_dummies only creates columns for communities present in the data, so
  # the missing ones are added here to keep the output schema stable.
  community_cols = ['saddr2_' + name for name in dam_units]
  for col in community_cols + ['saddr2_other']:
    if col not in unit_mt.columns:
      unit_mt[col] = 0

  new_mt = pd.merge(mt, unit_mt, on = 'hunit', how = 'left')
  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the new unit features is not correct'

  # replacing nas (rows of mt without a matching unit). Assigning the result
  # keeps this working when pandas Copy-on-Write is enabled.
  fill_values = {'ibaths': -1, 'ibeds': -1, 'dsqft': 1300, 'saddr2_other': True}
  fill_values.update({col: 0 for col in community_cols})
  new_mt = new_mt.fillna(fill_values)

  # formatting data types
  for flag_col in list(type_tokens) + ['is_other_unit']:
    new_mt[flag_col] = new_mt[flag_col].fillna(0).astype(bool)

  new_mt['ibaths'] = new_mt['ibaths'].astype(int, errors = 'ignore')
  new_mt['ibeds'] = new_mt['ibeds'].astype(int, errors = 'ignore')

  return new_mt

In [0]:
def add_avg_price_feat(config, mt):
  """Adds price ratio features to master table. The number of rows is always unaltered.

  The features added are:
  - Ratio between rent and average rent for units scored in that month.
  - Ratio between price per square feet and average price per square feet in that month.

  The averages are computed globally, at community level and at type level (e.g., luxury units).
  The product and the mean of the three ratios are added as well. Ratios that cannot be computed are set to 1.

  Parameters:
  config: configuration dictionary, as defined in get_config. It is not used by this function.
  mt: master table to which price features are added. It must contain the columns 'id_date', 'crent', 'price_per_sq_ft' and 'community'; unit types are taken from the 'is_*_unit' columns.

  Returns:
  Data frame with new columns related to price ratios features.

  Raises:
  AssertionError: if any join changes the number of rows, or if a row is flagged with more than one unit type.

  """

  # Selecting several columns from a groupby requires a list: tuple-style
  # selection (groupby(...)['a', 'b']) raises on pandas >= 2.0.
  price_cols = ['crent', 'price_per_sq_ft']

  # global average price
  monthly_price = mt.groupby('id_date')[price_cols].mean().reset_index()
  monthly_price.columns = ['id_date', 'avg_monthly_price', 'avg_price_per_sq_ft']

  new_mt = pd.merge(mt, monthly_price, on = 'id_date', how = 'left')

  new_mt['ratio_price_avg'] = new_mt['crent'] / new_mt['avg_monthly_price']
  new_mt['ratio_price_sqft_avg'] = new_mt['price_per_sq_ft'] / new_mt['avg_price_per_sq_ft']

  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the new ratio of the price and the monthly average price is wrong'

  # Only this average is removed; the remaining average columns stay in the output.
  new_mt = new_mt.drop(columns = 'avg_monthly_price')

  # average prices per community
  monthly_price_comm = mt.groupby(['id_date', 'community'])[price_cols].mean().reset_index()
  monthly_price_comm.columns = ['id_date', 'community', 'avg_monthly_price_comm', 'avg_price_per_sq_ft_comm']

  new_mt = pd.merge(new_mt, monthly_price_comm, on = ['id_date', 'community'], how = 'left')

  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the new ratio of the price and the monthly average price per community is wrong'
  new_mt['ratio_price_avg_comm'] = new_mt['crent'] / new_mt['avg_monthly_price_comm']
  new_mt['ratio_price_sqft_avg_comm'] = new_mt['price_per_sq_ft'] / new_mt['avg_price_per_sq_ft_comm']

  # average prices per unit type
  # The 'is_*_unit' flags are collapsed into a single code (1 for the first
  # flag column, 2 for the second, ...; 0 when no flag is set) used as grouping key.
  is_unit_cols = [col for col in new_mt.columns if col.startswith('is_') and col.endswith('_unit')]
  unit_type_code = np.zeros(new_mt.shape[0])
  for code, col in enumerate(is_unit_cols, start = 1):
    flagged = new_mt[col] * code
    # A row carrying two flags would get a meaningless summed code.
    assert not np.logical_and(flagged > 0, unit_type_code > 0).any(), 'For column {}, something went wrong when computing unit type1'.format(col)
    unit_type_code = unit_type_code + flagged
  new_mt['aux_col'] = unit_type_code
  monthly_price_type = new_mt.groupby(['id_date', 'aux_col'])[price_cols].mean().reset_index()
  monthly_price_type.columns = ['id_date', 'aux_col', 'avg_monthly_price_type', 'avg_price_per_sq_ft_type']

  new_mt = pd.merge(new_mt, monthly_price_type, on = ['id_date', 'aux_col'], how = 'left')
  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the new ratio of the price and the monthly average price per type is wrong'
  new_mt['ratio_price_avg_type'] = new_mt['crent'] / new_mt['avg_monthly_price_type']
  new_mt['ratio_price_sqft_avg_type'] = new_mt['price_per_sq_ft'] / new_mt['avg_price_per_sq_ft_type']
  new_mt = new_mt.drop(columns = 'aux_col')

  # replacing NAs. Assigning the result (instead of calling fillna in place on
  # each column) keeps this working when pandas Copy-on-Write is enabled.
  ratio_cols = [
    'ratio_price_avg', 'ratio_price_sqft_avg',
    'ratio_price_avg_comm', 'ratio_price_sqft_avg_comm',
    'ratio_price_avg_type', 'ratio_price_sqft_avg_type',
  ]
  new_mt[ratio_cols] = new_mt[ratio_cols].fillna(1)

  # additional synthetic variables
  new_mt['ratio_price_mult'] = new_mt['ratio_price_avg'] * new_mt['ratio_price_avg_comm'] * new_mt['ratio_price_avg_type']
  new_mt['ratio_price_sqft_mult'] = new_mt['ratio_price_sqft_avg'] * new_mt['ratio_price_sqft_avg_comm'] * new_mt['ratio_price_sqft_avg_type']
  new_mt['ratio_price_avg_avg'] = (new_mt['ratio_price_avg'] + new_mt['ratio_price_avg_comm'] + new_mt['ratio_price_avg_type']) / 3
  new_mt['ratio_price_sqft_avg_avg'] = (new_mt['ratio_price_sqft_avg'] + new_mt['ratio_price_sqft_avg_comm'] + new_mt['ratio_price_sqft_avg_type']) / 3

  return new_mt

In [0]:
def add_sr_features(config, mt):
  """Adds Service Requests features to master table. The number of rows is always unaltered.

  The processed service requests are obtained from get_processed_srs(config) and joined to the master table by tenant, unit and date. Missing values in the columns whose name starts with 'n_SR' are replaced by 0.

  Parameters:
  config: configuration dictionary, as defined in get_config.
  mt: master table to which Service Requests features are added. It is joined through its 'htent', 'hunit' and 'id_date' columns.

  Returns:
  Data frame with new columns related to Service Requests features.

  Raises:
  AssertionError: if joining the Service Requests features changes the number of rows of the master table.

  """

  processed_sr = get_processed_srs(config)
  new_mt = pd.merge(
    left = mt, right = processed_sr,
    left_on = ['htent', 'hunit', 'id_date'],
    right_on = ['htenant', 'hunit', 'id_date'],
    how = 'left')
  assert new_mt.shape[0] == mt.shape[0], 'The number of rows of the master table with the SR features is not correct'

  # Rows without any matching service request get 0 in the 'n_SR*' columns.
  # Assigning the result (instead of calling fillna in place on each column)
  # keeps this working when pandas Copy-on-Write is enabled.
  sr_cols = [col for col in new_mt.columns if col.startswith('n_SR')]
  if sr_cols:
    new_mt[sr_cols] = new_mt[sr_cols].fillna(0)

  return new_mt