In [1]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

In [2]:
from property_data_generation import generate_user_listing_interactions

In [3]:
df = generate_user_listing_interactions(
    max_number_of_interactions=10,
    number_of_listings=70000,
    number_of_users=100000
)

In [4]:
df = pd.get_dummies(df, columns=['land_reg_type', 'listing_status', 'transaction_type'], dtype=np.int8)

In [5]:
smart_tags = [
    'balcony',
    'bath',
    'conservatory',
    'cottage',
    'driveway',
    'en_suite',
    'garage',
    'garden',
    'kitchen_island',
    'patio',
    'period_property',
    'new_home'
]

property_features = [
    "price",
    "num_beds",
    "num_baths",
    'land_reg_type_detached',
    'land_reg_type_flat',
    'land_reg_type_semi-detached',
    'land_reg_type_terraced',
    'postcode_latitude',
    'postcode_longitude'
]

listing_status = [
    'listing_status_for-sale',
    'listing_status_rent-under-offer',
    'listing_status_rented',
    'listing_status_sale-under-offer',
    'listing_status_sold',
    'listing_status_to-rent',
    'transaction_type_sale'
]


In [6]:
user_profiles = pd.merge(
    df.groupby(by=['anonymous_id'])[property_features].mean().reset_index(),
    df.groupby(by=['anonymous_id'])[smart_tags].mean().reset_index(),
    on=['anonymous_id']
)

user_profiles = pd.merge(
    user_profiles,
    df.groupby(by=['anonymous_id'])[listing_status].mean().reset_index(),
    on=['anonymous_id']
)


In [7]:
rename_columns = {
    "price": "user_preferred_price",
    "postcode_latitude": "user_preferred_latitude",
    "postcode_longitude": "user_preferred_longitude",
    "num_beds": "user_preferred_num_beds",
    "num_baths": "user_preferred_num_baths",
    "land_reg_type_detached": "user_preferred_detached",
    "land_reg_type_flat": "user_preferred_flat",
    "land_reg_type_semi-detached": "user_preferred_semi_detached",
    "land_reg_type_terraced": "user_preferred_terraced",
    "listing_status_for-sale": "user_preferred_for_sale",
    "listing_status_rent-under-offer": "user_preferred_rent_under_offer",
    "listing_status_sale-under-offer": "user_preferred_sale_under_offer",
    "listing_status_to-rent": "user_preferred_to_rent",
    "listing_status_sold": "user_preferred_sold",
    "listing_status_rented": "user_preferred_rented",
    "transaction_type_sale": "user_preferred_sale"
} | {smart_tag: f"user_preferred_{smart_tag}" for smart_tag in smart_tags}


user_profiles = user_profiles.rename(
    columns=rename_columns
)

In [8]:
df_modelling = pd.merge(
    df,
    user_profiles,
    on=['anonymous_id']
)

In [9]:
print(len(df), len(df_modelling))

550893 550893


In [10]:
df_modelling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550893 entries, 0 to 550892
Data columns (total 66 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   anonymous_id                     550893 non-null  object 
 1   listing_id                       550893 non-null  object 
 2   sent_lead                        550893 non-null  int64  
 3   listing_saved                    550893 non-null  int64  
 4   time_spent_on_listing            550893 non-null  float64
 5   number_of_times_property_viewed  550893 non-null  int64  
 6   lsrp_click                       550893 non-null  int64  
 7   lsrp_view                        550893 non-null  int64  
 8   price                            550893 non-null  int64  
 9   num_beds                         550893 non-null  int64  
 10  num_baths                        550893 non-null  int64  
 11  postcode                         550893 non-null  object 
 12  po

In [17]:
features = [
    'listing_saved',
    'time_spent_on_listing', 
    'number_of_times_property_viewed',
    'lsrp_click', 
    'lsrp_view', 
    'price', 
    'num_beds',
    'num_baths', 
    'postcode_latitude', 
    'postcode_longitude', 
    'balcony', 
    'bath', 
    'conservatory', 
    'cottage',
    'driveway', 
    'en_suite', 
    'garage', 
    'garden', 
    'kitchen_island', 
    'patio',
    'period_property', 
    'new_home', 
    'land_reg_type_detached',
    'land_reg_type_flat', 
    'land_reg_type_semi-detached',
    'land_reg_type_terraced', 
    'listing_status_for-sale',
    'listing_status_rent-under-offer', 
    'listing_status_rented',
    'listing_status_sale-under-offer', 
    'listing_status_sold',
    'listing_status_to-rent', 
    'user_preferred_price',
    'user_preferred_latitude',
    'user_preferred_longitude',
    'user_preferred_num_beds', 
    'user_preferred_num_baths',
    'user_preferred_detached', 
    'user_preferred_flat',
    'user_preferred_semi_detached', 
    'user_preferred_terraced',
    'user_preferred_balcony', 
    'user_preferred_bath',
    'user_preferred_conservatory', 
    'user_preferred_cottage',
    'user_preferred_driveway', 
    'user_preferred_en_suite',
    'user_preferred_garage', 
    'user_preferred_garden',
    'user_preferred_kitchen_island', 
    'user_preferred_patio',
    'user_preferred_period_property', 
    'user_preferred_new_home',
    'user_preferred_for_sale', 
    'user_preferred_rent_under_offer',
    'user_preferred_rented', 
    'user_preferred_sale_under_offer',
    'user_preferred_sold', 
    'user_preferred_to_rent'
]

target_label = "sent_lead"

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report, confusion_matrix, f1_score
import xgboost as xgb
import shap

Need to have a function that can split the interactions based on user. Or do I? 
- The idea is that it introduces a bias, but isn't having users the model has seen before a part of it? 

In [19]:
anonymous_ids = df_modelling['anonymous_id'].unique()

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df_modelling[features], df_modelling[target_label], test_size=0.2, random_state=42)

In [60]:
hyperparameters = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
    "objective": "binary:logistic"
}

xgb_clf = xgb.XGBClassifier(**hyperparameters)

In [63]:
scores = cross_val_score(xgb_clf, X_train, y_train, cv=10, scoring='f1')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
scores

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [41]:
xgb_clf.fit(X_train, y_train)

In [62]:
print(classification_report(y_pred=y_pred, y_true=y_train))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    434287
           1       1.00      0.00      0.01      6427

    accuracy                           0.99    440714
   macro avg       0.99      0.50      0.50    440714
weighted avg       0.99      0.99      0.98    440714



In [49]:
X_train['listing_saved'].value_counts()

listing_saved
0    429885
1     10829
Name: count, dtype: int64

In [52]:
y_train.value_counts(normalize=True)

sent_lead
0    0.985417
1    0.014583
Name: proportion, dtype: float64

In [42]:
y_pred = xgb_clf.predict(X_train)

In [46]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
scores

array([0., 0., 0., 0., 0.])

In [24]:
np.mean(scores)

np.float64(0.9854145772693285)

In [None]:

xgb_clf.fit(X_train, y_train)

In [None]:
y_pred = xgb_reg.predict_proba(X_train)

In [None]:
classification_thresholds = [n/10 for n in range(1, 10)]

In [None]:
f1_scores = []
for classification_threshold in classification_thresholds:
    f1_scores.append((classification_threshold, f1_score(y_train, (y_pred[:, 1] > classification_threshold))))

In [None]:
auc_score = roc_auc_score(y_train, y_pred[:, 1])

In [None]:
fpr, tpr, thresholds = roc_curve(y_train, y_pred[:, 1])

In [None]:
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % auc_score,
)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()