In [1]:
import pickle
import random
import numpy as np
import pandas as pd

from yaml import CLoader as Loader, load
from datasketch import MinHash, MinHashLSHForest

from source.model import LSHGraph
from source.score import score_fn, get_extn
from source.utils import feat_imp, count_fn
from source.utils import conv_values, flatten_list

## Data Cleaning

In [3]:
# Load the pipeline configuration from config.yaml; the path is relative,
# so it is resolved against the kernel's current working directory.
# NOTE(review): `Loader` is PyYAML's CLoader, which is not a safe loader and
# can construct arbitrary Python objects from tagged YAML. That is acceptable
# for a trusted local config; prefer CSafeLoader if the file could ever come
# from an untrusted source.
with open("config.yaml") as stream:
    config = load(stream, Loader=Loader)

In [4]:
# Unpack the configuration into notebook-level names. Every key is required:
# a missing key raises KeyError at the first offending line.
label = config["label"]  # label column; dropped before training, averaged for the click rate
n_perm = config["n_perm"]  # passed as num_perm to MinHashLSHForest and as n_perm to LSHGraph
thresh = config["thresh"]  # maximum number of elements allowed in a list column
id_col = config["id_col"]  # identifier column name handed to LSHGraph
features = config["features"]  # feature columns used for cleaning, training and scoring
data_path = config["data_path"]  # raw input data (JSON)
seed_path = config["seed_path"]  # seed set (CSV)
extn_path = config["extn_path"]  # path passed to get_extn for the extension file
list_cols = config["list_cols"]  # features whose cells are kept as lists
model_path = config["model_path"]  # pickled LSHGraph
count_path = config["count_path"]  # feature counts (CSV) used for scoring
clean_data_path = config["clean_data_path"]  # cleaned data (JSON)

In [5]:
# Read the raw records from the JSON file at `data_path` into a DataFrame
data = pd.read_json(data_path)

In [6]:
# Discard whatever index the JSON file produced and renumber rows from 0
data = data.reset_index(drop=True)

In [7]:
# Collapse the non-list features to scalars: for every feature that is not in
# `list_cols`, keep only the first element of each list cell.
# Cells that are not lists (e.g. missing values) become None, and so do empty
# lists; the emptiness guard avoids an IndexError on `[]`.
for c in features:
    if c not in list_cols:
        data[c] = data[c].apply(
            lambda x: x[0] if isinstance(x, list) and x else None
        )

In [8]:
# Show (rows, columns) before the list-length filter is applied
data.shape

(200000, 11)

In [9]:
# Drop rows in which any list column holds more than `thresh` elements.
# Non-list cells (e.g. missing values) count as length 0 and are kept.
# The lengths are folded into a boolean mask instead of a temporary "count"
# column, so a real column named "count" is never overwritten, an empty
# `list_cols` is a no-op rather than a KeyError, and no column is assigned
# on a filtered slice (which triggers SettingWithCopyWarning).
within_thresh = np.ones(len(data), dtype=bool)
for c in list_cols:
    n_items = data[c].apply(lambda x: len(x) if isinstance(x, list) else 0)
    # Positional (numpy) mask: no index alignment is involved.
    within_thresh &= (n_items <= thresh).to_numpy(dtype=bool)
# .copy() detaches the result from the unfiltered frame so that later
# column assignments operate on an independent DataFrame.
data = data[within_thresh].copy()

In [10]:
# Show (rows, columns) after the list-length filter
data.shape

(106546, 11)

In [11]:
# Normalise the list columns: each list cell is replaced by a sorted copy,
# and any cell that is not a list is replaced by an empty list.
def _sorted_or_empty(cell):
    """Return the elements of `cell` in sorted order if it is a list, else `[]`."""
    if type(cell) is list:
        return sorted(cell)
    return []


for c in list_cols:
    data[c] = data[c].apply(_sorted_or_empty)

In [12]:
# Move the current index into a regular column and name that column "id".
# Run this cell once per session: a second run would add another "id" column.
data = data.reset_index().rename(columns={"index": "id"})

In [13]:
# Write the cleaned data to disk as JSON; the later sections read it back
# from `clean_data_path`.
data.to_json(clean_data_path)

In [14]:
# Calculate feature counts for scoring and save them as CSV (without the
# index column). The same `count_path` is later handed to score_fn.
# NOTE(review): the layout of count_df is defined by source.utils.count_fn,
# which is not shown here.
count_df = count_fn(data, features, list_cols)
count_df.to_csv(count_path, index=False)

## Model training

In [15]:
# Reload the cleaned records from disk, then derive the training frame by
# removing the label column; `df` is what the LSH graph is built from.
data = pd.read_json(clean_data_path)
df = data.drop(columns=label)

In [16]:
# Create a MinHash LSH Forest index configured with `n_perm` permutations
lsh = MinHashLSHForest(num_perm=n_perm)
# Create the LSH graph object from the label-free frame, the forest, and the
# feature list. The same `n_perm` is passed again so both objects agree.
# NOTE(review): LSHGraph lives in source.model; how it uses `id_col` and
# `n_perm` is not visible from this notebook.
lsh_graph = LSHGraph(df, lsh, features,
                     id_col=id_col,
                     n_perm=n_perm)

In [17]:
# Train the model. The call takes no arguments and its return value is not
# used here; the captured output shows a progress line every 5000 records.
lsh_graph.update_graph()

Processing 0 of 106546
Processing 5000 of 106546
Processing 10000 of 106546
Processing 15000 of 106546
Processing 20000 of 106546
Processing 25000 of 106546
Processing 30000 of 106546
Processing 35000 of 106546
Processing 40000 of 106546
Processing 45000 of 106546
Processing 50000 of 106546
Processing 55000 of 106546
Processing 60000 of 106546
Processing 65000 of 106546
Processing 70000 of 106546
Processing 75000 of 106546
Processing 80000 of 106546
Processing 85000 of 106546
Processing 90000 of 106546
Processing 95000 of 106546
Processing 100000 of 106546
Processing 105000 of 106546


In [19]:
# Persist the trained LSH graph to disk as a pickle at `model_path`
with open(model_path, "wb") as model_file:
    pickle.dump(lsh_graph, model_file)

## Seed set extension

In [22]:
# Read the cleaned data back from disk. This rebinds `data` to the version
# that includes the label column.
data = pd.read_json(clean_data_path)

In [23]:
# Read the seed set from CSV and collect its "id" column as a plain list
# NOTE(review): the column name "id" is hard-coded here while the model is
# built with the configurable `id_col`; confirm the two refer to the same column.
seed = pd.read_csv(seed_path)
seed_ids = list(seed["id"])

In [24]:
# Load the trained model from disk. A context manager is used so the file
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this pipeline.
with open(model_path, "rb") as f:
    lsh_graph = pickle.load(f)

In [25]:
# Retrieve the neighbors of the seed set from the LSH graph
# NOTE(review): the structure of `neighbors` is defined by
# LSHGraph.extract_neighbors, which is not shown; it is passed on to score_fn.
neighbors = lsh_graph.extract_neighbors(seed_ids)

In [26]:
# Keep only the records whose "id" does not appear in the seed set
in_seed = data["id"].isin(seed_ids)
df = data[~in_seed]

In [27]:
# Baseline click rate: mean of the label column over `df`, expressed as a
# percentage and rounded to two decimal places
def_click_rate = round(df[label].mean() * 100, 2)

In [29]:
# Score the neighbors. score_fn receives the full `data` frame (not the
# non-seed subset), and its result rebinds `df`, replacing the frame used
# for the baseline click rate.
# NOTE(review): the scoring logic and the returned columns are defined in
# source.score and are not visible here.
df = score_fn(data, count_path, features, list_cols,
              seed_ids, neighbors, label)

In [30]:
# Create and store the extension file at `extn_path`; the return value is
# reported below as the click rate of the extension.
# NOTE(review): the meaning of `x=2` is defined by source.score.get_extn and
# is not visible here; consider moving it into config.yaml with the other
# tunables.
extn_click_rate = get_extn(df, seed_ids, label, extn_path, x=2)

In [31]:
# Report the baseline click rate next to the extension's click rate.
# NOTE(review): the message always says "increased", even if the extension
# rate turns out lower than the baseline.
print(f"Click rate increased from {def_click_rate}% to {extn_click_rate}%")

Click rate increased from 9.24% to 13.82%
