In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import flair
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings, DocumentPoolEmbeddings
import catboost

import sklearn
from sklearn.model_selection import train_test_split as TTsplit, KFold
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from sklearn.preprocessing import scale

from sklearn.linear_model import HuberRegressor, SGDRegressor

In [2]:
data = pd.read_csv('cleaned_data.csv')

In [3]:
data.sample(n=10)

Unnamed: 0,Date,Market,Keyword,CPC,Clicks,CTR,Impressions,Cost,AveragePosition,Year,Month
322536,20121218,UK-Market,saturn sky,-4.058894,-0.958607,0.5,1.342423,-2.0,1.0,2012,12
227921,20130203,US-Market,zero percent credit cards,1.049631,1.110926,4.9,2.423246,1.426511,1.0,2013,2
140071,20120913,US-Market,stock quotes,3.564378,1.787673,0.4,4.151676,2.860578,1.0,2012,9
353048,20130205,UK-Market,trade stock,4.313246,-0.19382,2.9,1.342423,1.107549,1.0,2013,2
108305,20130208,US-Market,buy gift card,2.811471,1.396722,2.3,3.041787,2.243311,1.0,2013,2
95263,20130108,US-Market,lawyer,2.604071,3.530455,2.5,5.129313,4.314475,1.0,2013,1
201962,20121223,US-Market,auto insurance companies,6.24184,2.212427,2.5,3.813581,4.091437,1.0,2012,12
97935,20130114,US-Market,slumdog millionaire,-3.184425,0.322219,0.4,2.737193,-0.619789,1.0,2013,1
322402,20121218,UK-Market,gmac mortgage,-3.643856,-0.920819,0.5,1.380211,-2.0,1.0,2012,12
165844,20121024,US-Market,visa credit cards,5.839204,1.774225,3.0,3.297104,3.532013,1.0,2012,10


We do a few more data transformations

In [79]:
data['Keyword'] = data.Keyword.apply(lambda x: x.lower())

In [80]:
data['Market'] = data.Market.map({'US-Market':1, 'UK-Market':0})

In [12]:
def perform_cross_validation(model, train_df, target_df, k_folds=5, fit_params=None):
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=17)
    scores = []
    r2s = []
    errors = []
    i = 1

    for train_indices, val_indices in tqdm(list(kf.split(train_df, target_df))):
        print("Training on fold " + str(i) + f" of {k_folds}...", end='')
        i += 1
        
        if not fit_params:
            model.fit(train_df.iloc[train_indices], target_df.iloc[train_indices])
        else:
            model.fit(train_df.iloc[train_indices], target_df.iloc[train_indices], **fit_params)
        print(" Done.")
        predicted_value = model.predict(train_df.iloc[val_indices])
        actual_value = target_df.iloc[val_indices]
        scores.append(explained_variance_score(actual_value, predicted_value))
        r2s.append(r2_score(actual_value, predicted_value))
        errors.append(mean_absolute_error(actual_value, predicted_value))
        
    scores = np.array(scores)
    r2s = np.array(r2s)
    errors = np.array(errors)
    print(f"Results:\nscores: {scores.mean()} +/- {scores.std()}")
    print(f"r-squared: {r2s.mean()} +/- {r2s.std()}")
    print(f"MAE: {errors.mean()} +/- {errors.std()}")
    return scores, r2s, errors

# Computing ELMo word embeddings

In the first half of 2018, the ELMo algorithm ([whitepaper](https://arxiv.org/pdf/1802.05365.pdf)) was state-of-the-art in Natural Language Processing problems as it introduced a new concept called deep contextualized word representations in which the vector embedding of each word is not just a function of the word itself with respect to the entire vocabulary, but is also a function of the sentence in which it appears in. Thus ELMo is able to model situations in which words have different meanings when used in a different context. It is a major improvement over the popular word2vec and GloVe algorithms for generating word embeddings

We will use the ELMoEmbedding function provided in the Flair library. ELMo uses a recurrent neural network architecture thus it builds on top of PyTorch

In [27]:
# init embedding model
elmo_small = ELMoEmbeddings('small')

In [30]:
document_embedding = DocumentPoolEmbeddings([elmo_small])

In [6]:
def compute_elmo_embedding(keyword):
    sentence = Sentence(keyword)
    document_embedding.embed(sentence)
    return sentence.get_embedding().detach().cpu().numpy()

Generating embedding vectors of dimension 768 for the dataset of 203k keywords takes a while.

In [51]:
vectors = []
for keyword in tqdm(list(data.Keyword.values)):
    vectors.append(compute_elmo_embedding(keyword))

HBox(children=(IntProgress(value=0, max=203643), HTML(value='')))




In [52]:
train = data[['Market', 'CPC', 'Year', 'Month']].copy()

In [53]:
vectors = pd.DataFrame.from_records(np.array(vectors),index=train.index)

In [54]:
train = pd.concat([train, vectors], axis=1)

In [55]:
train.shape

(203643, 772)

We save the embeddings along with the relevant predictor features so it is faster to iterate when experimenting with machine learning models

In [56]:
train.to_csv('embeddings_elmo.csv', index_label=False)

In [7]:
train = pd.read_csv('embeddings_elmo.csv')

# Training Regressors

We choose to use the gradient boosted tree library named CatBoost from Yandex. It has been shown to improve upon the very popular XGBoost and LightGBM libraries and performs well for categorical variables. In our dataset we have Market, Year and Month as categorical variables. I have done some hyperparameter tuning beforehand and below are values I chose.

In [10]:
cbmodel = catboost.CatBoostRegressor(task_type='GPU', depth=16, grow_policy='Lossguide', max_leaves=63)

We need to train a separate regressor for each target numerical feature. As discussed in the EDA notebook, we will be predicting CTR, Clicks and AveragePosition then we will compute the Impressions and Cost. First we perform 5-fold cross-validation and compute the explained variance score, the $R^2$ value, and mean absolute error for each regressor. The closer to 1.0 the explained variance is, the better the model is.

The sklearn docs has a detailed breakdown of regression metrics [here](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics)

In [13]:
perform_cross_validation(cbmodel, train, data.CTR, fit_params={'cat_features':[0,2,3], 'verbose':False});

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Training on fold 1 of 5... Done.
Training on fold 2 of 5... Done.
Training on fold 3 of 5... Done.
Training on fold 4 of 5... Done.
Training on fold 5 of 5... Done.

Results:
scores: 0.8689546296236248 +/- 0.0006391622327423346
r-squared: 0.8689293730513373 +/- 0.0006353664800213117
MAE: 0.47718850367210497 +/- 0.0017195953631911546


In [14]:
perform_cross_validation(cbmodel, train, data.Clicks, fit_params={'cat_features':[0,2,3], 'verbose':False});

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Training on fold 1 of 5... Done.
Training on fold 2 of 5... Done.
Training on fold 3 of 5... Done.
Training on fold 4 of 5... Done.
Training on fold 5 of 5... Done.

Results:
scores: 0.9370096213708639 +/- 0.0015850676153734565
r-squared: 0.9370046921952836 +/- 0.001585706440868557
MAE: 0.1754057206881581 +/- 0.0009201017426284919


In [15]:
perform_cross_validation(cbmodel, train, data.AveragePosition, fit_params={'cat_features':[0,2,3], 'verbose':False});

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Training on fold 1 of 5... Done.
Training on fold 2 of 5... Done.
Training on fold 3 of 5... Done.
Training on fold 4 of 5... Done.
Training on fold 5 of 5... Done.

Results:
scores: 0.7149192538376179 +/- 0.020763505663724192
r-squared: 0.7149038956299435 +/- 0.020753352038012464
MAE: 0.03680639801468201 +/- 0.0008204893030684343


After selection of hyperparameters, we will train regressors over the entire dataset and prepare a wrapper function that will take the predictor features and compute all target features

In [17]:
ctr_predictor = catboost.CatBoostRegressor(task_type='GPU', depth=16, grow_policy='Lossguide', max_leaves=63)
click_predictor = catboost.CatBoostRegressor(task_type='GPU', depth=16, grow_policy='Lossguide', max_leaves=63)
ap_predictor = catboost.CatBoostRegressor(task_type='GPU', depth=16, grow_policy='Lossguide', max_leaves=63)

In [19]:
ctr_predictor.fit(train,data.CTR, cat_features=[0,2,3], verbose=False)

<catboost.core.CatBoostRegressor at 0x7f6d6663e780>

In [20]:
click_predictor.fit(train,data.Clicks, cat_features=[0,2,3], verbose=False)

<catboost.core.CatBoostRegressor at 0x7f6d6663e400>

In [21]:
ap_predictor.fit(train,data.AveragePosition, cat_features=[0,2,3], verbose=False)

<catboost.core.CatBoostRegressor at 0x7f6d6663e550>

In [40]:
def Predictor(Date, Market, Keyword, CPC):
    # NOTE: this function only takes a single datapoint at a time
    # Each input must match the data type of the corresponding column in the original dataset
    year = int(Date[:4])
    month = int(Date[4:6])
    market = 1 if Market == 'US-Market' else 0
    cpc = np.log2(CPC)
    keyword = Keyword.lower()
    vector = list(compute_elmo_embedding(keyword))
    input_vector = [market, cpc, year, month, *vector]
    ctr = ctr_predictor.predict(input_vector)
    clicks = click_predictor.predict(input_vector)
    averageposition = ap_predictor.predict(input_vector)
    impressions = clicks/ctr if ctr != 0.0 else 999999
    cost = clicks*cpc
    return ctr, 10**clicks, 10**impressions, 10**cost, averageposition

Now we can go ahead and try out our predictor function

In [44]:
Predictor('20120524', 'US-Market', 'agile management software', 1.2)

(9.16275183499718,
 15.302697773917824,
 1.3467985492694117,
 2.0494378241038187,
 1.0012287559463449)

In [37]:
raw_data = pd.read_csv('dataset.csv')

In [38]:
raw_data.head()

Unnamed: 0,Date,Market,Keyword,Average.Position,CPC,Clicks,CTR,Impressions,Cost
0,20120524,US-Market,secure online back up,0.0,0.0,0.0,0.00%,0.0,0.0
1,20120524,US-Market,agile management software,1.0,1.2,21.22,8.20%,260.0,25.45
2,20120524,US-Market,crm for financial,0.0,0.0,0.0,0.00%,0.0,0.0
3,20120524,US-Market,disaster recovery planning for it,0.0,0.0,0.0,0.00%,0.0,0.0
4,20120524,US-Market,tracking a vehicle,0.0,0.0,0.0,0.00%,0.0,0.0
