In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

In [3]:
# hours = []
#Get first 24 hours worth of data on day 1
# for i in range(24): 
#     hours.append(pd.read_json("./data/combined1_" + str(i), lines=True))
# hour0 = pd.read_json("./combined1_0", lines=True)
# hour1 = pd.read_json("./combined1_1", lines=True)
# hour19 = pd.read_json("./combined1_19", lines=True)

In [4]:
# table = pd.concat(hours)

In [5]:
# table.to_pickle("Day1")
table = pd.read_pickle("./data/Day1")

In [5]:
# Read in file chunksize at a time to prevent kernel from dying, read_json will return an iterator
# table = pd.read_json("./data/combined_all_days", chunksize=2500000, lines=True)
# table = pd.concat(table)

## Data Exploration

In [6]:
table.shape

(4783760, 42)

In [7]:
table.head()

Unnamed: 0,ad_network_id,ad_type,advertiser_id,bid_requests,bid_responses,c_cnt,c_timestamp,c_txn_fee,c_txn_rate,campaign_id,...,txn_fee,txn_rate,ua_device,ua_device_type,ua_name,ua_os_name,vi_cnt,vi_timestamp,vv_cnt,zone_id
0,1839,[story],4523,[],[],0.0,,,,26027,...,,,Samsung SM-G930V,MOB,Chrome Mobile,Android,0.0,,0.0,26024
1,977,[story],2917,[],[],,,,,25990,...,,,iPad,TAB,Chrome Mobile iOS,iOS,,,,26024
2,1828,[story],4514,[],[],0.0,,,,22360,...,,,QTAIR7,TAB,Chrome,Android,0.0,,0.0,17681
3,1839,[story],4523,[],[],0.0,,,,26027,...,,,iPhone,MOB,Mobile Safari,iOS,0.0,,0.0,26024
4,727,[story],5152,[],[],,,,,26038,...,,,iPad,TAB,Mobile Safari UI/WKWebView,iOS,,,,14414


In [8]:
filtered = table[(~np.isnan(table["c_cnt"]))]
filtered.dropna(subset=["i_timestamp", "ua_device_type"], inplace=True)
filtered.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


(3784791, 42)

In [9]:
size = filtered.shape[0]
size

3784791

In [10]:
filtered.c_cnt.value_counts()

0.0    3782400
1.0       2391
Name: c_cnt, dtype: int64

In [11]:
CLICKED_CNT = filtered.c_cnt.value_counts()[1]
CLICKED_CNT

2391

In [12]:
#Find average ctr rate of each advertiser
def score_advertisers(table, score_func):
    click_counts = table[["advertiser_id", "c_cnt"]]
    clean_ccnt = click_counts.dropna()
    grouped = clean_ccnt.groupby("advertiser_id")
    score_means = lambda x: score_func(np.mean(x))
    agg_scored = grouped.agg({'c_cnt': score_means})
    return agg_scored

click_rates = score_advertisers(filtered, lambda x: x).reset_index()

In [13]:
click_rates.rename(columns={"c_cnt": "ctr_rate"}, inplace=True)
filtered = pd.merge(filtered, click_rates, on="advertiser_id")
filtered.head()

Unnamed: 0,ad_network_id,ad_type,advertiser_id,bid_requests,bid_responses,c_cnt,c_timestamp,c_txn_fee,c_txn_rate,campaign_id,...,txn_rate,ua_device,ua_device_type,ua_name,ua_os_name,vi_cnt,vi_timestamp,vv_cnt,zone_id,ctr_rate
0,1839,[story],4523,[],[],0.0,,,,26027,...,,Samsung SM-G930V,MOB,Chrome Mobile,Android,0.0,,0.0,26024,0.0001
1,1839,[story],4523,[],[],0.0,,,,26027,...,,iPhone,MOB,Mobile Safari,iOS,0.0,,0.0,26024,0.0001
2,1839,[story],4523,[],[],0.0,,,,26027,...,,Other,PC,Chrome,Windows 8.1,0.0,,0.0,26024,0.0001
3,1839,[story],4523,[],[],0.0,,,,26027,...,,iPhone,MOB,Mobile Safari,iOS,0.0,,0.0,26024,0.0001
4,1839,[story],4523,[],[],0.0,,,,26027,...,,Other,PC,Chrome,Windows 7,0.0,,0.0,26024,0.0001


In [14]:
filtered["ua_device_type"].value_counts()

MOB    1935685
PC     1233963
TAB     612025
BOT       3118
Name: ua_device_type, dtype: int64

In [15]:
filtered.campaign_type.value_counts()

private    3609772
outside     175019
Name: campaign_type, dtype: int64

In [16]:
filtered.columns

Index(['ad_network_id', 'ad_type', 'advertiser_id', 'bid_requests',
       'bid_responses', 'c_cnt', 'c_timestamp', 'c_txn_fee', 'c_txn_rate',
       'campaign_id', 'campaign_type', 'cr_cnt', 'creative_id', 'exp_mode',
       'f_cnt', 'geo_continent_code', 'geo_country_code2', 'geo_dma_code',
       'geo_region_name', 'geo_timezone', 'i_cnt', 'i_timestamp',
       'pub_network_id', 'r_cnt', 'r_num_ads_requested', 'r_num_ads_returned',
       'r_num_ads_third_party', 'r_timestamp', 'rate_metric', 'session_id',
       'site_id', 'token', 'txn_fee', 'txn_rate', 'ua_device',
       'ua_device_type', 'ua_name', 'ua_os_name', 'vi_cnt', 'vi_timestamp',
       'vv_cnt', 'zone_id', 'ctr_rate'],
      dtype='object')

## Feature Engineering and Model Creation

In [37]:
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [18]:
# Shuffle data
filtered = shuffle(filtered)

In [19]:
def generate_sample(n):
    """Return X matrix with n no click samples (undersampling)"""
    clicked = filtered[filtered["c_cnt"] == 1]
    no_click = filtered[filtered["c_cnt"] == 0].sample(n)
    return shuffle(pd.concat([clicked, no_click]))

def generate_subset(n, extra):
    """Returns random subset of original matrix with n/2 click samples and (n/2 + extra) nonclicks"""
    clicked = filtered[filtered["c_cnt"] == 1].reset_index(drop=True)
    no_click = filtered[filtered["c_cnt"] == 0].reset_index(drop=True)
    click_idx = np.random.randint(n, size=int(n/2))
    no_click_idx = np.random.randint(no_click.shape[0], size=int(n/2 + extra))
    return (clicked.drop(click_idx), no_click.drop(no_click_idx), 
            shuffle(pd.concat([clicked.iloc[click_idx, :], no_click.iloc[no_click_idx, :]])))

In [20]:
# Create X and Y matrix for model
click_filtered, no_click_filtered, sample = generate_subset(CLICKED_CNT, 1500)

In [21]:
X_data = pd.DataFrame()
y_data = sample["c_cnt"]
y_data.shape

(3890,)

In [22]:
sample["ua_device_type"].value_counts()

MOB    1809
PC     1225
TAB     855
BOT       1
Name: ua_device_type, dtype: int64

In [23]:
# One hot encode device type feature
le_device = preprocessing.LabelBinarizer()
transformed_device = le_device.fit_transform(sample["ua_device_type"])

# One hot encode continent code 
le_cont = preprocessing.LabelBinarizer()
transformed_continent = le_cont.fit_transform(sample["geo_continent_code"])

In [24]:
def transform_hour(x):
    """Assigns hour to a bin"""
    if x >= 5 and x < 8: return 1 #early morning 
    elif x >= 8 and x < 11: return 2 #morning 
    elif x >= 11 and x < 14: return 3 #midday
    elif x >= 14 and x < 19: return 4 #afternoon
    elif x >= 19 and x < 22: return 5 #evening
    else: return 6 #night

In [25]:
def create_numerical_features(df, sample):
    """Add numerical features to dataframe df from sample dataframe"""
    df["i_cnt"] = sample["i_cnt"].tolist()
    df["r_cnt"] = sample["r_cnt"].tolist()
    df["campaign_type"] = sample["campaign_type"].apply(lambda x: 1 if x == "private" else 0).tolist()
    df["ctr_rate"] = sample["ctr_rate"].tolist()
    df["num_ads_requested"] = sample["r_num_ads_requested"].tolist()
    # Create hour feature
    hours = sample["i_timestamp"].str[11:13] # get hour of timestamp column
    # X_data["hour"] = np.sin(hours.astype(int)).tolist() #sine transform hour because it's cyclical
    df["hour"] = hours.astype(int).apply(transform_hour).tolist()

In [26]:
create_numerical_features(X_data, sample)

In [27]:
# Hash sparse features, allows for less sparse representation than one hot encoding
# More info can be found on https://en.wikipedia.org/wiki/Feature_hashing
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=30, input_type='string')

hashed_ad_id = h.transform(sample["advertiser_id"].astype(str)).toarray()
hashed_adnet_id = h.transform(sample["ad_network_id"].astype(str)).toarray()
hashed_country_code =  h.transform(sample["geo_country_code2"]).toarray()
hashed_os = h.transform(sample["ua_os_name"]).toarray()
hashed_site_id = h.transform(sample["site_id"].astype(str)).toarray()
hashed_ua_name = h.transform(sample["ua_name"].astype(str)).toarray()
hashed_campaign_id = h.transform(sample["campaign_id"].astype(str)).toarray()

In [28]:
# Combine all features
X_data = pd.concat([X_data, pd.DataFrame(transformed_continent), pd.DataFrame(hashed_ad_id),
                            pd.DataFrame(hashed_country_code), pd.DataFrame(hashed_adnet_id),
                            pd.DataFrame(hashed_os), pd.DataFrame(hashed_site_id),
                            pd.DataFrame(hashed_ua_name), pd.DataFrame(hashed_campaign_id),
                            pd.DataFrame(transformed_device)], 
                   axis=1)

In [29]:
X_data.shape

(3890, 226)

In [30]:
X_data.head()

Unnamed: 0,i_cnt,r_cnt,campaign_type,ctr_rate,num_ads_requested,hour,0,1,2,3,...,24,25,26,27,28,29,0.1,1.1,2.1,3.1
0,1.0,1,1,0.000414,10,6,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,1.0,1,1,0.000441,6,6,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1
2,1.0,1,0,0.007837,1,2,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
3,1.0,1,1,0.0001,4,6,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
4,1.0,1,1,0.000414,10,5,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0


### Bagging

In [31]:
bg = ensemble.BaggingClassifier()
# bg.fit(X_train, y_train)
# bg.score(X_test, y_test) #test accuracy
bg_scores = cross_val_score(bg, X_data, y_data, cv=5)
print("Bagging Classifier Avg Score:", np.mean(bg_scores))

Bagging Classifier Avg Score: 0.829820051414


### Boosting

In [32]:
# ADABOOST
adaboost = ensemble.AdaBoostClassifier()
ada_scores = cross_val_score(adaboost, X_data, y_data, cv=5)
print("Boosting Classifier Avg Score:", np.mean(ada_scores))

Boosting Classifier Avg Score: 0.843958868895


In [33]:
# XGBOOST
xgboost = ensemble.GradientBoostingClassifier()
xg_scores = cross_val_score(xgboost, X_data, y_data, cv=5)
print("Boosting Classifier Avg Score:", np.mean(xg_scores))

Boosting Classifier Avg Score: 0.843701799486


In [34]:
#Tune XGBoost parameters
parameters = {'max_depth':[3, 5, 7]}
clf = GridSearchCV(xgboost, parameters, cv=5)
clf.fit(X_data, y_data)
print("Optimal Tree Depth:", clf.best_params_)

Optimal Tree Depth: {'max_depth': 3}


In [87]:
# F1 score on this small test set
a,b,c,d = train_test_split(X_data, y_data, test_size=0.2)
xgboost.fit(a, c)
e = xgboost.predict(b)

confusion_test = confusion_matrix(d, e)

precision_test = confusion_test[1][1] / (confusion_test[1][1] + confusion_test[0][1])
recall_test = confusion_test[1][1] / (confusion_test[1][1] + confusion_test[1][0])
print("Precision:", precision_test, ", Recall:", recall_test)

print("F1 Score:", f1_score(d, e))

Precision: 0.778894472362 , Recall: 0.625
F1 Score: 0.693512304251


## Testing on original data

In [39]:
#Only keep rows that weren't part of training set
filtered_subset = shuffle(pd.concat([click_filtered, no_click_filtered]))

In [40]:
filtered_subset.shape

(3781148, 43)

In [41]:
X_test = pd.DataFrame()
y_test = filtered_subset.c_cnt

In [42]:
create_numerical_features(X_test, filtered_subset)

In [43]:
# One hot encode device type
transformed_device_test = le_device.transform(filtered_subset['ua_device_type'])

# One hot encode continent code
transformed_continent_test = le_cont.transform(filtered_subset["geo_continent_code"])

In [44]:
# Hash features
hashed_ad_id_test = h.transform(filtered_subset["advertiser_id"].astype(str)).toarray()
hashed_adnet_id_test = h.transform(filtered_subset["ad_network_id"].astype(str)).toarray()
hashed_country_code_test =  h.transform(filtered_subset["geo_country_code2"]).toarray()
hashed_os_test = h.transform(filtered_subset["ua_os_name"]).toarray()
hashed_site_id_test = h.transform(filtered_subset["site_id"].astype(str)).toarray()
hashed_ua_name_test = h.transform(filtered_subset["ua_name"].astype(str)).toarray()
hashed_campaign_id_test = h.transform(filtered_subset["campaign_id"].astype(str)).toarray()

In [45]:
# Combine all features
X_test = pd.concat([X_test, pd.DataFrame(transformed_continent_test), pd.DataFrame(hashed_ad_id_test),
                            pd.DataFrame(hashed_country_code_test), pd.DataFrame(hashed_adnet_id_test),
                            pd.DataFrame(hashed_os_test), pd.DataFrame(hashed_site_id_test),
                            pd.DataFrame(hashed_ua_name_test), pd.DataFrame(hashed_campaign_id_test),
                            pd.DataFrame(transformed_device_test)], 
                   axis=1)

In [46]:
X_test.head()

Unnamed: 0,i_cnt,r_cnt,campaign_type,ctr_rate,num_ads_requested,hour,0,1,2,3,...,24,25,26,27,28,29,0.1,1.1,2.1,3.1
0,1.0,1,1,0.0001,10,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,1.0,1,1,0.0001,10,4,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
2,1.0,1,1,0.000414,10,5,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
3,1.0,1,1,0.000395,10,6,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
4,1.0,1,1,0.0001,10,3,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0


In [47]:
xgboost.fit(X_data, y_data)
xgboost.score(X_test, y_test)

0.92719750721209537

In [48]:
y_pred = xgboost.predict(X_test)

In [49]:
confusion = confusion_matrix(y_test, y_pred) #00 - TN, 10 - FN, 01 - FP, 11 - TP
confusion

array([[3504965,  274740],
       [    537,     906]])

Our precision - TP/(TP + FP) - is very low, which means we are predicting a lot of non-clicks as clicks:

In [50]:
precision = confusion[1][1] / (confusion[1][1] + confusion[0][1])
recall = confusion[1][1] / (confusion[1][1] + confusion[1][0])
precision, recall

(0.0032868244052153849, 0.62785862785862789)

As a result, our F1 score isn't optimal either:

In [51]:
f1_score(y_test, y_pred)

0.0065394151337656841

# FFM (inspired by https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf)

In [52]:
import xlearn as xl
xl.hello() #xl only prints in terminal

We need to make a new dataframe with categorical features one-hot encoded instead of hashed:

In [53]:
sample.head()

Unnamed: 0,ad_network_id,ad_type,advertiser_id,bid_requests,bid_responses,c_cnt,c_timestamp,c_txn_fee,c_txn_rate,campaign_id,...,txn_rate,ua_device,ua_device_type,ua_name,ua_os_name,vi_cnt,vi_timestamp,vv_cnt,zone_id,ctr_rate
60244,1845,[story],5118,[],[],0.0,,,,26123,...,,iPhone,MOB,Mobile Safari UI/WKWebView,iOS,0.0,,0.0,26024,0.000414
530651,1828,[story],4514,[],[],0.0,,,,22360,...,,iPad,TAB,Mobile Safari,iOS,0.0,,0.0,21963,0.000441
307,727,[story],5152,[],[],1.0,2018-02-11T08:17:30.415000Z,0.0,0.5,26038,...,0.5,ONEPLUS A3003,MOB,Chrome Mobile,Android,1.0,2018-02-11T08:17:26.758000Z,0.0,14414,0.007837
1604749,1839,[story],4523,[],[],0.0,,,,26027,...,,Other,PC,Chrome,Windows 7,0.0,,0.0,17681,0.0001
3574445,1845,[story],5118,[],[],0.0,,,,26123,...,,Other,PC,IE,Windows 7,0.0,,0.0,26024,0.000414


In [54]:
sample_size = sample.shape[0]

In [55]:
# Split to help generate data for train and test ffm matrices
train_ffm_original = sample.iloc[:int(0.8*sample_size), :]
test_ffm_original = sample.iloc[int(0.8*sample_size):, :]

In [56]:
# These are the actual data that will be trained with and tested on
train_ffm_transformed = pd.DataFrame(data={'c_cnt':train_ffm_original["c_cnt"].tolist()})
test_ffm_transformed = pd.DataFrame(data={'c_cnt':test_ffm_original["c_cnt"].tolist()})

In [57]:
# Create numerical features
create_numerical_features(train_ffm_transformed, train_ffm_original)
create_numerical_features(test_ffm_transformed, test_ffm_original)

In [58]:
encoders = {} #cache encoders

In [59]:
# Create encoders and one hot encode features
max_categories = 30
def encode_feature(feature, sample):
    """Encode given feature from sample df and return"""
    if feature not in encoders:
        encoder = preprocessing.LabelBinarizer()
        encoder.fit(filtered[feature].value_counts()[:max_categories]
                             .index
                             .tolist()
                    )
        encoders[feature] = encoder
    else:
        encoder = encoders[feature]
    return encoder.transform(sample[feature])

ffm_train_encoded_device = le_device.transform(train_ffm_original["ua_device_type"])
ffm_train_encoded_ad_id = encode_feature("advertiser_id", train_ffm_original)
ffm_train_encoded_ad_network_id = encode_feature("ad_network_id", train_ffm_original)
ffm_train_encoded_geo_country_code = encode_feature("geo_country_code2", train_ffm_original)
ffm_train_encoded_ua_os_name = encode_feature("ua_os_name", train_ffm_original)
ffm_train_encoded_site_id = encode_feature("site_id", train_ffm_original)
ffm_train_encoded_continent = encode_feature("geo_continent_code", train_ffm_original)
ffm_train_encoded_ua_name = encode_feature("ua_name", train_ffm_original)
ffm_train_encoded_campaign_id = encode_feature("campaign_id", train_ffm_original)

ffm_test_encoded_device = le_device.transform(test_ffm_original["ua_device_type"])
ffm_test_encoded_ad_id = encode_feature("advertiser_id", test_ffm_original)
ffm_test_encoded_ad_network_id = encode_feature("ad_network_id", test_ffm_original)
ffm_test_encoded_geo_country_code = encode_feature("geo_country_code2", test_ffm_original)
ffm_test_encoded_ua_os_name = encode_feature("ua_os_name", test_ffm_original)
ffm_test_encoded_site_id = encode_feature("site_id", test_ffm_original)
ffm_test_encoded_continent = encode_feature("geo_continent_code", test_ffm_original)
ffm_test_encoded_ua_name = encode_feature("ua_name", test_ffm_original)
ffm_test_encoded_campaign_id = encode_feature("campaign_id", test_ffm_original)

Feature order is continent code, advertiser id, country code, ad network id, os, site id, ua_name, campaign id,
and ua_device type

In [61]:
feature_count = [] # Tracks number of columns per categorical feature, i.e. device_type has 4 columns
feature_count.append(ffm_train_encoded_continent.shape[1])
feature_count.append(ffm_train_encoded_ad_id.shape[1])
feature_count.append(ffm_train_encoded_geo_country_code.shape[1])
feature_count.append(ffm_train_encoded_ad_network_id.shape[1])
feature_count.append(ffm_train_encoded_ua_os_name.shape[1])
feature_count.append(ffm_train_encoded_site_id.shape[1])
feature_count.append(ffm_train_encoded_ua_name.shape[1])
feature_count.append(ffm_train_encoded_campaign_id.shape[1])
feature_count.append(ffm_train_encoded_device.shape[1])
feature_count

[7, 26, 30, 21, 30, 24, 30, 30, 4]

In [62]:
# Combined all encoded features
train_ffm_transformed = pd.concat([train_ffm_transformed,  pd.DataFrame(ffm_train_encoded_continent), 
                                   pd.DataFrame(ffm_train_encoded_ad_id),
                                   pd.DataFrame(ffm_train_encoded_geo_country_code),
                                   pd.DataFrame(ffm_train_encoded_ad_network_id),
                                   pd.DataFrame(ffm_train_encoded_ua_os_name),
                                   pd.DataFrame(ffm_train_encoded_site_id),
                                   pd.DataFrame(ffm_train_encoded_ua_name),
                                   pd.DataFrame(ffm_train_encoded_campaign_id),
                                   pd.DataFrame(ffm_train_encoded_device)], 
                                   axis=1)

In [63]:
test_ffm_transformed = pd.concat([test_ffm_transformed,  pd.DataFrame(ffm_test_encoded_continent), 
                                   pd.DataFrame(ffm_test_encoded_ad_id),
                                   pd.DataFrame(ffm_test_encoded_geo_country_code),
                                   pd.DataFrame(ffm_test_encoded_ad_network_id),
                                   pd.DataFrame(ffm_test_encoded_ua_os_name),
                                   pd.DataFrame(ffm_test_encoded_site_id),
                                   pd.DataFrame(ffm_test_encoded_ua_name),
                                   pd.DataFrame(ffm_test_encoded_campaign_id),
                                   pd.DataFrame(ffm_test_encoded_device)], 
                                   axis=1)

In [64]:
# Inspired by https://www.analyticsvidhya.com/blog/2018/01/factorization-machines/
def convert_to_ffm(df,data_type,numerics):
    nrows = df.shape[0]
    ncols = df.shape[1]
    with open("./ffm/" + str(data_type) + "_ffm.txt", "w") as text_file:
        # Looping over rows to convert each row to libffm format
        datastring = ""
        for r in range(nrows)[:1000000]: #first 1,000,000 rows for now
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow['c_cnt']))
            field_position = 0
            feature_count_index = 0
            for i in range(1, ncols):
                if field_position == feature_count[feature_count_index]:
                    feature_count_index += 1
                    field_position = 0
                # For numerical fields, we are creating a dummy field here
                if (i - 1) < len(numerics):
                    #we are dealing with a numeric feature
                    feat = numerics[i - 1]
                    datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[feat])
                else:
                    #categorical feature
                    if datarow[i] == 1:
                        datastring = datastring + " "+str(feature_count_index)+":"+ str(field_position)+":1"
                    field_position += 1
            datastring += '\n'
        text_file.write(datastring)

In [65]:
# Generate list of numeric feature names
numerics = train_ffm_transformed.columns[1:7].tolist()

In [66]:
# Rename columns to make it easier to transform to libffm format
ncolumns = train_ffm_transformed.columns.shape[0]
train_ffm_transformed.columns = ["c_cnt"] + numerics + list(range(7, ncolumns))
test_ffm_transformed.columns = ["c_cnt"] + numerics + list(range(7, ncolumns))

In [67]:
train_ffm_transformed.head()

Unnamed: 0,c_cnt,i_cnt,r_cnt,campaign_type,ctr_rate,num_ads_requested,hour,7,8,9,...,199,200,201,202,203,204,205,206,207,208
0,0.0,1.0,1,1,0.000414,10,6,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0.0,1.0,1,1,0.000441,6,6,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1.0,1.0,1,0,0.007837,1,2,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,1.0,1,1,0.0001,4,6,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,1.0,1,1,0.000414,10,5,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [68]:
convert_to_ffm(train_ffm_transformed, "train", numerics)
convert_to_ffm(test_ffm_transformed, "test", numerics)

Train FFM model for 15 epochs, achieved 84.4% accuracy, 0.41 log loss and 0.86 AUC score on test set:

In [78]:
# Training task
ffm_model = xl.create_ffm()  
ffm_model.setTrain("./ffm/train_ffm.txt")   # Training data
ffm_model.setValidate("./ffm/test_ffm.txt")  # Validation data
ffm_model.setSigmoid()

# param:
#  0. binary classification
#  1. learning rate : 0.2
#  2. regular lambda : 0.002
#  3. Metric for monitoring validation set performance (using accuracy rn)
#  4. Maximum number of Epochs
param = {'task':'binary', 'lr':0.2, 'lambda':0.002,'metric':'acc','epoch':15}

# Train model
ffm_model.fit(param, "./ffm/model.out")

Find F1 score of test data:

In [72]:
ffm_model.setTest("./ffm/test_ffm.txt")
ffm_model.setSign() #makes outputs either 0 or 1
ffm_model.predict("./ffm/model.out", "./ffm/output.txt")

In [88]:
y_pred_ffm = pd.read_csv('ffm/output.txt', header = None)
y_pred_ffm.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [76]:
confusion_ffm = confusion_matrix(test_ffm_transformed['c_cnt'], y_pred_ffm) #00 - TN, 10 - FN, 01 - FP, 11 - TP
print("Confusion Matrix:\n", confusion_ffm)

precision_ffm = confusion_ffm[1][1] / (confusion_ffm[1][1] + confusion_ffm[0][1])
recall_ffm = confusion_ffm[1][1] / (confusion_ffm[1][1] + confusion_ffm[1][0])
print("Precision:", precision_ffm, ", Recall:", recall_ffm)

print("F1 Score:", f1_score(test_ffm_transformed['c_cnt'], y_pred_ffm)) #better F1 score!

Confusion Matrix:
 [[507  20]
 [101 150]]
Precision: 0.882352941176 , Recall: 0.597609561753
F1 Score: 0.712589073634


### Test FFM on original data

In [214]:
combined = pd.concat([train_ffm_transformed, test_ffm_transformed])

In [234]:
combined.head()

Unnamed: 0,c_cnt,i_cnt,r_cnt,campaign_type,ctr_rate,num_ads_requested,hour,7,8,9,...,139,140,141,142,143,144,145,146,147,148
0,0.0,1.0,1,0,0.007053,1,4,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,1.0,1,1,0.000393,10,5,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0.0,1.0,1,1,0.000393,4,6,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,1.0,1,1,0.000248,10,6,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,1.0,1,1,0.000248,10,4,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [227]:
ffm_original = pd.DataFrame(filtered_subset['c_cnt'])
create_numerical_features(ffm_original, filtered_subset)

In [228]:
ffm_original.reset_index(inplace=True, drop=True)

In [224]:
# Encoding features of original data, takes a long time to run
ffm_original_encoded_device = le_device.transform(filtered_subset["ua_device_type"].tolist())
ffm_original_encoded_ad_id = encode_feature("advertiser_id", filtered_subset)
ffm_original_encoded_ad_network_id = encode_feature("ad_network_id", filtered_subset)
ffm_original_encoded_geo_country_code = encode_feature("geo_country_code2", filtered_subset)
ffm_original_encoded_ua_os_name = encode_feature("ua_os_name", filtered_subset)
ffm_original_encoded_site_id = encode_feature("site_id", filtered_subset)
ffm_original_encoded_continent = encode_feature("geo_continent_code", filtered_subset)

In [229]:
# Combined all encoded features
ffm_original = pd.concat([ffm_original, pd.DataFrame(ffm_original_encoded_device),
                                  pd.DataFrame(ffm_original_encoded_ad_id),
                                  pd.DataFrame(ffm_original_encoded_ad_network_id),
                                  pd.DataFrame(ffm_original_encoded_geo_country_code),
                                  pd.DataFrame(ffm_original_encoded_ua_os_name),
                                  pd.DataFrame(ffm_original_encoded_site_id), 
                                  pd.DataFrame(ffm_original_encoded_continent)], axis=1)

In [231]:
ffm_original.head()

Unnamed: 0,c_cnt,i_cnt,r_cnt,campaign_type,ctr_rate,num_ads_requested,hour,0,1,2,...,21,22,23,0.1,1.1,2.1,3,4,5,6
0,0.0,1.0,1,1,0.0001,10,3,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,1.0,1,1,0.0001,4,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.0,1.0,1,1,0.000416,10,6,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,1.0,1,1,0.0001,10,4,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.0,1.0,1,1,0.000248,10,5,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [232]:
# Rename columns to make it easier to transform to libffm format
ffm_original.columns = ["c_cnt"] + numerics + list(range(7, ncolumns))

In [233]:
convert_to_ffm(combined, "train_combined", numerics)

In [237]:
convert_to_ffm(ffm_original, "test_original", numerics)

KeyboardInterrupt: 

In [None]:
# Training task
ffm_model_whole = xl.create_ffm()  
ffm_model_whole.setTrain("./ffm/train_combined_ffm.txt")   # Training data
ffm_model_whole.setValidate("./ffm/test_original_ffm.txt")  # Validation data
ffm_model_whole.setSigmoid()

# param:
#  0. binary classification
#  1. learning rate : 0.2
#  2. regular lambda : 0.002
#  3. Metric for monitoring validation set performance (using accuracy rn)
#  4. Maximum number of Epochs
param_whole = {'task':'binary', 'lr':0.2, 'lambda':0.002,'metric':'acc','epoch':5}

# Train model
ffm_model_whole.fit(param_whole, "./ffm/model_whole.out")