In [4]:
import numpy as np
import xgboost as xgb
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBT
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
%matplotlib inline

#from sklearn.preprocessing import CategoricalEncoder
#CategoricalEncoder is part of sklearn's development version, which you can't simply install or update with conda. If you have issues
#getting this version, try a hard-coded implementation of the class here - https://pastebin.com/qs1es9XE

In [2]:
# Open the HDF5 store and read the frame saved under the key 'df'.
# NOTE(review): the store handle stays open after this cell; call df_store.close()
# once nothing else needs to be read from it.
df_store = pd.HDFStore('combined_day1.h5')
df = df_store.get('df')

In [6]:
# (rows, columns) of the frame read from the HDF5 store
df.shape

(2262922, 75)

In [7]:
# Global list of output-column labels; transform_column appends one entry per
# column it emits, in emission order.
feature_map = []

In [8]:
def transform_column(df, col, thresh=200):
    """Turn one column of ``df`` into a 2-D block of model features.

    Columns listed in the global ``numerical_features`` are passed through
    unchanged as a single ``(n_rows, 1)`` column. Every other column is
    one-hot encoded:

    * if it has more than ``thresh`` distinct values, only the ``thresh`` most
      frequent ones get a column (ordered by sorted label); rows holding any
      other value are encoded as all zeros;
    * otherwise every distinct value gets a column, in order of first
      appearance.

    Side effects: prints the column name (and its distinct-value count for
    categorical columns) and appends one label per emitted column to the
    global ``feature_map``. Calling it twice for the same column therefore
    appends the labels twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and, for high-cardinality columns, ``'c_cnt'``
        (used only to count rows per class).
    col : str
        Name of the column to transform.
    thresh : int, default 200
        Maximum number of one-hot columns emitted for a categorical column.

    Returns
    -------
    numpy.ndarray
        ``(n_rows, 1)`` for numerical columns, ``(n_rows, n_classes)`` of
        0/1 integers for categorical ones.
    """
    if col in numerical_features:
        print("Numerical" , col)
        feature_map.append(col)
        return df[col].values.reshape(-1,1)

    n_unique = df[col].nunique()
    print(col, n_unique)

    values = df[col].values
    if n_unique > thresh:
        # rank classes by how many rows they have and keep the top `thresh`
        df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
        categories = sorted(df_frequency[0:thresh].index.values)
    else:
        # dict.fromkeys keeps first-appearance order and de-duplicates
        categories = list(dict.fromkeys(values))

    feature_map.extend(col + " - " + str(item) for item in categories)
    column_of = {item: position for position, item in enumerate(categories)}

    # One dictionary lookup per row instead of comparing every row against
    # every category; -1 marks values that did not make the cut.
    codes = np.fromiter((column_of.get(value, -1) for value in values),
                        dtype=int, count=len(values))
    encoded = np.zeros((len(values), len(categories)), dtype=int)
    kept_rows = np.flatnonzero(codes >= 0)
    encoded[kept_rows, codes[kept_rows]] = 1
    return encoded

Now we need to do some data cleaning. From some initial exploratory analysis, we can see that we have 5 features with only 16 non-NaN values, and a few other features with a similarly low number of non-NaN values. To simplify things, we drop every feature that has fewer than some threshold of non-NaN values (here, 30% of the original row count). Also, since we are trying to predict c_cnt, samples where c_cnt is NaN are useless, so we throw those away as well.

After this, fewer than 10% of our remaining samples contain any NaN values, so we simply drop those samples; we do not lose much information by doing so.

In [11]:
#report the number of non-NaN values in every column
print(df.count())
n = len(df)

#rows whose target c_cnt is NaN (or infinite) cannot be used for training
has_target = np.isfinite(df['c_cnt'])

#keep those rows, then drop columns with fewer than 30% of the ORIGINAL row
#count in non-NaN values, then drop every row that still contains a NaN
df = (
    df[has_target]
    .dropna(thresh=int(0.3*n), axis=1)
    .dropna(axis=0)
)

_host                    2262922
ad_network_id            2262922
ad_type                  2262922
adlog_count              2262922
advertiser_id            2262922
bid_requests             2262922
bid_responses            2262922
c_cnt                    1724754
c_flag_cnt               1724754
c_timestamp                 1186
c_txn_fee                    429
c_txn_rate                   429
campaign_id              2262922
campaign_type            2262922
ck                       2262922
cr_cnt                   2262922
creative_id              2262922
exp_mode                  475358
f_cnt                    1724754
f_nfr                          2
f_timestamp                    2
flag                       22239
geo_area_code            1788769
geo_city_code            2182133
geo_city_name            2183499
geo_continent_code       2262921
geo_country_code2        2262921
geo_country_code3        2262921
geo_dma_code             1788139
geo_postal_code          2095169
          

Now, we have some more preprocessing to do, so we wrote some simple functions for preprocessing. The most important thing we do here is that since most of our features are categorical, we must encode them with one-hot-encoding, which essentially turns one feature into n different features, one for each type of class in the original features. For example, if we had a feature for "hair color", we would map it to a higher dimensional feature space consisting of "is the hair white", "is the hair black", "is the hair brown", etc. Only one of these features would be a 1, and the rest would be 0.

Normally, each feature would be mapped to n features, with n being the number of unique classes that feature contains. For our data, however, some features have thousands or even millions of unique classes, which would result in an extremely sparse dataset. To account for this, we set a threshold of 200, so that a feature never contributes more than 201 classes: the 200 most frequent classes each keep their own column, and all remaining classes are bunched into a single class (encoded as an all-zero row). The motivation for this is that for the more frequent classes we have enough data that our ML models will be able to extract some information, but for the less frequent classes there is too little data for accurate analysis, so we group them as one class.

In [12]:
#Pulls one time field out of a timestamp string - used as a categorical feature.
def timestamp_to_min(timestamp, is_hour=True):
    """Return one colon-delimited field of ``timestamp`` as a string.

    With ``is_hour=True`` (the default) the result is the last two characters
    before the first ``':'``; otherwise it is the text between the first and
    second ``':'``. Presumably these are the hour and the minute of an
    ``...HH:MM:SS`` timestamp - confirm against the data.

    Raises ``IndexError`` when ``is_hour`` is False and the string contains
    no ``':'``.
    """
    fields = timestamp.split(':')
    if not is_hour:
        return fields[1]
    return fields[0][-2:]

#plots frequency of a feature's different classes, useful for exploratory analysis
def plot_freq(col_name, df):
    """Plot log(class frequency) against frequency rank for ``df[col_name]``.

    Classes are ranked from most to least frequent. The frequency of a class
    is the number of rows that carry it (``groupby(...).size()``), so the
    function does not depend on any other column being present in ``df`` or
    being free of NaNs.

    Parameters
    ----------
    col_name : str
        Column whose class distribution is plotted.
    df : pandas.DataFrame
        Frame containing ``col_name``.
    """
    class_counts = df.groupby(col_name).size().sort_values(ascending=False)
    # x is the rank of the class (0 = most frequent), y its log row count
    plt.plot(np.arange(len(class_counts)), np.log(class_counts.values))
    plt.show()

#a feature with a single unique value carries no information, so we drop it.
def remove_only_ones(df):
    """Drop, in place, every column of ``df`` that holds exactly one unique value.

    Returns None; the caller's frame is modified directly.
    """
    constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
    df.drop(constant_cols, inplace=True, axis=1)

#prints the number of distinct values found in each feature
def print_column_counts(df):
    """Print one line per column of ``df``: the column name and its ``nunique()``."""
    for column_name in df.columns:
        distinct = df[column_name].nunique()
        print(column_name, distinct)

#We do some final cleaning, changing all non-numerical features into strings for later.
def preprocess(df):
    """Prepare ``df`` in place for one-hot encoding.

    Steps, in order:

    1. Cast every column whose name does not end in ``'cnt'`` to ``str``;
       columns ending in ``'cnt'`` keep their dtype.
    2. Drop columns that hold a single unique value (``remove_only_ones``).
    3. Drop ``'site_id'`` if present.
    4. Reduce ``'i_timestamp'`` and ``'r_timestamp'`` (when still present) to
       the field returned by ``timestamp_to_min`` with its default arguments.

    Returns None; the caller's frame is modified directly.
    """
    for name in df:
        # endswith also copes with column names shorter than three characters
        if not str(name).endswith('cnt'):
            df[name] = df[name].astype('str')
    remove_only_ones(df)
    if 'site_id' in df.columns:
        df.drop('site_id',inplace=True,axis=1)
    # a timestamp column may be absent, or may just have been dropped above
    # because it was constant, so guard it the same way as 'site_id'
    for stamp_col in ('i_timestamp', 'r_timestamp'):
        if stamp_col in df.columns:
            df[stamp_col] = df[stamp_col].apply(timestamp_to_min)
    


#final preprocessing - note that this modifies df in place
preprocess(df)
#names of the columns treated as numerical rather than categorical
numerical_features = {'c_cnt', 'i_cnt', 'r_cnt', 'vi_cnt'}
#df2 is df without the target column, so that X will not include 'c_cnt'
df2 = df.drop('c_cnt', axis=1)
#u,s,v = np.linalg.svd(X)

In [10]:
# columns excluded from the reduced feature set
dropped_features = ["i_cnt", "vi_cnt", "r_num_ads_returned", "i_flag_cnt", "vi_flag_cnt"]
df_removed = df.drop(dropped_features, axis=1)
# same frame without the target column 'c_cnt'
df2_removed = df_removed.drop('c_cnt', axis=1)



In [13]:
# from here on, df / df2 refer to the reduced frames (with / without 'c_cnt')
df, df2 = df_removed, df2_removed

#given a categorical column, we apply our earlier strategy of one-hot-encoding with maximum thresh=200
def transform_column(df, col, thresh=200, return_labels=False):
    """One-hot encode ``df[col]``, keeping at most ``thresh`` of its most frequent classes.

    Classes are ranked by their row count; the kept classes are then sorted,
    and that sorted order is the column order of the encoded matrix. Rows
    holding a class that was not kept are encoded as all zeros
    (``handle_unknown='ignore'``).

    NOTE(review): this definition shadows the earlier ``transform_column`` in
    this notebook. Unlike that one it does not pass ``numerical_features``
    through unencoded and does not append to ``feature_map``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and ``'c_cnt'`` (used only to count rows per class).
    col : str
        Name of the column to encode.
    thresh : int, default 200
        Maximum number of classes that receive their own column.
    return_labels : bool, default False
        When True, skip the encoding and return the column labels instead.

    Returns
    -------
    list of str
        When ``return_labels`` is True: ``str(col) + str(class)`` for every
        kept class, in the same order as the columns of the encoded matrix.
    numpy.ndarray
        Otherwise: dense ``(n_rows, n_kept_classes)`` one-hot matrix.
    """
    print(col)
    df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
    # slicing past the end is harmless, so this also covers columns that have
    # no more than `thresh` classes
    categories = sorted(df_frequency[0:thresh].index.values)
    if return_labels:
        # built from the sorted categories so label k describes output column k
        return [str(col) + str(i) for i in categories]
    # OneHotEncoder is imported at the top of the notebook; the import of
    # CategoricalEncoder is commented out there, so that name is undefined.
    enc = OneHotEncoder(categories=[categories], handle_unknown='ignore')
    values = df[col].values.reshape(-1, 1)
    return enc.fit_transform(values).toarray()

Create our X and Y matrices - adjust the threshold value for one-hot encoding (1HE) here

In [13]:
# maximum number of one-hot columns kept per categorical feature
one_hot_thresh = 20
# target vector
Y = df['c_cnt'].values
# feature matrix: transform each column of df2 and stack the results side by side
X = np.hstack([
    transform_column(df, col, thresh=one_hot_thresh)
    for col in df2
])


_host 46
ad_network_id 16
ad_type 2
advertiser_id 23
c_flag_cnt 3
campaign_id 44
campaign_type 2
ck 5
creative_id 100
f_cnt 2
geo_area_code 256
geo_city_code 11444
geo_city_name 8139
geo_dma_code 210
geo_postal_code 16860
geo_region_name 51
geo_timezone 8
Numerical i_cnt
i_flag_cnt 4
i_timestamp 11
ip_address 260620
num_ads 6
pub_network_id 3
Numerical r_cnt
r_num_ads_requested 6
r_num_ads_returned 8
r_timestamp 13
rate_metric 2
referer 74092
session_id 927350
token 100
ua 12237
ua_device 1724
ua_device_type 4
ua_major 72
ua_minor 31
ua_name 35
ua_os 65
ua_os_name 8
url 90728
user_agent 12217
uuid 475380
Numerical vi_cnt
vi_flag_cnt 3
zone_id 51


In [23]:
# (samples, stacked feature columns) of the matrix built with np.hstack
X.shape

(948596, 589)

In [15]:
def fix_class_imbalance_with_subsampling(tempX, tempY, pos_ratio=2, random_state=None):
    """Rebalance a binary dataset by keeping all positives and a random subset of negatives.

    A sample is positive when its label is truthy (non-zero). Every positive
    is kept; ``pos_ratio * n_positives`` negatives are drawn uniformly at
    random, without replacement, from ALL negatives. If fewer negatives exist
    than requested, all of them are kept. The combined rows are shuffled
    before being returned.

    Parameters
    ----------
    tempX : numpy.ndarray
        Feature matrix, one row per sample.
    tempY : array-like
        Labels, one per row of ``tempX``; any shape that flattens to ``n_rows``.
    pos_ratio : int, default 2
        Number of negatives to keep per positive (despite the name).
    random_state : None, int or numpy.random.RandomState, default None
        ``None`` uses numpy's global random state (so ``np.random.seed`` still
        controls it); an int seeds a private generator for reproducible output.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The subsampled, shuffled feature rows and their labels; the labels
        have shape ``(n_kept, 1)``.
    """
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    labels = np.asarray(tempY).reshape(-1, 1)
    is_positive = labels.ravel().astype(bool)
    pos_idx = np.flatnonzero(is_positive)
    neg_idx = np.flatnonzero(~is_positive)

    # never ask for more negatives than exist
    n_negatives = min(int(pos_ratio * len(pos_idx)), len(neg_idx))
    # shuffle the negative indices themselves, then take a prefix: this is a
    # uniform sample of the whole negative pool, not just of its first rows
    sampled_neg = rng.permutation(neg_idx)[:n_negatives]

    X2 = np.vstack([tempX[pos_idx], tempX[sampled_neg]])
    Y2 = np.vstack([labels[pos_idx], labels[sampled_neg]])

    # positives were stacked first, so shuffle rows and labels together
    new_ind = rng.permutation(len(X2))
    return X2[new_ind], Y2[new_ind]

In [19]:
def score(y_pred , y_test):
    """Print precision, recall, log loss and the confusion matrix; return the F1 score.

    Note the argument order: predictions first, ground truth second.
    Precision and recall are read off the confusion matrix for the class
    at index 1.

    NOTE(review): log_loss is given ``y_pred`` as-is; if callers pass hard
    class labels rather than probabilities the printed value is of limited
    use - confirm what is intended.
    """
    matrix = confusion_matrix(y_test , y_pred)
    # rows are true classes, columns are predicted classes
    true_pos = matrix[1][1]
    false_pos = matrix[0][1]
    false_neg = matrix[1][0]
    prec = true_pos / (true_pos + false_pos)
    rec = true_pos / (true_pos + false_neg)
    print("Precision: ", prec)
    print("Recall: " , rec)
    loss = log_loss(y_test,y_pred)
    print("Log Loss: ", loss)
    print(matrix)
    return f1_score(y_test , y_pred)

Now we split the data into two sets, training and testing, so that overfitting shows up as a gap between training and test performance. The split is done before any subsampling to avoid contaminating the test set. The training set is then subsampled to raise the ratio of clicks to non-clicks from roughly 1:2000 to between 1:60 and 1:100 (the ratios tried in the loop below), which allows the models to learn the click patterns more accurately.

In [20]:


# hold out 30% of the samples; the test set is never subsampled
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# try several negatives-per-positive ratios for the subsampled training set
for i in (60, 80, 100):
    X_fix, Y_fix = fix_class_imbalance_with_subsampling(X_train, y_train, pos_ratio=i)
    # the subsampler returns labels as a column vector; flatten them for fit()
    Y_fix = Y_fix.ravel()

    model = xgb.XGBClassifier(
        gamma=5,
        min_child_weight=3,
        objective='binary:logistic',
    )
    model.fit(X_fix, Y_fix)

    # evaluate on the untouched test split
    y_pred = model.predict(X_test)
    f1 = score(y_pred, y_test)
    print("F1 Score: " , f1)


Precision:  0.272727272727
Recall:  0.100840336134
Log Loss:  0.0168702381624
[[284428     32]
 [   107     12]]
F1 Score:  0.147239263804
Precision:  0.48
Recall:  0.100840336134
Log Loss:  0.0145641932896
[[284447     13]
 [   107     12]]
F1 Score:  0.166666666667
Precision:  0.333333333333
Recall:  0.100840336134
Log Loss:  0.0158992719001
[[284436     24]
 [   107     12]]
F1 Score:  0.154838709677
