In [1]:
import numpy as np
import sklearn
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import json
import math
import random
import itertools
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import CategoricalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.base import clone
#CategoricalEncoder only exists in a development version of sklearn, which you can't get with a plain conda update.
#If you have trouble installing that version, try the standalone implementation at https://pastebin.com/qs1es9XE.
#That standalone version does not implement the labels method, but it should be relatively easy to add.

First, let us read in our data and remove the rows and columns that contain NaN values.

In [2]:
#NOTE(review): unpickling can execute arbitrary code - only load 'Day1' if it comes from a trusted source.
df = (
    pd.read_pickle('Day1')
    #keep only the rows whose c_cnt is a finite number (this is what removes the NaN c_cnt rows)
    .pipe(lambda frame: frame[np.isfinite(frame['c_cnt'])])
    #keep only the columns in which at least 50% of the remaining rows are non-NaN
    .pipe(lambda frame: frame.dropna(thresh=int(0.5*frame.shape[0]), axis=1))
    #finally drop every sample that still contains a NaN anywhere
    .dropna(axis=0)
)

#number of non-NaN entries in each surviving column
print(df.count())

ad_network_id            3719533
ad_type                  3719533
advertiser_id            3719533
bid_requests             3719533
bid_responses            3719533
c_cnt                    3719533
campaign_id              3719533
campaign_type            3719533
cr_cnt                   3719533
creative_id              3719533
f_cnt                    3719533
geo_continent_code       3719533
geo_country_code2        3719533
geo_dma_code             3719533
geo_region_name          3719533
geo_timezone             3719533
i_cnt                    3719533
i_timestamp              3719533
pub_network_id           3719533
r_cnt                    3719533
r_num_ads_requested      3719533
r_num_ads_returned       3719533
r_num_ads_third_party    3719533
r_timestamp              3719533
rate_metric              3719533
session_id               3719533
site_id                  3719533
token                    3719533
ua_device                3719533
ua_device_type           3719533
ua_name   

Now, let's do some final preprocessing of our dataframe. 

In [3]:
#Pulls the hour (default) or the minute out of a timestamp string - used as a categorical feature.
def timestamp_to_min(timestamp, is_hour=True):
    """Return one ':'-delimited field of a timestamp string.

    timestamp: string that is split on ':'.
    is_hour:   when True, return the last two characters of the text before the first ':';
               when False, return the text between the first and the second ':'.
    """
    fields = timestamp.split(':')
    return fields[0][-2:] if is_hour else fields[1]

#plots frequency of a feature's different classes, useful for exploratory analysis
def plot_freq(col_name, df):
    """Plot the log of the number of rows per class of `col_name`, most frequent class first.

    col_name: name of the column whose classes are counted.
    df:       dataframe that contains `col_name`.

    The counts are the group sizes, so the function no longer depends on an 'ad_type'
    column being present or on which column happens to sit at position 2 of the frame.
    """
    class_counts = df.groupby(col_name).size().sort_values(ascending=False)
    #x is simply the rank of the class (0 = most frequent)
    plt.plot(np.arange(len(class_counts)), np.log(class_counts.values))
    plt.xlabel('class rank (most frequent first)')
    plt.ylabel('log(number of rows)')
    plt.title(str(col_name))
    plt.show()

#if a feature only has one unique value, it tells us nothing, so we drop it.
def remove_only_ones(df):
    """Drop, in place, every column of df that holds exactly one distinct value. Returns None."""
    constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
    if constant_cols:
        df.drop(constant_cols, inplace=True, axis=1)

#just prints how many unique values are in each feature
def print_column_counts(df):
    """Print one line per column: the column name followed by its number of unique values."""
    for name in df.columns:
        n_unique = df[name].nunique()
        print(name, n_unique)

#We do some final cleaning, changing all non-numerical features into strings for later.
def preprocess(df):
    """Clean df in place (returns None).

    Steps, in order:
      1. every column whose name does not end in 'cnt' is cast to str;
      2. columns with a single unique value are dropped (remove_only_ones);
      3. 'site_id' is dropped when present;
      4. 'i_timestamp' and 'r_timestamp' are replaced by the output of timestamp_to_min
         (called with its default arguments).
    """
    for col in df.columns:
        #endswith replaces the old character-by-character test, which raised IndexError on
        #very short column names such as 't' or 'nt'
        if not str(col).endswith('cnt'):
            df[col] = df[col].astype('str')
    remove_only_ones(df)
    if 'site_id' in df.columns:
        df.drop('site_id',inplace=True,axis=1)
    for ts_col in ('i_timestamp', 'r_timestamp'):
        #a constant timestamp column may have been removed by remove_only_ones above
        if ts_col in df.columns:
            df[ts_col] = df[ts_col].apply(timestamp_to_min)
    
#given a categorical column, we apply our earlier strategy of one-hot-encoding with maximum thresh=200
def transform_column(df, col, thresh=200, return_labels=False):
    """One-hot encode `col`, keeping at most the `thresh` most frequent classes.

    df:            dataframe containing `col` and 'c_cnt' (the 'c_cnt' count is used to rank classes).
    col:           name of the categorical column.
    thresh:        maximum number of classes that get their own output column; rows holding any
                   other class are encoded as all zeros (handle_unknown='ignore').
    return_labels: when True, return only the list of output-column names (str(col) + class),
                   without building or fitting an encoder.

    Returns a dense 2-D array with one column per kept class, or the list of labels.
    The kept classes are sorted before they are handed to the encoder, and the labels are
    built from that same sorted list so that labels[k] names output column k. (Previously
    the labels were returned in frequency order and did not line up with the encoded columns.)
    """
    df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
    #slicing keeps everything when there are fewer than `thresh` classes
    categories = sorted(df_frequency[0:thresh].index.values)
    if return_labels:
        return [str(col) + str(category) for category in categories]
    enc = CategoricalEncoder(categories=[categories],handle_unknown='ignore')
    column_values = df[col].values.reshape(-1, 1)
    enc.fit(column_values)
    return enc.transform(column_values).toarray()

#run the cleaning pass on the loaded frame (preprocess modifies df in place)
preprocess(df)
#names of the columns that are fed to the model as raw numbers instead of being one-hot encoded
#NOTE(review): preprocess leaves every column ending in 'cnt' un-stringified, but cr_cnt and f_cnt are not
#listed here, so they will be one-hot encoded - confirm that this is intended.
numerical_features = {'c_cnt', 'i_cnt', 'r_cnt', 'vi_cnt'}

Now we convert our dataframe into X and Y matrices to do machine learning on.

In [4]:
def remove_corr_features(X, labels, thresh=0.95):
    '''This function takes in the feature matrix, and drops all columns that are closely correlated, up to some threshhold
    between 0 and 1, with 0 being no correlation and 1 being high correlation. We do this so there is no
    overrepresentation of any one 'true feature', and to reduce overfitting.

    X:      2-D feature matrix, one column per feature.
    labels: sequence of feature names, one per column of X.
    thresh: a column j is dropped when its correlation with any earlier column i < j is > thresh
            (only positive correlation counts, as before).

    Returns (X, labels) with the dropped columns/names removed; labels comes back as a numpy array.
    The inputs are not modified, so the caller has to use the returned values.
    '''
    #np.corrcoef returns a bare scalar for a single feature, hence atleast_2d
    correlation_matrix = np.atleast_2d(np.corrcoef(X.T))
    n_features = correlation_matrix.shape[0]
    #only look above the diagonal so that every pair (i, j) with j > i is considered exactly once
    above_diagonal = np.triu(np.ones((n_features, n_features), dtype=bool), k=1)
    #NaN correlations (e.g. from constant columns) compare False, so such columns are kept
    too_similar = above_diagonal & (correlation_matrix > thresh)
    toremove = np.flatnonzero(too_similar.any(axis=0))

    X = np.delete(X, toremove, 1)
    labels = np.delete(labels, toremove)
    return X, labels
    
#NOTE(review): this name is defined a second time further down in this cell and that later definition is the one
#in effect at runtime; one of the two should be deleted.
def fix_class_imbalance_with_subsampling(X, Y, pos_ratio=9):
    '''
    This function takes in X as the feature matrix and Y as the label matrix, and undersamples the majority class such that
    #majority class samples / #minority class samples = pos_ratio. Returns the new X and Y matrices

    X:         array indexed by sample along its first axis.
    Y:         labels, one per sample; any non-zero label counts as positive.
    pos_ratio: number of negatives kept per positive. When there are fewer negatives than
               pos_ratio * #positives, all negatives are kept (this used to raise IndexError).

    Returns (X_sub, Y_sub) in a freshly shuffled order; Y_sub is a column vector of shape (n, 1).
    '''
    Y_col = np.asarray(Y).reshape(-1,1)
    positive_mask = Y_col[:, 0].astype(bool)
    ind_1 = np.flatnonzero(positive_mask)
    ind_0 = np.flatnonzero(~positive_mask)

    #draw the negatives at random instead of always taking the first ones
    n_negatives = min(int(pos_ratio*len(ind_1)), len(ind_0))
    to_sample_0 = np.random.permutation(ind_0)[:n_negatives]

    #shuffle so that positives and negatives are interleaved
    keep = np.random.permutation(np.concatenate([ind_1, to_sample_0]))
    return X[keep], Y_col[keep]

def get_data_matrix(df, one_hot_thresh=5, corr_thresh=0.95):
    '''
    This function takes a dataframe with a one hot encoding threshhold and correlation threshhold, and returns
    X as the feature matrix, Y as the label matrix, and labels as a list of feature names corresponding to X.

    df:             preprocessed dataframe; 'c_cnt' is the label column and is never used as a feature.
    one_hot_thresh: passed to transform_column as `thresh` for every column not in numerical_features.
    corr_thresh:    passed to remove_corr_features.
    '''
    Y = df['c_cnt'].values
    label_parts, matrix_parts = [], []
    for col in df:
        if col == 'c_cnt':
            continue
        if col in numerical_features:
            #numerical columns are used as they are, as a single column
            label_parts.append([str(col)])
            matrix_parts.append(df[col].values.reshape(-1,1))
        else:
            label_parts.append(transform_column(df, col, thresh=one_hot_thresh, return_labels=True))
            matrix_parts.append(transform_column(df, col, thresh=one_hot_thresh))
    labels = np.hstack(label_parts)
    X = np.hstack(matrix_parts)

    #remove_corr_features does not modify its arguments, so its result has to be picked up here;
    #if it hands nothing back (None) the unfiltered matrix and labels are kept
    filtered = remove_corr_features(X, labels, corr_thresh)
    if filtered is not None:
        X, labels = filtered
    return X, Y, labels

'''
This function does cross validation by splitting our data into training and validation sets.
'''
def test_train_split(X, Y, thresh=0.7): 
    #shuffle our data
    new_order = np.random.permutation(len(X))
    X, Y = X[new_order], Y[new_order]
    #split our data
    cutoff = int(thresh*len(X))
    X_train, Y_train = X[0:cutoff], Y[0:cutoff]
    X_test, Y_test = X[cutoff:], Y[cutoff:]
    return X_train, Y_train, X_test, Y_test

def fix_class_imbalance_with_subsampling(X, Y, pos_ratio=9):
    '''
    This function takes in X as the feature matrix and Y as the label matrix, and undersamples the majority class such that
    #majority class samples / #minority class samples = pos_ratio. Returns the new X and Y matrices. This should only
    be done on the training dataset - validation CANNOT be undersampled or you introduce outside bias.

    X:         array indexed by sample along its first axis.
    Y:         labels, one per sample; any non-zero label counts as positive.
    pos_ratio: number of negatives kept per positive. When there are fewer negatives than
               pos_ratio * #positives, all negatives are kept (this used to raise IndexError).

    Returns (X_sub, Y_sub) in a freshly shuffled order; Y_sub is a flat 1-D array.
    '''
    Y_col = np.asarray(Y).reshape(-1,1)
    positive_mask = Y_col[:, 0].astype(bool)
    ind_1 = np.flatnonzero(positive_mask)
    ind_0 = np.flatnonzero(~positive_mask)

    #draw the negatives at random instead of always taking the first ones
    n_negatives = min(int(pos_ratio*len(ind_1)), len(ind_0))
    to_sample_0 = np.random.permutation(ind_0)[:n_negatives]

    #shuffle so that positives and negatives are interleaved
    keep = np.random.permutation(np.concatenate([ind_1, to_sample_0]))
    return X[keep], np.ravel(Y_col[keep])

Now for the actual machine learning - we will be focusing on Logistic Regression.

In [None]:
def get_f1_score(test_cm):
    '''
    We will be using f1 score to measure our models, which is a good performance measurement scalar for datasets
    where the negatives >> positives.

    test_cm: confusion matrix indexed as test_cm[actual][predicted], with class 0 as the negative class
             and class 1 as the positive class.
    Returns 2*precision*recall/(precision+recall) for the positive class.
    '''
    predicted_pos_actual_neg = test_cm[0][1]
    predicted_neg_actual_pos = test_cm[1][0]
    hits = test_cm[1][1]
    precision = hits/(hits+predicted_pos_actual_neg)
    recall = hits/(hits+predicted_neg_actual_pos)
    return 2*precision*recall/(precision+recall)

def lr_grid_search(df, best, params):
    '''
    Perform grid search on logistic regression. Our hyperparameters are as below. Returns confusion matrices for each
    set of hyperparameters.

    df:     preprocessed dataframe with a 'c_cnt' label column.
    best:   best average f1 score known so far; None means "no previous result" and is treated as 0.
    params: hyperparameter tuple belonging to `best` (only printed until something better is found).

    Returns a dict mapping (one_hot_thresh, pos_rate, penalty, C, solver, mult_class) to the confusion
    matrix of the last trial run with that combination. The best average score and its parameters are
    printed once more at the very end, because rebinding `best`/`params` in here is not visible to the caller.

    Current best - 0.02941812675751676 (5, 25, 'l2', 0.1, 'newton-cg', 'ovr'), iteration 540
    '''
    n_trials = 3
    run_number = 1  #was called `iter`, which shadowed the builtin
    grid_search_dict = {}
    Y = df['c_cnt'].values
    if best is None:
        best = 0

    #the full sweep would be one_hot_thresh in [2, 5, 10, 25]
    for one_hot_thresh in [25]:
        X = get_data_matrix(df,one_hot_thresh=one_hot_thresh)[0]
        for pos_rate in [1, 2, 5, 10, 25, 50]:
            X_train, Y_train, X_test, Y_test = test_train_split(X, Y)
            #only the training part is undersampled; the validation part keeps its class ratio
            X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=pos_rate)

            model_grid = itertools.product(['l1', 'l2'],
                                           [0.01, 0.1, 1],
                                           ['newton-cg', 'sag', 'saga', 'lbfgs'],
                                           ['ovr', 'multinomial'])
            for penalty, C, solver, mult_class in model_grid:
                #the l1 penalty is only tried with the saga solver
                if penalty == 'l1' and solver != 'saga': continue
                print(run_number)
                print(best, params)
                run_number += 1
                key = (one_hot_thresh, pos_rate, penalty, C, solver, mult_class)
                av_score = 0
                for trial in range(n_trials):
                    lr = LogisticRegression(penalty=penalty, C=C, solver=solver, multi_class=mult_class, max_iter=5000)
                    lr.fit(X_train, Y_train)
                    test_cm = confusion_matrix(Y_test, lr.predict(X_test))
                    grid_search_dict[key] = test_cm
                    av_score += get_f1_score(test_cm)/n_trials

                #a NaN average compares False here and therefore never becomes the best
                if av_score > best:
                    best = av_score
                    params = key
    #final state, so that an improvement found by the last combination is reported as well
    print(best, params)
    return grid_search_dict
        
def _rank_features_by_mean_score(scores, features):
    '''Average, per feature, the scores of all subsamples that contained it; returns (means dict, ranking).'''
    feature_scores = {}
    for feat in features:
        hits = [score for cols, score in scores if feat in cols]
        #features that have not been part of any scored subsample yet are left out instead of getting NaN
        if hits:
            feature_scores[feat] = np.mean(hits)
    ranking = sorted(((score, feat) for feat, score in feature_scores.items()), reverse=True)
    return feature_scores, ranking


def _feature_scores_converged(previous, current, tol=0.01):
    '''True when both checkpoints cover the same features and every mean moved by at most `tol` (relative).'''
    if not current or set(previous) != set(current):
        return False
    for feat, score in current.items():
        change = abs(previous[feat]-score)
        if score == 0:
            if change > 0:
                return False
        elif change/abs(score) > tol:
            return False
    return True


def feature_scoring_random_subsample(lst, df, model, oh_thresh, resample_thresh, p=0.3, iterations=10000):
    '''
    Performs feature scoring by randomly subsampling our features and scoring each subsample.

    lst:             unused; kept so that existing calls keep working (it was overwritten immediately before).
    df:              preprocessed dataframe with a 'c_cnt' label column.
    model:           unfitted estimator; a fresh clone is fitted for every subsample.
    oh_thresh:       one-hot threshold passed to transform_column.
    resample_thresh: pos_ratio passed to fix_class_imbalance_with_subsampling (training part only).
    p:               probability with which each feature is included in a subsample.
    iterations:      maximum number of subsamples to try.

    Every 100 iterations the per-feature mean f1 score is recomputed; the function stops early once no
    mean has changed by more than 1% between two consecutive checkpoints.
    Returns a list of (mean f1 score, feature name) tuples, best feature first.
    '''
    #the label column must never be offered as a feature, otherwise the target leaks into X
    candidates = [col for col in df.columns if col != 'c_cnt']
    #encode every categorical column once up front, the encodings are reused by all subsamples
    transformed_col = {col: transform_column(df, col, thresh=oh_thresh)
                       for col in candidates if col not in numerical_features}
    Y = df['c_cnt'].values
    scores, past_score = [], {}

    for i in range(iterations):
        print("Iteration: ", i)
        #subsample random columns with probability p
        selected = [c for c in candidates if random.random() < p]
        #if we have no items, super unlucky, nothing to score in this iteration
        if selected:
            X = np.hstack([transformed_col[col] if col not in numerical_features else df[col].values.reshape(-1,1)
                           for col in selected])
            X_train, Y_train, X_test, Y_test = test_train_split(X, Y)
            X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=resample_thresh)

            lr = clone(model)
            lr.fit(X_train, Y_train)
            #NOTE(review): bin_round is not defined in the visible part of this notebook - it has to be in
            #scope before this function is called (presumably it maps a prediction to 0/1; confirm).
            test_cm = confusion_matrix(Y_test, [bin_round(pred) for pred in lr.predict(X_test)])
            score = get_f1_score(test_cm)
            if not math.isnan(score):
                scores.append((selected, score))

        if i > 0 and i % 100 == 0: #Updates score list every 100 iterations, and checks for convergence
            feature_scores, ranking = _rank_features_by_mean_score(scores, candidates)
            #check for convergence (needs a previous checkpoint to compare against)
            if past_score and _feature_scores_converged(past_score, feature_scores):
                return ranking
            past_score = feature_scores

    return _rank_features_by_mean_score(scores, candidates)[1]


Time to start using our functions.

In [None]:
#lr_grid_search cannot change `best`/`params` for us (rebinding arguments inside a function is not visible
#out here), so the winner is derived from the confusion matrices it returns.
best, params = None, None
grid_search_results = lr_grid_search(df, best, params)

#f1 score of the stored confusion matrix of every hyperparameter combination; NaN scores are ignored
grid_search_scores = {}
for combination, test_cm in grid_search_results.items():
    combination_score = get_f1_score(test_cm)
    if not math.isnan(combination_score):
        grid_search_scores[combination] = combination_score

if grid_search_scores:
    params = max(grid_search_scores, key=grid_search_scores.get)
    best = grid_search_scores[params]
print(best, params)