In [1]:
import numpy as np
import sklearn
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import json
import math
import random
import itertools
import pickle
import datetime
import pytz
from pytz import timezone
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import CategoricalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
#CategoricalEncoder is part of sklearn's developer version, which you can't just update with conda. If you have issues
#getting this version, try a hard code implementation of the library here - https://pastebin.com/qs1es9XE. There is
#no implementation of the labels method for the hard code, but it should be relatively easy to implement.

OHE_PATH = "data/ohe_labels.pkl"

First, let us read in our data.

In [2]:
#Open our hdf files
pos_store1 = pd.HDFStore('data/day1_positives_processed.h5')
neg_store1 = pd.HDFStore('data/day1_negatives_processed.h5')
pos_store2 = pd.HDFStore('data/day2_positives_processed.h5')
neg_store2 = pd.HDFStore('data/day2_negatives_processed.h5')

#Load out dataframes
df_pos1 = pos_store1['df'] 
df_neg1 = neg_store1['df']
df_pos2 = pos_store2['df'] 
df_neg2 = neg_store2['df']

#Close our hdf files
pos_store1.close()
neg_store1.close()
pos_store2.close()
neg_store2.close()

#combine across days and free excess memory
df_pos = pd.concat([df_pos1, df_pos2])
df_neg = pd.concat([df_neg1, df_neg2])
df_pos1, df_pos2, df_neg1, df_neg2 = None, None, None, None
df_pos = df_pos.sample(frac=1)
df_neg = df_neg.sample(frac=1)

In [59]:
#how many samples of each?
print(df_pos.shape)
print(df_neg.shape)

(1305631, 39)
(626297, 39)


Here is the new transform_column function for applying one hot encoding. This is written for batch training and entire dataset training - set the boolean to True or False depending on which one you are trying to do. Threshhold for batch training is the minimum # of times a class must appear to be a part of the OHE. Threshold for entire dataset training is the maximum # of classes to keep.

In [3]:
#this is the helper function for preprocessing ohe labels - dont need to explicitly call this function
def generate_ohe_labels(df, c, thresh=200, k_most_freq=False):
    appears, oh_index = {}, {}
    if c == 'keywords':
        for val in df[c].values:
            for word in val:
                if word not in appears:
                    appears[word] = 0
                appears[word] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    else:    
        for val in df[c].values:
            if val not in appears:
                appears[val] = 0
            appears[val] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    return oh_index

#generate OHE labels to be used for batch learning - run this FIRST
def preprocess_ohe(df, thresh=200, path=OHE_PATH, k_most_freq=False):
    #create and save our ohe labels
    ohe_labels = {}
    for c in df:
        if c == 'c_cnt':
            continue
        else:
            ohe_labels[c] = generate_ohe_labels(df, c, thresh, k_most_freq=k_most_freq)

    with open(path, 'wb') as f:
        pickle.dump(ohe_labels, f, pickle.HIGHEST_PROTOCOL)
        
    return ohe_labels
        
#generates a small X and Y matrix by sampling from both negative and positive dataframes
def generate_batch(df_pos, df_neg, batch_size, path=OHE_PATH, pos_ratio=1, ohe_labels=None):
    #load our ohe labels
    if ohe_labels == None:
        with open(path, 'rb') as f:
            ohe_labels = pickle.load(path)
    
    #pick which indices to use for our batch training
    indices_touse_pos = np.random.permutation(len(df_pos))[0:batch_size]
    indices_touse_neg = np.random.permutation(len(df_neg))[0:int(batch_size*pos_ratio)]
    
    #generate X and Y matrices
    X, Y = [], []
    for i in range(batch_size):
        sample_x, sample_y = generate_one_sample(df_pos, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    for i in range(int(batch_size*pos_ratio)):
        sample_x, sample_y = generate_one_sample(df_neg, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    
    #shuffle X and Y matrices
    shuffled_indices = np.random.permutation(len(X))
    return [X[i] for i in shuffled_indices], [Y[i] for i in shuffled_indices]
    
#generates exactly one random sample from a dataframe using OHE. this is a helper function, shouldn't be explicitly called
#if index is not -1, will not generate random index
def generate_one_sample(df, ohe_labels, index=-1):
    if index == -1:
        index = np.random.randint(0,len(df))
    X = [[0 if ohe_labels[c][df[c].values[index]] != j else 1 for j in range(len(ohe_labels[c]))]
         if df[c].values[index] in ohe_labels[c] else [0 for j in range(len(ohe_labels[c]))]
                  for c in ohe_labels if c != 'c_cnt' and c != 'keywords']
    wordset = set([w for w in df['keywords'].values[index]])
    X.append([1 if v in wordset else 0 for v in ohe_labels['keywords']])
    X = np.array(X)
    X = np.hstack(X)
    return X, df['c_cnt'].values[index]

#takes in the dataframe, returns an X and Y matrix 
def transform_df(df, ohe_labels):
    data = np.vstack([generate_one_sample(df, ohe_labels, index=i) for i in range(len(df))])
    X = data[0:-1]
    Y = data[-1].flatten()
    return X, Y

#generate ohe labels
ohe_labels = preprocess_ohe(df_neg, k_most_freq=False)
#get validation set - take first 100,000 samples of both
df_validate = pd.concat([df_neg.head(100000), df_pos.head(100000)])
df_pos = df_pos.tail(len(df_pos)-100000)
df_neg = df_neg.tail(len(df_neg)-100000)


'''If you wanted to do one hot encoding for k most frequent classes - use this code
ohe_labels = preprocess_ohe(df_neg, thresh=k, k_most_freq=True)
df = transform_df(df, ohe_labels)

'''

Let's train a few models with different l1/l2 losses and alpha values.

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

def validate(models, df_validate):
    Y_test, Y_pred, Y_prob = [], [[] for i in models], [[] for i in models]
    #Separate the validation dataframe into batches of 100
    for i in range(int(len(df_validate)/100)):
        X, Y = [], []
        for j in range(i*100,(i+1)*100):
            miniX, miniY = generate_one_sample(df_validate, ohe_labels, index=j)
            X.append(miniX)
            Y.append(miniY)
        Y_test.extend(Y)
        for i, model in enumerate(models):
            Y_pred[i].extend(model.predict(X))
            Y_prob[i].extend(model.predict_proba(X))
    #get log loss
    logloss = [log_loss(Y_test, p) for p in Y_prob]
    test_cm = [confusion_matrix(Y_test, i) for i in Y_pred]
    
    return logloss, test_cm

models = [SGDClassifier(loss='log',penalty='l1',alpha=0.1), SGDClassifier(loss='log',penalty='l2',alpha=0.1),
          SGDClassifier(loss='log',penalty='l1',alpha=1), SGDClassifier(loss='log',penalty='l2',alpha=1), 
          SGDClassifier(loss='log',penalty='l1',alpha=0.01), SGDClassifier(loss='log',penalty='l2',alpha=0.01)]

X, Y = generate_batch(df_pos, df_neg, 100, ohe_labels=ohe_labels)
print(np.array(X).shape)
for i in range(len(models)):
    models[i].partial_fit(X, Y, classes=[0, 1])
for i in range(10000):
    X, Y = generate_batch(df_pos, df_neg, 100, ohe_labels=ohe_labels)
    if i % 10 == 0 and i != 0:
        for j in range(len(models)):
            print("Model ", j)
            train_cm = confusion_matrix(Y, models[j].predict(X))
            print("iteration ", i, "with training score ", models[j].score(X,Y))
            print("Training confusion matrix:")
            print(train_cm)
    if i % 1000 == 0 and i != 0:
        logloss, test_cm = validate(models, df_validate)
        for i in range(len(models)):
            print("Model ", i, ", Log loss score: ", logloss[i])
            print("Test confusion matrix:")
            print(test_cm[i])
    for j in range(len(models)):
        models[j].partial_fit(X, Y, classes=[0, 1])

(200, 5333)
Model  0
iteration  10 with training score  0.855
Training confusion matrix:
[[85 17]
 [12 86]]
Model  1
iteration  10 with training score  0.855
Training confusion matrix:
[[85 17]
 [12 86]]
Model  2
iteration  10 with training score  0.51
Training confusion matrix:
[[102   0]
 [ 98   0]]
Model  3
iteration  10 with training score  0.84
Training confusion matrix:
[[82 20]
 [12 86]]
Model  4
iteration  10 with training score  0.93
Training confusion matrix:
[[94  8]
 [ 6 92]]
Model  5
iteration  10 with training score  0.93
Training confusion matrix:
[[96  6]
 [ 8 90]]
Model  0
iteration  20 with training score  0.83
Training confusion matrix:
[[88 14]
 [20 78]]
Model  1
iteration  20 with training score  0.845
Training confusion matrix:
[[88 14]
 [17 81]]
Model  2
iteration  20 with training score  0.51
Training confusion matrix:
[[102   0]
 [ 98   0]]
Model  3
iteration  20 with training score  0.82
Training confusion matrix:
[[84 18]
 [18 80]]
Model  4
iteration  20 with

KeyboardInterrupt: 

It seems that a l2 loss with .01 alpha does very good. Let's see how it does on the first day's worth of data. This is consdiered as the test-set

In [5]:
#Open our hdf files
day1_store = pd.HDFStore('data/day1_positives_processed.h5')

#Load out dataframes
df_day1 = day1_store['df'] 

#Close our hdf files
day1_store.close()

#too many samples, let's lower it a little
df_day1 = df_day1.sample(frac=0.2)

Let's validate on this

In [None]:
logloss, test_cm = validate([models[5]], df_day1)
print(logloss[0])
print(test_cm[0])

Now we convert our dataframe into X and Y matrices to do machine learning on.

In [None]:
'''This function takes in the feature matrix, and drops all columns that are closely correlated, up to some threshhold
between 0 and 1, with 0 being no correlation and 1 being high correlation. We do this so there is no
overrepresentation of any one 'true feature', and to reduce overfitting.'''
def remove_corr_features(X, labels, thresh=0.95):
    correlation_matrix, corrs, toremove = np.corrcoef(X.T), [], set()
    if not isinstance(correlation_matrix, list): return
    
    for i, row in enumerate(correlation_matrix):
        for j, corr in enumerate(row):
            if j <= i: continue
            if corr > thresh:
                toremove.add(j)   
                corrs.append((corr, labels[i], labels[j]))

    X = np.delete(X, [i for i in toremove], 1)
    if labels != None: labels = np.delete(labels, [i for i in toremove])
    
'''
This function takes in X as the feature matrix and Y as the label matrix, and undersamples the majority class such that 
#majority class samples / #minority class samples = pos_ratio. Returns the new X and Y matrices. This should only
be done on the training dataset - validation CANNOT be undersampled or you introduce outside bias.
'''
def fix_class_imbalance_with_subsampling(X, Y, pos_ratio=9):
    Y = Y.reshape(-1,1)
    ind_1, ind_0 = [], []
    for i, y_h in enumerate(Y):
        if y_h: ind_1.append(i)
        else: ind_0.append(i)
    to_sample = np.random.permutation(min(int(pos_ratio*len(ind_1)),len(ind_0)))
    to_sample_0 = [ind_0[i] for i in to_sample]
    X2 = np.vstack([X[ind_1],X[to_sample_0]])
    Y2 = np.vstack([Y[ind_1],Y[to_sample_0]])
    Y = Y.reshape(-1)
    
    new_ind = np.random.permutation(len(X2))
    return X2[new_ind],np.ravel(Y2[new_ind])

'''
This function takes a dataframe with a one hot encoding threshhold and correlation threshhold, and returns
X as the feature matrix, Y as the label matrix, and labels as a list of feature names corresponding to X.
'''
def get_data_matrix(df, one_hot_thresh=5, corr_thresh=0.95):
    Y = df['c_cnt'].values
    labels = np.hstack([transform_column(df, col, thresh=one_hot_thresh, return_labels=True) if col not in numerical_features else [str(col)]
                        for col in df if col != 'c_cnt'])
    X = np.hstack([transform_column(df, col, thresh=one_hot_thresh) if col not in numerical_features else df[col].values.reshape(-1,1)
               for col in df if col != 'c_cnt'])
    remove_corr_features(X, labels, corr_thresh)
    return X, Y, labels

'''
This function does cross validation by splitting our data into training and validation sets.
'''
def test_train_split(X, Y, thresh=0.7): 
    #shuffle our data
    new_order = np.random.permutation(len(X))
    X, Y = X[new_order], Y[new_order]
    #split our data
    cutoff = int(thresh*len(X))
    X_train, Y_train = X[0:cutoff], Y[0:cutoff]
    X_test, Y_test = X[cutoff:], Y[cutoff:]
    if sum(Y_train) < 100 or sum(Y_test) < 100: return test_train_split(X, Y, thresh=0.7)
    return X_train, Y_train, X_test, Y_test

'''
This function takes in our original dataframe, and outputs a new one where each categorical feature is
replaced by the empirical click through rate of each class. Returns X_train,Y_train,X_test,Y_test
'''
def generate_empirical_data(df, numerical_features, thresh=0.7, corr_thresh=0.95):
    df = df.sample(frac=1)
    train_size = int(thresh*df.shape[0])
    test_size = df.shape[0]-train_size
    df_train = df.head(train_size).copy()
    df_test = df.tail(test_size).copy()
    for col in df_train:
        if col in numerical_features or col == 'c_cnt': continue
        df_star = df_train.groupby(col)['c_cnt'].mean()
        av_dict = {}
        for col_id, av in zip(df_star.index, df_star.values):
            av_dict[col_id] = av
        df_train[col] = [av_dict[v] for v in df_train[col]]
        df_test[col] = [av_dict[v] if v in av_dict else 0 for v in df_test[col]]
    Y_train, Y_test = df_train['c_cnt'].values, df_test['c_cnt'].values
    X_train, X_test = df_train[[c for c in df_train.columns if c != 'c_cnt']].values, df_test[[c for c in df_train.columns if c != 'c_cnt']].values
    X = np.vstack([X_train, X_test])
    remove_corr_features(X, None, corr_thresh)
    X_train, X_test = X[0:train_size], X[train_size:]
    return X_train, Y_train, X_test, Y_test

Now for the actual machine learning - we will be focusing on Logistic Regression.

In [None]:
'''
We will be using f1 score to measure our models, which is a good performance measurement scalar for datasets
where the negatives >> positives.
'''
def get_f1_score(test_cm):
    true_neg  = test_cm[0][0]
    false_pos = test_cm[0][1]
    false_neg = test_cm[1][0]
    true_pos  = test_cm[1][1]
    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    if precision + recall == 0: return 0
    score = 2*precision*recall/(precision+recall)
    return score

def test_model(df, model, one_hot_thresh, pos_rate):
    X, Y, labels = get_data_matrix(df,one_hot_thresh=one_hot_thresh)
    X_train, Y_train, X_test, Y_test = test_train_split(X, Y)
    X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=pos_rate)
    model.fit(X_train, Y_train)
    test_cm = confusion_matrix(Y_test, model.predict(X_test))
    return test_cm
'''
Perform grid search on logistic regression. Our hyperparameters are as below. Returns confusion matrices for each
set of hyperparameters.
'''
def lr_grid_search(df, X=None, Y=None, default_start=True, toprint=False):
    '''
    Current best - 0.02941812675751676 (5, 25, 'l2', 0.1, 'newton-cg', 'ovr'), iteration 540
    for 25 one hot thresh - 0.025665219852802415 (25, 25, 'l2', 0.1, 'saga', 'multinomial')
    '''
    iter = 1
    best = 0
    grid_search_dict = {}
    params = None
    if default_start == True:
        print("default start")
        Y = df['c_cnt'].values
        for one_hot_thresh in [2, 5, 10, 25]:
            X = get_data_matrix(df,one_hot_thresh=one_hot_thresh)[0]
            tempparams, score = lr_grid_search(df, X, Y, default_start=False)
            if score > best:
                params = (one_hot_thresh, tempparams)
                best = score
    else:
        remove_corr_features(X, None, 0.9)
        for pos_rate in [1, 2, 5, 10]:
            for penalty in ['l1', 'l2']:
                for C in [0.01, 0.1, 1]:
                    for solver in ['newton-cg', 'sag', 'saga', 'lbfgs']:
                        if penalty == 'l1' and solver != 'saga': continue
                        for mult_class in ['ovr', 'multinomial']:
                            if toprint:
                                print(iter)
                                print(best, params)
                                iter += 1
                            av_score = 0
                            for trial in range(2):
                                X_train, Y_train, X_test, Y_test = test_train_split(X, Y)
                                X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=pos_rate)
                                lr = LogisticRegression(penalty=penalty, C=C, solver=solver, multi_class=mult_class, max_iter=5000)
                                lr.fit(X_train, Y_train)
                                test_cm = confusion_matrix(Y_test, lr.predict(X_test))
                                av_score += get_f1_score(test_cm)/2

                            if av_score > best:
                                best = av_score
                                params = (pos_rate, penalty, C, solver, mult_class)
    return params, best
        
'''
Performs feature scoring by randomly subsampling our features and scoring each subsample. 
'''
def feature_scoring_random_subsample(lst, df, model, oh_thresh, resample_thresh, p=0.3, iterations=10000):
    transformed_col, past_score = {}, {}
    for col in df.columns:
        if col == 'c_cnt': continue
        if col not in numerical_features:
            transformed_col[col] = transform_column(df, col, thresh=oh_thresh)
    Y = df['c_cnt'].values
    scores = []
    for i in range(iterations):
        print("Iteration: ", i)
        selected = [c for c in df.columns if random.random() < p and c != 'c_cnt']
        if len(selected) < 1: continue
        X = np.hstack([transformed_col[col] if col not in numerical_features else df[col].values.reshape(-1,1)
                   for col in selected])
        params, score = lr_grid_search(df, X, Y, default_start = False)

        if math.isnan(score): continue
        scores.append((selected, score))
        
        if i > 0 and i % 100 == 0: #Updates score list every 100 iterations, and checks for convergence
            feature_scores = {}
            for f in df.columns:
                if f == 'c_cnt': continue
                feature_score = []
                for cols, score in scores:
                    if f in cols:
                        feature_score.append(score)
                feature_scores[f] = np.mean(feature_score)
            lst = []
            for feat in feature_scores:
                lst.append((feature_scores[feat], feat))
            lst = sorted(lst)
            lst.reverse()
            #check for convergence
            if i > 100:
                print(lst)
                has_conv = True
                for f in df.columns:
                    if f not in past_score or f not in feature_scores:
                        has_conv = False
                        break
                    if abs(past_score[f]-feature_scores[f])/feature_scores[f] > 0.01: has_conv = False
                if has_conv:
                    return lst
            past_score = feature_scores
            
'''
Forward Selection: Forward selection is an iterative method in which we start with having no feature in the model.
In each iteration, we keep adding the feature which best improves our model till an addition of a new variable
does not improve the performance of the model.
'''
def feature_scoring_upsample(feat_set, df, oh_thresh):
    df = df.sample(frac=1) #shuffles rows
    transformed_col, feat_set, Y = {}, set(), df['c_cnt'].values
    for col in df.columns:
        if col == 'c_cnt': continue
        if col not in numerical_features:
            transformed_col[col] = transform_column(df, col, thresh=oh_thresh)
        else:
            transformed_col[col] = df[col].values.reshape(-1,1)
    last_score = 0
    while True:
        selected = [c for c in df.columns if c in feat_set]
        best_col = None
        cur_score = 0
        for col in df.columns:
            if col == 'c_cnt': continue
            
            if col not in feat_set:
                selected.append(col)
                X = np.hstack([transformed_col[c] for c in selected])
                params, score = lr_grid_search(df, X, Y, default_start = False)
                if math.isnan(score): continue
                selected.pop()
                if score > cur_score:
                    cur_score = score
                    best_col = col
                    best_params = params
        
        print(best_col)
        print(cur_score)
        print(best_params)
        if cur_score > last_score:
            last_score = cur_score
            feat_set.add(best_col)
        else: return last_score, best_params, feat_set
        
'''
Backward Elimination: In backward elimination, we start with all the features and removes the least 
significant feature at each iteration which improves the performance of the model. We repeat this 
until no improvement is observed on removal of features.
'''
def feature_scoring_downsample(df, oh_thresh):
    df = df.sample(frac=1) #shuffles rows
    transformed_col, feat_set, Y = {}, set([c for c in df.columns if c != 'c_cnt']), df['c_cnt'].values
    for col in df.columns:
        if col == 'c_cnt': continue
        if col not in numerical_features:
            transformed_col[col] = transform_column(df, col, thresh=oh_thresh)
        else:
            transformed_col[col] = df[col].values.reshape(-1,1)
            
    X, Y, labels = get_data_matrix(df, one_hot_thresh=oh_thresh)
    last_params, last_score = lr_grid_search(df, X, Y, default_start=False)
    
    while True:
        worst_col = None
        cur_score = 0
        for col in feat_set:
            if col == 'c_cnt': continue
            X = np.hstack([transformed_col[c] for c in feat_set if c != col])
            params, score = lr_grid_search(df, X, Y, default_start=False)
            if math.isnan(score): continue
            if score > cur_score:
                bestparams = params
                cur_score = score
                worst_col = col
        print(worst_col)
        print(cur_score)
        print(bestparams)
        if cur_score > last_score:
            last_params = bestparams
            last_score = cur_score
            feat_set.remove(worst_col)
        else:
            return last_score, last_params, feat_set


Time to start using our functions.

In [None]:
#This is the model we found by hyperparameter grid search over all features. Its conf_mat is below
model = LogisticRegression(penalty='l2', C=0.1, solver='newton-cg', multi_class='ovr', max_iter=5000)
test_model(df, model, 5, 15)

In [None]:
ranking = feature_scoring_random_subsample([], df, model, 5, 25)

In [26]:
upsample_set = feature_scoring_upsample(None, df, 15)



geo_timezone
0.014188267394270123
(10, 'l1', 0.1, 'saga', 'ovr')




geo_region_name
0.016731734523145563
(5, 'l1', 0.1, 'saga', 'ovr')




i_cnt
0.016007276034561164
(5, 'l1', 0.1, 'saga', 'ovr')


In [None]:
#downsample_set will contain the set of features that we SHOULD use, this is printing every feature we remove starting
#from a full featureset, and the corresponding f1 score after its removal
downsample_set = feature_scoring_downsample(df, 15)

In [47]:
print("down", downsample_set)
print("up", upsample_set)
print("sub", ranking)

down {'zone_id', 'campaign_type', 'ua_os_name', 'token', 'r_num_ads_requested', 'ua_device', 'vi_cnt', 'geo_dma_code', 'r_num_ads_returned', 'r_cnt', 'pub_network_id', 'campaign_id', 'ua_device_type', 'i_timestamp', 'session_id', 'geo_continent_code', 'advertiser_id', 'r_timestamp', 'ad_network_id', 'creative_id', 'geo_region_name', 'ad_type', 'i_cnt', 'ua_name'}
up {'geo_region_name', 'r_timestamp', 'ad_network_id', 'geo_country_code2'}
sub [(0.01808507856592656, 'vi_cnt'), (0.017034778877041468, 'ua_device_type'), (0.015414808047934946, 'zone_id'), (0.015070605636681994, 'ua_device'), (0.015014100541612238, 'campaign_type'), (0.014662554007265311, 'pub_network_id'), (0.014261141774854916, 'geo_continent_code'), (0.014025323583435303, 'ad_type'), (0.01399589042913707, 'r_cnt'), (0.013909555396328942, 'geo_dma_code'), (0.013862394185223366, 'geo_region_name'), (0.013840340065613889, 'r_num_ads_requested'), (0.01373979182599108, 'i_cnt'), (0.013503942596251473, 'geo_timezone'), (0.01339

In [8]:
X_train_orig, Y_train_orig, X_test_orig, Y_test = generate_empirical_data(df, numerical_features)
scaler = StandardScaler()
scaler.fit(X_train_orig)
X_train = scaler.transform(X_train_orig)
X_test = scaler.transform(X_test_orig)

In [19]:
X_train, Y_train = fix_class_imbalance_with_subsampling(X_train_orig, Y_train_orig,pos_ratio=1)
print(X_train.shape)
print(X_test.shape)

(1806, 26)
(1115860, 26)


In [20]:
print(sum(Y_test))
print(sum(Y_train))
lr = LogisticRegression(penalty='l2', C=0.1, solver='newton-cg', multi_class='ovr', max_iter=5000)
lr.fit(X_train, Y_train)
train_cm = confusion_matrix(Y_train, lr.predict(X_train))
test_cm = confusion_matrix(Y_test, lr.predict(X_test))
print(train_cm)
print(test_cm)
print(get_f1_score(test_cm))

651.0
1642.0
[[ 164    0]
 [   0 1642]]
[[1115209       0]
 [    651       0]]
nan




In [32]:
X_test_ones = np.array([X_test[i] for i in range(len(X_test)) if Y_test[i] == 1])
X_train_ones = np.array([X_train[i] for i in range(len(X_train)) if Y_train[i] == 1])
print(X_test_ones.shape)
print(X_train_ones.shape)

for i in X_test[0:100]:
    print(i)
        

(656, 26)
(1637, 26)
[ 4.55085138  0.01094738  4.82286735  4.80179828  4.64949678  3.45059112
  3.489778    1.19359815  3.01985406 -0.50844335  1.11154434  0.0288747
  0.96138336  4.64949678  0.04599271  4.56317366  4.55254939  0.95575878
 -0.02509766  3.45059112  0.70932045 -0.50432088 -0.13591287  0.11447926
  2.94252212  4.95389886]
[ 0.38913751  0.01094738  0.38741855  0.23015282 -0.21507704  0.22484316
 -0.15642773 -0.11149777 -0.54676263 -0.1139568  -0.09081672  0.0288747
 -0.99684874 -0.21507704  0.04599271 -0.2177986  -0.23268629 -0.94766315
 -0.02509766  0.22484316  0.60942895  2.25919582 -0.02269706  0.11447926
 -0.33984451 -0.21878565]
[-0.35035553  0.01094738 -0.34880788 -0.34728408 -0.21507704 -0.33207117
 -0.15642773 -0.11149777 -0.29966939 -0.21924204 -0.09081672  0.0288747
 -1.053615   -0.21507704  0.04599271 -0.2177986  -0.23268629 -1.09987055
 -0.02509766 -0.33207117  0.04768277 -0.50432088 -0.13591287  0.11447926
 -0.33984451 -0.21878565]
[-0.28711737  0.01094738 -0.

In [9]:
Y = df['c_cnt'].values
X = df[[c for c in df.columns if c != 'c_cnt']].values
X_ones = np.array([X[i] for i in range(len(X)) if Y[i] == 1])
print(len(X_ones))
for i in range(len(X_ones)):
    for j in range(i+1, len(X_ones)):
        if list(X_ones[i]) == list(X_ones[j]): print(i,j)


2293


In [51]:
df = pd.read_pickle('Day1')

In [55]:
print(sum([i for i in df['c_cnt'].values if not math.isnan(i)]))
print(len([i for i in df['c_cnt'].values if not math.isnan(i)]))

2399.0
3801079


In [None]:
X, Y, labels = get_data_matrix(df, one_hot_thresh=379, corr_thresh=0.8)

ad_network_id
21
ad_type
3
advertiser_id
26
c_cnt
2
campaign_id
42
campaign_type
2
creative_id
117
geo_continent_code
6
geo_country_code2
189
geo_dma_code
211
geo_region_name
379
geo_timezone
256
i_cnt
2
i_timestamp
24
pub_network_id
2
r_cnt
2
r_num_ads_requested
6
r_num_ads_returned
8
r_timestamp
24
token
117
ua_device_type
4
ua_name
67
ua_os_name
34
vi_cnt
2
zone_id
39


In [29]:
upsample_set

(0.016731734523145563,
 (5, 'l1', 0.1, 'saga', 'ovr'),
 {'geo_region_name', 'geo_timezone'})