In [1]:
import numpy as np
import sklearn
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.preprocessing import CategoricalEncoder
#CategoricalEncoder is part of sklearn's developer version, which you can't just update with conda. If you have issues
#getting this version, try a hard code implementation of the library here - https://pastebin.com/qs1es9XE

Use the aggregated CPC 'Day1' dataset provided by Boris.

In [2]:
df = pd.read_pickle('../Day1')

Now, we need to do some data cleaning. From some initial exploratory analysis, we can see that that we have 5 features with only 16 non-nan values, with a few other features having a similarly low level of non-nan values. To simplify things, we choose to drop all features with less than some threshhold of non-nan values. Also, as we are trying to predict c_cnt, samples where c_cnt is NaN are useless, so we throw those away as well. 

After this, we see that less than 10% of our remaining samples contains any NaN values, so we just drop those samples as we don't lose that much information from them.

In [3]:
#how many non-nan values do we have?
print(df.count())
n = len(df)

#filter rows with c_cnt as NaN
df = df[np.isfinite(df['c_cnt'])]

#filter threshhold
df = df.dropna(thresh=int(0.5*n), axis=1)
#drop all samples with NaN values
df = df.dropna(axis=0)

ad_network_id            4783760
ad_type                  4783760
advertiser_id            4783760
bid_requests             4783760
bid_responses            4783760
c_cnt                    3801079
c_timestamp                 2400
c_txn_fee                   2400
c_txn_rate                  2400
campaign_id              4783760
campaign_type            4783760
cr_cnt                   4783760
creative_id              4783760
exp_mode                 1016557
f_cnt                    3801079
geo_continent_code       4783732
geo_country_code2        4783732
geo_dma_code             4783732
geo_region_name          4667981
geo_timezone             4716474
i_cnt                    3801079
i_timestamp              3799091
pub_network_id           4783760
r_cnt                    4783760
r_num_ads_requested      4783760
r_num_ads_returned       4783760
r_num_ads_third_party    4783760
r_timestamp              4783760
rate_metric              4783760
session_id               4783760
site_id   

Add the average click-to-impression ratio over the whole dataset as a feature.

In [4]:
grouped_cnts = pd.read_pickle('../grouped_cnts.pkl')
grouped_cnts['agg_c_cnt/agg_i_cnt'] = grouped_cnts['c_cnt']/grouped_cnts['i_cnt']
grouped_cnts.dropna(axis=0, inplace=True)
df = df.merge(grouped_cnts.loc[:,['pub_network_id', 'advertiser_id', 'ua_device_type', 'agg_c_cnt/agg_i_cnt']], 
              on=['pub_network_id', 'advertiser_id', 'ua_device_type'])

Now, we have some more preprocessing to do, so we wrote some simple functions for preprocessing. The most important thing we do here is that since most of our features are categorical, we must encode them with one-hot-encoding, which essentially turns one feature into n different features, one for each type of class in the original features. For example, if we had a feature for "hair color", we would map it to a higher dimensional feature space consisting of "is the hair white", "is the hair black", "is the hair brown", etc. Only one of these features would be a 1, and the rest would be 0.

Normally, each feature would be mapped to n features, with n being the number of unique classes that feature contains. For our data, however, some features will have thousands, even millions of unique classes, which would result is an omega-sparse dataset. To account for this, we set a threshhold at 200, such that n will never be greater than 201. We still keep track of the 200 most frequent classes, however, the rest will be bunched into a single class. The motivation for this is that for the more frequent classes, we have enough data that our ML models will be able to extract some information, but for the less frequent classes, there is too little data for accurate analysis, so we group them as one class.

In [5]:
#Turns a timestamp into which minute the time was at - used as a categorical feature.
def timestamp_to_min(timestamp, is_hour=True):
    if is_hour:
        return timestamp.split(':')[0][-2:]
    else: 
        return timestamp.split(':')[1]

#plots frequency of a feature's different classes, useful for exploratory analysis
def plot_freq(col_name, df):
    df_frequency = df.groupby(col_name).agg('count').sort_values('ad_type',ascending=False)
    plt.plot([i for i in range(len(df_frequency.values))], [np.log(i[2]) for i in df_frequency.values])
    plt.show()

#if a feature only has one unique value, it tells us nothing, so we drop it.
def remove_only_ones(df):
    for col in df.columns:
        if len(df[col].unique()) == 1:
            df.drop(col, inplace=True,axis=1)

#just prints how many unique values are in each feature
def print_column_counts(df):    
    for i in df:
        print(i, df[i].nunique())

#We do some final cleaning, changing all non-numerical features into strings for later.
def preprocess(df):    
    for i in df:
        if i[-1] != 't' or i[-2] != 'n' or i[-3] != 'c':
            df[i] = df[i].astype('str')
    remove_only_ones(df)
    if 'site_id' in df.columns:
        df.drop('site_id',inplace=True,axis=1)
    df['i_timestamp'] = df['i_timestamp'].apply(timestamp_to_min)
    df['r_timestamp'] = df['r_timestamp'].apply(timestamp_to_min)
    
#given a categorical column, we apply our earlier strategy of one-hot-encoding with maximum thresh=200
def transform_column(df, col, thresh=200, return_labels=False):
    print(col)
    df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
    if df[col].nunique() > thresh:
        enc = CategoricalEncoder(categories=[sorted(df_frequency[0:thresh].index.values)],handle_unknown='ignore')
        labels = df_frequency[0:thresh].index.values
    else:
        enc = CategoricalEncoder(categories=[sorted(df_frequency.index.values)],handle_unknown='ignore')
        labels = df_frequency.index.values
    labels = [str(col) + str(i) for i in labels]
    if return_labels:
        return labels
    enc.fit(df[col].values.reshape(-1, 1))
    return enc.transform(df[col].values.reshape(-1,1)).toarray()

#final preprocessing
preprocess(df)
#this set contains our numerical column names
numerical_features = set(['c_cnt', 'i_cnt', 'r_cnt', 'vi_cnt', 'agg_c_cnt/agg_i_cnt'])
#we create a copy so that X will not include 'c_cnt'
df2 = df.copy()
df2.drop('c_cnt',inplace=True,axis=1)
#u,s,v = np.linalg.svd(X)

Create our X and Y matrices - adjust threshhold values for 1HE here

In [6]:
one_hot_thresh = 20
Y = df['c_cnt'].values
labels = np.hstack([transform_column(df, col, thresh=one_hot_thresh, return_labels=True) if col not in numerical_features else [str(col)] for col in df2])
X = np.hstack([transform_column(df, col, thresh=one_hot_thresh) if col not in numerical_features else df[col].values.reshape(-1,1)
               for col in df2])

ad_network_id
ad_type
advertiser_id
campaign_id
campaign_type
creative_id
geo_continent_code
geo_country_code2
geo_dma_code
geo_region_name
geo_timezone
i_timestamp
pub_network_id
r_num_ads_requested
r_num_ads_returned
r_timestamp
session_id
token
ua_device
ua_device_type
ua_name
ua_os_name
zone_id
ad_network_id
ad_type
advertiser_id
campaign_id
campaign_type
creative_id
geo_continent_code
geo_country_code2
geo_dma_code
geo_region_name
geo_timezone
i_timestamp
pub_network_id
r_num_ads_requested
r_num_ads_returned
r_timestamp
session_id
token
ua_device
ua_device_type
ua_name
ua_os_name
zone_id


The following blocks are for hyperparameter search on different types of models. We create confusion matrices on the validation sets to judge model performance. They are a bit messy, lmk if you are confused about what something does.

In [7]:
'''
This function takes in X as the feature matrix and Y as the label matrix, and undersamples the majority class such that 
#majority class samples / #minority class samples = pos_ratio. Returns the new X and Y matrices
'''

def fix_class_imbalance_with_subsampling(X, Y, pos_ratio=9):
    Y = Y.reshape(-1,1)
    ind_1, ind_0 = [], []
    for i, y_h in enumerate(Y):
        if y_h: ind_1.append(i)
        else: ind_0.append(i)
    to_sample = np.random.permutation(pos_ratio*len(ind_1))
    to_sample_0 = [ind_0[i] for i in to_sample]
    X2 = np.vstack([X[ind_1],X[to_sample_0]])
    Y2 = np.vstack([Y[ind_1],Y[to_sample_0]])
    Y = Y.reshape(-1)
    
    new_ind = np.random.permutation(len(X2))
    return X2[new_ind],Y2[new_ind]

def bin_round(a):
    if abs(a-0) < abs(a-1): return 0
    return 1

ind_cutoff = int(0.7*len(X))
X_train, Y_train = X[0:ind_cutoff], Y[0:ind_cutoff]
X_test, Y_test = X[ind_cutoff:], Y[ind_cutoff:]

Feature scoring using the scoring heuristic of  "Score = (True Pos)^2 / (All Pos * All Predicted Pos)". To score features, randomly select each column with probability p=0.1 and then train a logistic regression model and score based on the heuristic. Randomly select and train for as many iterations as possible. The model's hyperparameters are based off of the grid search performed by Jihan.

In [16]:
import sys
import random
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In [13]:
#cache all the transformed columns since it is a costly operation.
transformed_col = {}
for col in df.columns:
    if col not in numerical_features:
        transformed_col[col] = transform_column(df, col, thresh=20)

ad_network_id
ad_type
advertiser_id
campaign_id
campaign_type
creative_id
geo_continent_code
geo_country_code2
geo_dma_code
geo_region_name
geo_timezone
i_timestamp
pub_network_id
r_num_ads_requested
r_num_ads_returned
r_timestamp
session_id
token
ua_device
ua_device_type
ua_name
ua_os_name
zone_id


In [17]:
#(pos_rat, penalty, C, solver, mult_class)
#best is (81, 'l2', 0.01, 'saga', 'ovr')
restart = False
if restart:
    p = 0.1
    iterations = 10000

    Y_selected = df['c_cnt'].values
    scores = []
    best = 0
    for j in range(iterations):
        print("Iteration: ", j)
        selected = [c for c in df2.columns if random.random() < p]
        if len(selected) < 1: continue
        X_selected = np.hstack([transformed_col[col] if col not in numerical_features else df2[col].values.reshape(-1,1)
                   for col in selected])
        ind_cutoff = int(0.7*len(X_selected))
        X_train, Y_train = X_selected[0:ind_cutoff], Y_selected[0:ind_cutoff]
        X_test, Y_test = X_selected[ind_cutoff:], Y_selected[ind_cutoff:]

        X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=81)
        Y_train = np.ravel(Y_train)

        lr = LogisticRegression(penalty='l2', C=0.01, solver='saga', multi_class='ovr', max_iter=300)
        lr.fit(X_train, Y_train)
        test_cm = confusion_matrix(Y_test, [bin_round(i) for i in lr.predict(X_test)])
        true_neg  = test_cm[0][0]
        false_pos = test_cm[0][1]
        false_neg = test_cm[1][0]
        true_pos  = test_cm[1][1]
        #    Score = (True Pos)^2 / (All Pos * All Predicted Pos)
        score = (true_pos) / ((true_pos + false_neg)*(true_neg + true_pos))
        scores.append((selected, score))
    
    print("Finished training")

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

Iteration:  519
Iteration:  520
Iteration:  521
Iteration:  522
Iteration:  523
Iteration:  524
Iteration:  525
Iteration:  526
Iteration:  527
Iteration:  528
Iteration:  529
Iteration:  530
Iteration:  531
Iteration:  532
Iteration:  533
Iteration:  534
Iteration:  535
Iteration:  536
Iteration:  537
Iteration:  538
Iteration:  539
Iteration:  540
Iteration:  541
Iteration:  542
Iteration:  543
Iteration:  544
Iteration:  545
Iteration:  546
Iteration:  547
Iteration:  548
Iteration:  549
Iteration:  550
Iteration:  551
Iteration:  552
Iteration:  553
Iteration:  554
Iteration:  555
Iteration:  556
Iteration:  557
Iteration:  558
Iteration:  559
Iteration:  560
Iteration:  561
Iteration:  562
Iteration:  563
Iteration:  564
Iteration:  565
Iteration:  566
Iteration:  567
Iteration:  568
Iteration:  569
Iteration:  570
Iteration:  571
Iteration:  572
Iteration:  573
Iteration:  574
Iteration:  575
Iteration:  576
Iteration:  577
Iteration:  578
Iteration:  579
Iteration:  580
Iteratio

Iteration:  1030
Iteration:  1031
Iteration:  1032
Iteration:  1033
Iteration:  1034
Iteration:  1035
Iteration:  1036
Iteration:  1037
Iteration:  1038
Iteration:  1039
Iteration:  1040
Iteration:  1041
Iteration:  1042
Iteration:  1043
Iteration:  1044
Iteration:  1045
Iteration:  1046
Iteration:  1047
Iteration:  1048
Iteration:  1049
Iteration:  1050
Iteration:  1051
Iteration:  1052
Iteration:  1053
Iteration:  1054
Iteration:  1055
Iteration:  1056
Iteration:  1057
Iteration:  1058
Iteration:  1059
Iteration:  1060
Iteration:  1061
Iteration:  1062
Iteration:  1063
Iteration:  1064
Iteration:  1065
Iteration:  1066
Iteration:  1067
Iteration:  1068
Iteration:  1069
Iteration:  1070
Iteration:  1071
Iteration:  1072
Iteration:  1073
Iteration:  1074
Iteration:  1075
Iteration:  1076
Iteration:  1077
Iteration:  1078
Iteration:  1079
Iteration:  1080
Iteration:  1081
Iteration:  1082
Iteration:  1083
Iteration:  1084
Iteration:  1085
Iteration:  1086
Iteration:  1087
Iteration:  10

Iteration:  1512
Iteration:  1513
Iteration:  1514
Iteration:  1515
Iteration:  1516
Iteration:  1517
Iteration:  1518
Iteration:  1519
Iteration:  1520
Iteration:  1521
Iteration:  1522
Iteration:  1523
Iteration:  1524
Iteration:  1525
Iteration:  1526
Iteration:  1527
Iteration:  1528
Iteration:  1529
Iteration:  1530
Iteration:  1531
Iteration:  1532
Iteration:  1533
Iteration:  1534
Iteration:  1535
Iteration:  1536
Iteration:  1537
Iteration:  1538
Iteration:  1539
Iteration:  1540
Iteration:  1541
Iteration:  1542
Iteration:  1543
Iteration:  1544
Iteration:  1545
Iteration:  1546
Iteration:  1547
Iteration:  1548
Iteration:  1549
Iteration:  1550
Iteration:  1551
Iteration:  1552
Iteration:  1553
Iteration:  1554
Iteration:  1555
Iteration:  1556
Iteration:  1557
Iteration:  1558
Iteration:  1559
Iteration:  1560
Iteration:  1561
Iteration:  1562
Iteration:  1563
Iteration:  1564
Iteration:  1565
Iteration:  1566
Iteration:  1567
Iteration:  1568
Iteration:  1569
Iteration:  15

Iteration:  1994
Iteration:  1995
Iteration:  1996
Iteration:  1997
Iteration:  1998
Iteration:  1999
Iteration:  2000
Iteration:  2001
Iteration:  2002
Iteration:  2003
Iteration:  2004
Iteration:  2005
Iteration:  2006
Iteration:  2007
Iteration:  2008
Iteration:  2009
Iteration:  2010
Iteration:  2011
Iteration:  2012
Iteration:  2013
Iteration:  2014
Iteration:  2015
Iteration:  2016
Iteration:  2017
Iteration:  2018
Iteration:  2019
Iteration:  2020
Iteration:  2021
Iteration:  2022
Iteration:  2023
Iteration:  2024
Iteration:  2025
Iteration:  2026
Iteration:  2027
Iteration:  2028
Iteration:  2029
Iteration:  2030
Iteration:  2031
Iteration:  2032
Iteration:  2033
Iteration:  2034
Iteration:  2035
Iteration:  2036
Iteration:  2037
Iteration:  2038
Iteration:  2039
Iteration:  2040
Iteration:  2041
Iteration:  2042
Iteration:  2043
Iteration:  2044
Iteration:  2045
Iteration:  2046
Iteration:  2047
Iteration:  2048
Iteration:  2049
Iteration:  2050
Iteration:  2051
Iteration:  20

Iteration:  2476
Iteration:  2477
Iteration:  2478
Iteration:  2479
Iteration:  2480
Iteration:  2481
Iteration:  2482
Iteration:  2483
Iteration:  2484
Iteration:  2485
Iteration:  2486
Iteration:  2487
Iteration:  2488
Iteration:  2489
Iteration:  2490
Iteration:  2491
Iteration:  2492
Iteration:  2493
Iteration:  2494
Iteration:  2495
Iteration:  2496
Iteration:  2497
Iteration:  2498
Iteration:  2499
Iteration:  2500
Iteration:  2501
Iteration:  2502
Iteration:  2503
Iteration:  2504
Iteration:  2505
Iteration:  2506
Iteration:  2507
Iteration:  2508
Iteration:  2509
Iteration:  2510
Iteration:  2511
Iteration:  2512
Iteration:  2513
Iteration:  2514
Iteration:  2515
Iteration:  2516
Iteration:  2517
Iteration:  2518
Iteration:  2519
Iteration:  2520
Iteration:  2521
Iteration:  2522
Iteration:  2523
Iteration:  2524
Iteration:  2525
Iteration:  2526
Iteration:  2527
Iteration:  2528
Iteration:  2529
Iteration:  2530
Iteration:  2531
Iteration:  2532
Iteration:  2533
Iteration:  25

KeyboardInterrupt: 

Give each feature a final score, which is the average of all the model scores in the previous block.

In [33]:
#    Feature Score = Average(sum of all subset scores that feature is in) 
feature_scores = {}
for f in df2.columns:
    feature_score = []
    for cols, score in scores:
        if f in cols:
            feature_score.append(score)
    feature_scores[f] = np.mean(feature_score)
lst = []
for feat in feature_scores:
    lst.append((feature_scores[feat], feat))
for i in sorted(lst):
    print(i)

(6.8354570078016157e-07, 'r_num_ads_returned')
(8.1574440826361118e-07, 'zone_id')
(9.3247541885963875e-07, 'ua_device')
(1.0887093454203212e-06, 'ua_device_type')
(1.1291875085496769e-06, 'vi_cnt')
(1.5032535954484097e-06, 'r_num_ads_requested')
(1.5407103767981857e-06, 'campaign_type')
(1.5450622676280777e-06, 'session_id')
(1.7775540614479142e-06, 'ua_name')
(1.9054381356808588e-06, 'ua_os_name')
(1.968346967966665e-06, 'geo_continent_code')
(2.0589619671386414e-06, 'pub_network_id')
(2.0612456406555123e-06, 'geo_timezone')
(2.1515520712218517e-06, 'ad_type')
(2.3971685314422626e-06, 'geo_country_code2')
(2.6880463519754593e-06, 'geo_dma_code')
(2.7836341630662488e-06, 'geo_region_name')
(6.7838562693651805e-06, 'r_cnt')
(7.4469072649934018e-06, 'agg_c_cnt/agg_i_cnt')
(9.9346810038048869e-06, 'r_timestamp')
(1.0715581409731845e-05, 'i_timestamp')
(1.9602500072702838e-05, 'ad_network_id')
(1.9824895391545093e-05, 'token')
(2.2322774746816733e-05, 'creative_id')
(3.7195260562061638e-0

Use the top ten highest scoring features and train a LogReg model only with those features. Recall is pretty high yet precision is very low, which isn't very surprising considering the huge class imbalance for clicks.

In [43]:
topten_features = [x[1] for x in sorted(lst, reverse=True)]

X_topten = np.hstack([transformed_col[col] if col not in numerical_features else df2[col].values.reshape(-1,1)
           for col in topten_features])
Y_topten = df['c_cnt'].values

ind_cutoff = int(0.7*len(X_topten))
X_train, Y_train = X_topten[0:ind_cutoff], Y_topten[0:ind_cutoff]
X_test, Y_test = X_topten[ind_cutoff:], Y_topten[ind_cutoff:]

X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=81)
Y_train = np.ravel(Y_train)

lr = LogisticRegression(penalty='l2', C=0.01, solver='saga', multi_class='ovr', max_iter=300)
lr.fit(X_train, Y_train)
test_cm = confusion_matrix(Y_test, [bin_round(i) for i in lr.predict(X_test)])
true_neg  = test_cm[0][0]
false_pos = test_cm[0][1]
false_neg = test_cm[1][0]
true_pos  = test_cm[1][1]
prec = true_pos/(true_pos + false_pos) 
rec = true_pos/(true_pos + false_neg)
print("Precision: ", prec)
print("Recall: ", rec)

Precision:  0.0012542431918
Recall:  0.915273132664


Here, we use a different scoring heuristic of 'Score = (Precision * Recall)/(Precision + Recall) as suggested by Kush. Using the same setup as before, we randomly select a features with probability p=0.1 per iteration and train a LogReg model with the same hyperparameters as before for as many iterations as possible.

In [22]:
def new_score(y_test, y_pred):
    test_cm = confusion_matrix(y_test, y_pred)
    true_neg  = test_cm[0][0]
    false_pos = test_cm[0][1]
    false_neg = test_cm[1][0]
    true_pos  = test_cm[1][1]
    prec = true_pos/(true_pos + false_pos) 
    rec = true_pos/(true_pos + false_neg)
    return (prec * rec) / (prec + rec)

p = 0.1
iterations = 1000

Y_selected = df['c_cnt'].values
scores2 = []
best2 = 0
for j in range(iterations):
    print("Iteration: ", j)
    selected = [c for c in df2.columns if random.random() < p]
    if len(selected) < 1: continue
    X_selected = np.hstack([transformed_col[col] if col not in numerical_features else df2[col].values.reshape(-1,1)
               for col in selected])
    ind_cutoff = int(0.7*len(X_selected))
    X_train, Y_train = X_selected[0:ind_cutoff], Y_selected[0:ind_cutoff]
    X_test, Y_test = X_selected[ind_cutoff:], Y_selected[ind_cutoff:]

    X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=81)
    Y_train = np.ravel(Y_train)

    lr = LogisticRegression(penalty='l2', C=0.01, solver='saga', multi_class='ovr', max_iter=300)
    lr.fit(X_train, Y_train)
    score = new_score(Y_test, [bin_round(i) for i in lr.predict(X_test)])
    scores2.append((selected, score))
    
print("Finished training")

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25


  import sys


Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31


  import sys


Iteration:  32
Iteration:  33
Iteration:  34


  import sys


Iteration:  35
Iteration:  36
Iteration:  37


  import sys


Iteration:  38


  import sys


Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48


  import sys


Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61


  import sys


Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66


  import sys


Iteration:  67
Iteration:  68


  import sys


Iteration:  69
Iteration:  70
Iteration:  71
Iteration:  72
Iteration:  73
Iteration:  74
Iteration:  75
Iteration:  76


  import sys


Iteration:  77


  import sys


Iteration:  78
Iteration:  79
Iteration:  80


  import sys


Iteration:  81
Iteration:  82
Iteration:  83
Iteration:  84
Iteration:  85


  import sys


Iteration:  86


  import sys


Iteration:  87
Iteration:  88


  import sys


Iteration:  89
Iteration:  90
Iteration:  91
Iteration:  92
Iteration:  93
Iteration:  94
Iteration:  95
Iteration:  96
Iteration:  97
Iteration:  98
Iteration:  99
Iteration:  100
Iteration:  101
Iteration:  102
Iteration:  103
Iteration:  104
Iteration:  105
Iteration:  106


  import sys


Iteration:  107
Iteration:  108


  import sys


Iteration:  109


  import sys


Iteration:  110
Iteration:  111
Iteration:  112
Iteration:  113


  import sys


Iteration:  114
Iteration:  115
Iteration:  116
Iteration:  117
Iteration:  118


  import sys


Iteration:  119
Iteration:  120
Iteration:  121
Iteration:  122


  import sys


Iteration:  123
Iteration:  124
Iteration:  125
Iteration:  126
Iteration:  127
Iteration:  128
Iteration:  129
Iteration:  130
Iteration:  131
Iteration:  132


  import sys


Iteration:  133
Iteration:  134
Iteration:  135


  import sys


Iteration:  136
Iteration:  137
Iteration:  138
Iteration:  139
Iteration:  140
Iteration:  141
Iteration:  142
Iteration:  143


  import sys


Iteration:  144
Iteration:  145
Iteration:  146
Iteration:  147
Iteration:  148


  import sys


Iteration:  149


  import sys


Iteration:  150
Iteration:  151
Iteration:  152
Iteration:  153
Iteration:  154
Iteration:  155
Iteration:  156
Iteration:  157
Iteration:  158
Iteration:  159
Iteration:  160
Iteration:  161
Iteration:  162
Iteration:  163
Iteration:  164


  import sys


Iteration:  165


  import sys


Iteration:  166
Iteration:  167
Iteration:  168


  import sys


Iteration:  169
Iteration:  170
Iteration:  171
Iteration:  172
Iteration:  173
Iteration:  174
Iteration:  175
Iteration:  176
Iteration:  177
Iteration:  178
Iteration:  179
Iteration:  180
Iteration:  181
Iteration:  182


  import sys


Iteration:  183
Iteration:  184
Iteration:  185
Iteration:  186
Iteration:  187
Iteration:  188
Iteration:  189
Iteration:  190


  import sys


Iteration:  191
Iteration:  192
Iteration:  193


KeyboardInterrupt: 

Give features a final score which is the mean of all the model scores.

In [28]:
feature_scores2 = {}
for f in df2.columns:
    feature_score = []
    for cols, score in scores2:
        if f in cols and 0 <= score and score <= 1:
            feature_score.append(score)
    feature_scores2[f] = np.mean(feature_score)
lst2 = []
for feat in feature_scores2:
    lst2.append((feature_scores2[feat], feat))
for i in sorted(lst2):
    print(i)

(0.0018970275304459082, 'ua_device_type')
(0.0019203160685655177, 'campaign_id')
(0.0021116437087416555, 'advertiser_id')
(0.0022228937859302056, 'ad_network_id')
(0.0024872861582317658, 'ua_os_name')
(0.0030294936160147269, 'token')
(0.0033702687517666997, 'i_timestamp')
(0.0034292226190454581, 'geo_dma_code')
(0.0037757504036322526, 'agg_c_cnt/agg_i_cnt')
(0.0039635411668275241, 'ua_name')
(0.0040461211764768537, 'creative_id')
(0.0041423885637902509, 'ad_type')
(0.0041703536152968013, 'ua_device')
(0.0041887190934237328, 'r_timestamp')
(0.004387508759515827, 'geo_region_name')
(0.0046728516954340389, 'geo_timezone')
(0.0047364008992809951, 'vi_cnt')
(0.0051421053425468053, 'r_cnt')
(0.0051439974165851674, 'session_id')
(0.0053037223206388491, 'geo_country_code2')
(0.0053897951919575238, 'geo_continent_code')
(0.0055417330577047178, 'pub_network_id')
(0.0057209888390248771, 'r_num_ads_returned')
(0.0057673349606050138, 'r_num_ads_requested')
(0.005863657161130951, 'campaign_type')
(0

Select the top ten features and train a LogReg model only on those features.

In [29]:
topten_features = [x[1] for x in sorted(lst2, reverse=True)]

X_topten = np.hstack([transformed_col[col] if col not in numerical_features else df2[col].values.reshape(-1,1)
           for col in topten_features])
Y_topten = df['c_cnt'].values

ind_cutoff = int(0.7*len(X_topten))
X_train, Y_train = X_topten[0:ind_cutoff], Y_topten[0:ind_cutoff]
X_test, Y_test = X_topten[ind_cutoff:], Y_topten[ind_cutoff:]

X_train, Y_train = fix_class_imbalance_with_subsampling(X_train, Y_train,pos_ratio=81)
Y_train = np.ravel(Y_train)

lr = LogisticRegression(penalty='l2', C=0.01, solver='saga', multi_class='ovr', max_iter=300)
lr.fit(X_train, Y_train)
test_cm = confusion_matrix(Y_test, [bin_round(i) for i in lr.predict(X_test)])
true_neg  = test_cm[0][0]
false_pos = test_cm[0][1]
false_neg = test_cm[1][0]
true_pos  = test_cm[1][1]
prec = true_pos/(true_pos + false_pos) 
rec = true_pos/(true_pos + false_neg)
print("Precision: ", prec)
print("Recall: ", rec)

Precision:  0.0012542431918
Recall:  0.915273132664
