In [1]:
import numpy as np
import pandas as pd
import pyprind
from skfeature.function.information_theoretical_based import FCBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold, train_test_split,cross_val_score
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder




In [2]:
df_clicks = pd.read_csv('dataset/dataset_clicks.csv')
df_clicks.shape

(1869, 183)

In [3]:
df_clicks.head()

Unnamed: 0.1,Unnamed: 0,transaction_id,clicks,click_conversions,click_conversions_type00,click_conversions_type10,click_conversions_type20,click_conversions_type30,click_conversions_type40,day_id,...,day_spend_flag,spend_hours,hour_spend_flag,conversions,keyword_with_data_provider_id,keyword_with_data_provider,dtime,N,financial_reward,state
0,0,dalbid10:9001-1445835989077-80358478,1,1,0,0,0,0,1,5777,...,111111111111111111111111,24,1,1.0,954621705096990,"click_adx_product_category_id10004,data provid...",2015-10-26 00:00:00,1,0.001,1
1,1,dalbid10:9005-1445683160227-105266,1,0,0,0,0,0,0,5775,...,000001111111111111111111,19,1,0.0,27389734586424232,"!squash,data provider (32)",2015-10-24 05:00:00,1,0.0,1
2,2,dalbid10:9005-1446305222448-17044177,1,0,0,0,0,0,0,5782,...,000000011111111111111111,17,1,0.0,22649460937621478,"ticket,data provider (78)",2015-10-31 10:00:00,1,0.0,1
3,3,dalbid10:9010-1446633477424-111930266,1,0,0,0,0,0,0,5786,...,111111111111111111111111,24,1,0.0,203638391889945186,"!breakfast,data provider (186)",2015-11-04 04:00:00,1,0.0,1
4,4,dalbid10:9011-1445964829909-18114437,1,0,0,0,0,0,0,5778,...,000000000111111111111111,15,1,0.0,249295749573107186,"beans,data provider (186)",2015-10-27 11:00:00,1,0.0,1


## Prepare dataset

This dataset has nearly 10 million rows and 182 columns. There is not much information on what each column means other than its name. The target variable 'clicks' has only 1869 rows with value of one, and the rest is zero. I chose to randomly sample 100,000 rows out of every 1 million rows as df_zero, and concat with df_clicks for modeling.

To prepare dataframe for modeling, first drop all columns with null values. Separate columns into numerical variables (num_var) and categorical variables (cat_var). From preliminary modeling, I noticed both num_var and cat_var have some columns with many unique values and/or has 'id' in column names, indicating they could be unique identifiers. I chose to throw out such columns in cat_var with >100 unique values, and columns in num_var with >100 unique values and has 'id' in its name.

In [4]:
def prepare_df(df_clicks,df_zero):
    df = pd.concat([df_clicks,df_zero],axis=0,ignore_index=True)
    #remove columns with null
    df = df.dropna(axis=1)
    print df.shape

    #separate numerical and categorical variables
    cols_total = df.columns
    num_var = []
    cat_var = []
    for col in cols_total:
        if df[col].dtype =='object':
            cat_var.append(col)
        else:
            num_var.append(col)
    print len(num_var), len(cat_var)

    #check unique values in cat_var
    for cat in cat_var:
        print cat, len(df[cat].unique())

    #threw away columns with more than 100 unique values
    cat_var2=[]
    for cat in cat_var:
        if len(df[cat].unique())<100:
            cat_var2.append(cat)
    print len(cat_var2),len(cat_var)

    #drop columns not in cat_Var2 and convert the rest with LabelEncoder()
    df_new = df.copy()
    for cat in cat_var:
        if cat in cat_var2:
            le = LabelEncoder()
            unq_vals = df_new[cat].unique()
            le.fit(unq_vals)
            df_new[cat] = le.transform(df_new[cat])
        else:
            df_new.drop(cat,axis=1,inplace=True)

    #drop numerical variables with 'id' in name and >100 unique values
    for col in num_var:
        if 'id' in col and len(df_new[col].unique())>100:
            df_new.drop(col,axis=1,inplace=True)
            print col

    print df_new.shape
    return df_new

In [8]:
#sample 1/10 or 100,000 for the first 1 million rows

num = 500000
temp = pd.read_csv('dataset/dataset.csv',skiprows=500000,\
                   nrows=num, names=df_clicks.columns,low_memory=False)
print temp.shape
df_zero1 = temp[temp['clicks']==0].sample(100000)
print df_zero1.shape

(500000, 183)
(100000, 183)


In [9]:
df_new1 = prepare_df(df_clicks,df_zero1)

(101869, 105)
52 53
Unnamed: 0 101869
transaction_id 1871
city_id 3737
postal_code 1034
geo_fence_recency_id 12
viewability 13
view_conversions_type40 32
date 38
wday 14
tday_id 4
tday 19
exchange 16
exchange_margin 51
campaign 79
model_enabled 4
campaign_status 13
platform 22
platform_id 28
company 33
company_id 36
client 38
client_id 39
advertiser 40
advertiser_id 20
cost_type 2
cost_type_id 2
spend_type 5
recency_config 8
recency_config_id 5
adx_category 2
adx_category_id 2
adx_product 2
adx_product_category_id 2
adx_product_category 2
openx_category 2
sifi_product_category_id 2
sifi_product_category 2
campaign_type 2
spend_vs_perf 2
domain 422
data_provider 47
is_advertiser 177
ad 141
ad_file_type 10
ad_position 4
segment 7
browser 20
os 18
day_spend_flag 13
conversions 7268
keyword_with_data_provider_id 8008
keyword_with_data_provider 1465
dtime 571
42 53
campaign_id
ad_id
domain_id
keyword_id
context_id
exchange_id
segment_id
mobile_app_id
(101869, 86)


## Feature Selection with skfeature

Use skfeature.FCBF for feature selection. This is a fast correlation-based filter solution, suitable for high-dimension data. 
Split data into 5 fold, run FCBF to select features, model with randomforest and get average of roc_auc_score as metrics. To my surprise, with less than 5 features, the roc_auc_score is already 1.0.

Randomly sample another df_zero from row number 1 million to 2 million and run the model again. Got similar features and perfect score.

Randomly sample another df_zero from row number 2 million to 3 million and run one of the model to get test score. Perfect score.

In [10]:
def test_FCBF(df):
    df_X = df.drop('clicks',axis=1)
    X = df_X.values    # data
    y = df['clicks']    # label
    n_samples, n_features = X.shape
    # split data into 5 folds
    ss = KFold(n_samples, n_folds=5, shuffle=True)

    # perform evaluation on classification task
    num_fea = 20    # number of selected features
    clf = RandomForestClassifier(n_estimators=100,n_jobs=-1)

    score = 0.
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx = FCBF.fcbf(X[train], y[train], n_selected_features=num_fea)
        print idx
        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]
        features_names = df_X.columns[idx]

        clf.fit(features[train], y[train])
        #print clf.feature_importances_, cannot use df because dropped clicks
        feature_importance = [format(x,'.2f') for x in clf.feature_importances_]
        zip_feature = zip(features_names, feature_importance)
        print sorted(zip_feature, key = lambda x: x[1], reverse=True)

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        score+= metrics.roc_auc_score(y[test], y_predict)

    # output the average classification accuracy over all 5 folds
    print '\nROC_AUC_Score:', float(score)/5

In [11]:
test_FCBF(df_new1)

[21 78 83 35]
[('ad_position', '0.40'), ('spend_hours', '0.34'), ('imps', '0.26'), ('wday_id', '0.00')]
[21 78 56 83 60 14]
[('recency_config', '0.23'), ('imps', '0.20'), ('advertiser_id', '0.20'), ('spend_hours', '0.20'), ('ad_position', '0.17'), ('keyword_type_id', '0.00')]
[21 78 83 81]
[('ad_position', '0.37'), ('imps', '0.33'), ('spend_hours', '0.30'), ('os', '0.00')]
[21 83 20 77]
[('imps', '0.29'), ('spend', '0.26'), ('ad_file_type', '0.24'), ('spend_hours', '0.21')]
[21 38 17]
[('imps', '0.38'), ('viewability', '0.34'), ('exchange', '0.28')]

ROC_AUC_Score: 1.0


In [12]:
#sample 1/10 or 100,000 for row number after 1 million

num = 500000
temp = pd.read_csv('dataset/dataset.csv',skiprows=1000000,\
                   nrows=num, names=df_clicks.columns,low_memory=False)
print temp.shape
df_zero2 = temp[temp['clicks']==0].sample(100000)
print df_zero2.shape

(500000, 183)
(100000, 183)


In [13]:
df_new2 = prepare_df(df_clicks,df_zero2)

(101869, 105)
52 53
Unnamed: 0 101869
transaction_id 1871
city_id 3762
postal_code 1034
geo_fence_recency_id 12
viewability 13
view_conversions_type40 32
date 38
wday 14
tday_id 4
tday 17
exchange 16
exchange_margin 51
campaign 79
model_enabled 4
campaign_status 13
platform 22
platform_id 28
company 33
company_id 36
client 38
client_id 39
advertiser 40
advertiser_id 20
cost_type 2
cost_type_id 2
spend_type 5
recency_config 8
recency_config_id 5
adx_category 2
adx_category_id 2
adx_product 2
adx_product_category_id 2
adx_product_category 2
openx_category 2
sifi_product_category_id 2
sifi_product_category 2
campaign_type 2
spend_vs_perf 2
domain 409
data_provider 47
is_advertiser 179
ad 141
ad_file_type 10
ad_position 4
segment 7
browser 19
os 18
day_spend_flag 13
conversions 7245
keyword_with_data_provider_id 7985
keyword_with_data_provider 1465
dtime 572
42 53
campaign_id
ad_id
domain_id
keyword_id
context_id
exchange_id
segment_id
mobile_app_id
(101869, 86)


In [14]:
test_FCBF(df_new2)

[21 20 48  5]
[('click_conversions_type40', '0.34'), ('campaign_status', '0.25'), ('imps', '0.24'), ('spend', '0.17')]
[21 61 39]
[('imps', '0.35'), ('recency_config_id', '0.34'), ('exchange_margin', '0.31')]
[21 78 14 35]
[('ad_position', '0.43'), ('imps', '0.40'), ('wday_id', '0.16'), ('keyword_type_id', '0.00')]
[21 56 83]
[('spend_hours', '0.35'), ('imps', '0.34'), ('advertiser_id', '0.31')]
[21 83 77 79]
[('ad_file_type', '0.34'), ('spend_hours', '0.25'), ('segment', '0.23'), ('imps', '0.18')]

ROC_AUC_Score: 1.0


In [15]:
#sample 1/10 or 100,000 for row number after 2 million as test set

num = 500000
temp = pd.read_csv('dataset/dataset.csv',skiprows=2000000,\
                   nrows=num, names=df_clicks.columns,low_memory=False)
print temp.shape
df_zero3 = temp[temp['clicks']==0].sample(100000)
print df_zero3.shape

(500000, 183)
(100000, 183)


In [17]:
df_new3 = prepare_df(df_clicks,df_zero3)

(101869, 105)
52 53
Unnamed: 0 101869
transaction_id 1871
city_id 3750
postal_code 1034
geo_fence_recency_id 12
viewability 13
view_conversions_type40 32
date 38
wday 14
tday_id 4
tday 18
exchange 16
exchange_margin 51
campaign 79
model_enabled 4
campaign_status 13
platform 22
platform_id 28
company 33
company_id 36
client 38
client_id 39
advertiser 40
advertiser_id 20
cost_type 2
cost_type_id 2
spend_type 5
recency_config 8
recency_config_id 5
adx_category 2
adx_category_id 2
adx_product 2
adx_product_category_id 2
adx_product_category 2
openx_category 2
sifi_product_category_id 2
sifi_product_category 2
campaign_type 2
spend_vs_perf 2
domain 414
data_provider 47
is_advertiser 177
ad 141
ad_file_type 10
ad_position 4
segment 7
browser 19
os 18
day_spend_flag 13
conversions 7170
keyword_with_data_provider_id 7910
keyword_with_data_provider 1465
dtime 572
42 53
campaign_id
ad_id
domain_id
keyword_id
context_id
exchange_id
segment_id
mobile_app_id
(101869, 86)


In [16]:
#pick a simple three-feature model from df_new2
cols_use = ['spend_hours','imps','advertiser_id']
X_train = df_new2[cols_use].values
y_train = df_new2['clicks']
clf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
X_test = df_new3[cols_use].values
y_test = df_new3['clicks']
y_pred = clf.predict(X_test)

In [19]:
print metrics.roc_auc_score(y_test,y_pred)
print len(y_test==y_pred),len(y_test)

1.0
101869 101869
