In [52]:
#packages import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
import pickle
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('always')  #"error", "ignore", "always", "default", "module" or "once"
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import itertools
import random

In [40]:
#dataset import
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")
train_data.reset_index(drop=True, inplace=True)
validation_data = pd.read_csv("validation.csv")
validation_data.reset_index(drop=True, inplace=True)

In [41]:
#get imps
train_data['imps']=0
train_data.loc[(train_data.bidprice>train_data.payprice),'imps'] = 1
validation_data['imps']=0
validation_data.loc[(validation_data.bidprice>validation_data.payprice),'imps'] = 1
#get os and browser
train_data['os']=train_data.useragent.apply(lambda x: x.split('_')[0])
train_data['browser']=train_data.useragent.apply(lambda x: x.split('_')[1])
#get slot size
train_data['slotsize']=train_data['slotwidth']*train_data['slotheight']

In [42]:
#negative downsampling on training dataset
from sklearn.utils import resample
# Separate majority and minority classes
train_data_majority = train_data[train_data.click==0]
train_data_minority = train_data[train_data.click==1]
# Downsample majority class
df_majority_downsampled = resample(train_data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=20000,     # to match minority class
                                 random_state=123) # reproducible results
# Combine minority class with downsampled majority class
train_downsampled = pd.concat([df_majority_downsampled, train_data_minority])
train_downsampled.reset_index(drop=True, inplace=True)



# Data Cleaning

In [43]:
#Further data cleaning and feature engineering process include removing case specific features, 
#creating frequency and ctr features for all categorical feature, and keeping the continuous data                
def CleanDF(df):
    #remove highly case-specific features:bidid, userid,IP,domain,url,urlid,slotid
    df.drop(['bidid', 'userid', 'IP', 'domain', 'url', 'urlid', 
             'slotid'], axis = 1, inplace = True)
    #create frequency and ctr feature for categorical features: 
    #weekday, hour, os, browser,region,city,adexchange,slotvisibility, creative,keypage,advertiser, usertag, slotformat
    

    #extract os and browser info 
    df['os']=df.useragent.apply(lambda x: x.split('_')[0])
    df['browser']=df.useragent.apply(lambda x: x.split('_')[1])
    #get slot size
    df['slotsize']=df['slotwidth']*df['slotheight']
    

    
    #create frequency features
    def get_freq(df,column):
        item_list=train_data[column].unique()
        freq_list=[]
        for item in item_list:
            freq=len(train_data[train_data[column]==item])/len(train_data)
            freq_list.append(freq)
        fre_col=str(column)+'_freq'
        def corresponding_value(x):
            for i in range(len(item_list)):
                if x==item_list[i]:
                    return freq_list[i]
        df[fre_col]=df[column].apply(corresponding_value)
    get_freq(df,'weekday')
    get_freq(df,'hour')
    get_freq(df,'os')
    get_freq(df,'browser')
    get_freq(df,'region')
    get_freq(df,'city')
    get_freq(df,'adexchange')
    get_freq(df,'slotvisibility')
    get_freq(df,'slotformat')
    get_freq(df,'creative')
    get_freq(df,'keypage')
    get_freq(df,'advertiser')

    
    #create ctr features
    def get_ctr(df,column):
        item_list=train_data[column].unique()
        ctr_list=[]
        for item in item_list:
            if sum(train_data[train_data[column]==item]['imps'])!=0:
                ctr=sum(train_data[train_data[column]==item]['click'])/sum(train_data[train_data[column]==item]['imps'])
            else:
                ctr=0
            ctr_list.append(ctr)
        ctr_col=str(column)+'_ctr'
        def corresponding_value(x):
            for i in range(len(item_list)):
                if x==item_list[i]:
                    return ctr_list[i]
        df[ctr_col]=df[column].apply(corresponding_value)  
    get_ctr(df,'weekday')
    get_ctr(df,'hour')
    get_ctr(df,'os')
    get_ctr(df,'browser')
    get_ctr(df,'region')
    get_ctr(df,'city')
    get_ctr(df,'adexchange')
    get_ctr(df,'slotvisibility')
    get_ctr(df,'slotformat')
    get_ctr(df,'creative')
    get_ctr(df,'keypage')
    get_ctr(df,'advertiser')    

    return df



In [59]:
def get_usertag():
    train_data['usertag']=train_data['usertag'].fillna('0')
    cate=train_data['usertag'].str.split(',')
    train_data['usertag_split']=cate.apply(lambda x: [int(item) for item in x])
    usertag_list=[train_data['usertag'][i].split(",") for i in range(df.shape[0])]
    unique_list=[]
    unique_list=np.unique(list(itertools.chain.from_iterable(usertag_list))) 
    unique_list=[int(item) for item in unique_list]
    return unique_list

def UR_data(df):
    def get_freq_usertag(df):
        #usertag data cleaning
        df['usertag']=df['usertag'].fillna('0')
        cate=df['usertag'].str.split(',')
        df['usertag_split']=cate.apply(lambda x: [int(item) for item in x])
        #get list of all unique usertag
        column='usertag_split'
        #get_freq
        item_list=get_usertag()
        freq_list=[]
        for item in item_list:
            freq=len(train_data[[item in row for row in train_data[column]]])/len(train_data)
            freq_list.append(freq)
        fre_col=str(column)+'_freq'
        def corresponding_value(x):
            for j in x:
                row_value=0
                for i in range(len(item_list)):
                    if item_list[i] in x:
                         row_value+=freq_list[i]
            return row_value
        df[fre_col]=df[column].apply(corresponding_value)  
    get_freq_usertag(df)
    def get_ctr_usertag(df):
        #usertag data cleaning
        df['usertag']=df['usertag'].fillna('0')
        cate=df['usertag'].str.split(',')
        df['usertag_split']=cate.apply(lambda x: [int(item) for item in x])
        df['usertag_split']=cate.apply(lambda x: [int(item) for item in x])
        column='usertag_split'
        #get list of all unique usertag 
        column='usertag_split'
        #get_ctr
        item_list=get_usertag()
        ctr_list=[]
        for item in item_list:
            c=sum(train_data[[item in row for row in train_data[column]]].click)
            im=sum(train_data[[item in row for row in train_data[column]]].imps)
            if im!=0:
                ctr=c/im
            else:
                ctr=0
            ctr_list.append(ctr)
            col_name = "usertag_ctr" + str(item)
            df[col_name] = df["usertag"].map(lambda x: ctr if item in x.split(",") else 0) 
    get_ctr_usertag(df)
    return df

In [67]:
#generate binary features
def UsertagCategories(df):
    
    # Drop nan
    df = df["usertag"].dropna().reset_index(drop = True)
    
    # Find unique usertags
    usertags_list = [df[i].split(",") for i in range(df.shape[0])]
    
    # itertools.chain.from_iterable joins a list of lists into a single list
    usertags = np.unique(list(itertools.chain.from_iterable(usertags_list)))
    
    # Remove the empty string ""
    usertags = [tag for tag in usertags if len(tag) > 0]
    
    return usertags

def FeatureEngineering(df): 

    # Usertags
    usertags = UsertagCategories(df)
    for tag in usertags:
        col_name = "usertag_" + tag
        df[col_name] = df["usertag"].map(lambda x: 1 if tag in x.split(",") else 0)

    # Slotprice binning
    df["slotprice_cat"] = 0

    df.loc[ df["slotprice"] <= 10, "slotprice_cat"] = 0
    df.loc[ (df["slotprice"] > 10) & (df["slotprice"] <= 50), "slotprice_cat"] = 1
    df.loc[ (df["slotprice"] > 50) & (df["slotprice"] <= 100), "slotprice_cat"] = 2
    df.loc[ df["slotprice"] > 100, "slotprice_cat"] = 3



    # Convert numerical to categorical
    df["weekday_cat"] = df["weekday"].map(lambda x: str(x))
    df["hour_cat"] = df["hour"].map(lambda x: str(x))
    df["region_cat"] = df["region"].map(lambda x: str(x))
    df["city_cat"] = df["city"].map(lambda x: str(x))
    df["adexchange_cat"] = df["adexchange"].map(lambda x: str(x))
    df["advertiser_cat"] = df["advertiser"].map(lambda x: str(x))

    # Operating system
    df["os"] = df["useragent"].map(lambda x: x.split("_")[0])

    # Browser
    df["browser"] = df["useragent"].map(lambda x: x.split("_")[1])
    cols = ["weekday_cat", "hour_cat", "hour_cat", "region_cat", "city_cat", "adexchange_cat", "advertiser_cat","slotprice_cat"]
    df_dum=pd.concat([pd.get_dummies(df[col]) for col in cols], axis=1, keys=df.columns)
    df=pd.concat([df,df_dum],axis=1)

    return df

In [76]:
def drop_features(df):
    columns = ['creative','slotformat','slotvisibility',"useragent", "weekday", "keypage", "usertag", "region", "city", "adexchange", "advertiser",'hour','os','browser','usertag_split','weekday_cat','slotprice_cat','hour_cat','region_cat','city_cat','adexchange_cat','advertiser_cat']
    df.drop(columns, axis = 1, inplace = True)
    df=df.fillna(0)
    return df

In [45]:
#implement
validation_features_v1=CleanDF(validation_data)
train_features_v1=CleanDF(train_downsampled)
test_features_v1=CleanDF(test_data)

In [60]:
validation_features_v2=UR_data(validation_features_v1)
train_features_v2=UR_data(train_features_v1)
test_features_v2=UR_data(test_features_v1)

In [68]:
validation_features_v3=FeatureEngineering(validation_features_v2)
train_features_v3=FeatureEngineering(train_features_v2)
test_features_v3=FeatureEngineering(test_features_v1)

In [77]:
validation_features_v4=drop_features(validation_features_v3)
train_features_v4=drop_features(train_features_v3)
test_features_v4=drop_features(test_features_v3)

# Feature Selection for different pCTR

In [78]:
X_train_X = train_features_v4.loc[:, ((train_features_v4.columns != "click") & 
                      (train_features_v4.columns != "payprice") &
                      (train_features_v4.columns != "bidprice")&
                      (train_features_v4.columns != "imps"))]
y_train_X = train_features_v4["click"]

X_validation_X = validation_features_v3.loc[:, ((validation_features_v4.columns != "click") & 
                                        (validation_features_v4.columns != "payprice") & 
                                        (validation_features_v4.columns != "bidprice")&
                                        (validation_features_v4.columns != "imps"))]
y_validation_X = validation_features_v3["click"]

X_test_X=test_features_v4

In [90]:
#save the down

#Pickled Train
pickle.dump(X_train_X, open("X_train_features_X.pkl", 'wb'))
pickle.dump(y_train_X, open("y_train_features_X.pkl", 'wb'))

#Pickled Validation
pickle.dump(X_validation_X, open("X_validation_features_X.pkl", 'wb'))
pickle.dump(y_validation_X, open("y_validation_features_X.pkl", 'wb'))

#Pickled Test
pickle.dump(X_test_X, open("X_test_features_X.pkl", 'wb'))

  after removing the cwd from sys.path.
  """
  
  if __name__ == '__main__':
  if sys.path[0] == '':


# XGBOOST

In [98]:
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree = 0.2, learning_rate = 0.1,
              max_depth = 5, alpha = 10, n_estimators = 500, random_state = 123)

In [None]:
# Initialize Recursive Feature Elimination object
stepsize = 10
rfecv = RFECV(estimator = xgb_clf, step = stepsize, cv = StratifiedKFold(n_splits = 2), scoring='roc_auc')
rfecv.fit(X_train_X, y_train_X)

In [None]:
bestFeatures_X=  [feature for feature, booln in zip(list(X_train_X.columns.values), rfecv.support_) if booln]
print("Selecting the "+ str(len(bestFeatures_X)) + " best features, which include")
print(bestFeatures_X, end=", ")

In [None]:
with open('bestFeatures_X.pkl', 'wb') as f:
    pickle.dump(bestFeatures_X, f)

# Logistic Regression

In [91]:
l_clf = LogisticRegression(C = 0.1, random_state = 123)

In [92]:
# Initialize Recursive Feature Elimination object
stepsize = 10
rfecv_L = RFECV(estimator = l_clf, step = stepsize, cv = StratifiedKFold(n_splits = 2), scoring = 'roc_auc')
rfecv_L.fit(X_train_X, y_train_X)











RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
   estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
   min_features_to_select=1, n_jobs=None, scoring='roc_auc', step=10,
   verbose=0)

In [96]:
#get a list of all the features that our model with benefit form
bestFeatures_L=  [feature for feature, booln in zip(list(X_train_X.columns.values), rfecv_L.support_) if booln]
print("Selecting the "+ str(len(bestFeatures_L)) + " best features, which include")
print(bestFeatures_L, end=", ")

Selecting the 216 best features, which include
['os_freq', 'browser_freq', 'slotvisibility_freq', 'slotformat_freq', 'creative_freq', 'keypage_freq', 'advertiser_freq', 'usertag_split_freq', 'usertag_10006', 'usertag_10024', 'usertag_10031', 'usertag_10052', 'usertag_10076', 'usertag_10093', 'usertag_10110', 'usertag_10111', 'usertag_10114', 'usertag_10115', 'usertag_10116', 'usertag_10117', 'usertag_10120', 'usertag_10125', 'usertag_10129', 'usertag_10131', 'usertag_10133', 'usertag_10140', 'usertag_10146', 'usertag_10147', 'usertag_10148', 'usertag_10149', 'usertag_10684', 'usertag_11092', 'usertag_11278', 'usertag_11379', 'usertag_11423', 'usertag_11632', 'usertag_11724', 'usertag_11944', 'usertag_13042', 'usertag_13403', 'usertag_13496', 'usertag_13678', 'usertag_13776', 'usertag_13800', 'usertag_13874', 'usertag_14273', 'usertag_16593', 'usertag_16617', 'usertag_16661', 'usertag_16753', ('click', '0'), ('click', '3'), ('click', '4'), ('click', '5'), ('click', '6'), ('weekday', '11

In [97]:
with open('bestFeatures_L.pkl', 'wb') as f:
    pickle.dump(bestFeatures_L, f)