In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
import random
from sklearn.utils import resample
from sklearn.externals import joblib
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [2]:
#Quick Processing

def CleanDF(df):
    """Fill missing values, add a combined slot-size column and drop unused columns.

    The frame is modified in place and the very same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain ``adexchange``, ``usertag``, ``slotwidth``,
        ``slotheight`` and every column listed in ``unused_columns`` below.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after cleaning.
    """
    # Missing ad exchanges become 0, missing user tags become an empty string.
    for column, filler in (("adexchange", 0), ("usertag", "")):
        df[column] = df[column].fillna(filler)

    # Merge the two slot dimensions into a single "<width>x<height>" label.
    width_text = df["slotwidth"].astype(str)
    height_text = df["slotheight"].astype(str)
    df["slotsize"] = width_text + "x" + height_text

    # Identifier-like columns are dropped because they are not expected to be
    # useful in the model; the raw slot dimensions are replaced by slotsize.
    unused_columns = [
        "bidid", "userid", "IP", "domain", "url", "urlid",
        "slotid", "creative", "slotwidth", "slotheight",
    ]
    df.drop(columns=unused_columns, inplace=True)

    return df

In [3]:
# Load the train, validation and test splits and clean each one as it is read.
traindf = CleanDF(pd.read_csv("train.csv"))
validationdf = CleanDF(pd.read_csv("validation.csv"))
testdf = CleanDF(pd.read_csv("test.csv"))

# Not the last expression of the cell, so this preview is computed but not shown.
traindf.head(5)

# Class balance of the training labels.
print(traindf["click"].value_counts())

0    2429188
1       1793
Name: click, dtype: int64


In [4]:
def UsertagCategories(df):
    """Return the sorted list of distinct user tags found in ``df["usertag"]``.

    Each non-null entry of the ``usertag`` column is treated as a
    comma-separated list of tags. Tags are collected straight into a set, so
    only the distinct tags are ever held in memory (rather than one list per
    row plus a flattened copy of every tag occurrence).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``usertag`` column of comma-separated strings; null
        entries are skipped.

    Returns
    -------
    list of str
        Distinct, non-empty tags in ascending (lexicographic) order. Empty
        when the column holds no tags.
    """
    unique_tags = set()

    # Iterate over the values directly instead of positional Series lookups.
    for tag_string in df["usertag"].dropna():
        unique_tags.update(tag_string.split(","))

    # "".split(",") yields [""], so rows without tags contribute an empty token.
    unique_tags.discard("")

    return sorted(unique_tags)

In [5]:
######### NOTE: this draft of FeatureEngineering was running into memory errors; it is left commented out.

# def FeatureEngineering(df): 
        
#         # Usertags
#         usertags = UsertagCategories(df)
#         for tag in usertags:
#             col_name = "usertag_" + tag
#             df[col_name] = df["usertag"].map(lambda x: 1 if tag in x.split(",") else 0)
            
            
#         # num of usertags
# #         df['usertagsCount'] = df.usertag.str.count(",")+1
            
#         # Slotprice binning
#         df["slotprice_cat"] = 0
        
#         df.loc[ df["slotprice"] <= 10, "slotprice_cat"] = 0
#         df.loc[ (df["slotprice"] > 10) & (df["slotprice"] <= 50), "slotprice_cat"] = 1
#         df.loc[ (df["slotprice"] > 50) & (df["slotprice"] <= 100), "slotprice_cat"] = 2
#         df.loc[ df["slotprice"] > 100, "slotprice_cat"] = 3

        
#         # IP Category
# #         df['IP_cat'] = ""     
# #         df["IP_str"] = df["IP"].astype(str)

# #         df["IP_spt"] = df["IP_str"].map(lambda x: x.split(".")[0]).astype(int)

# #         df.loc[ df["IP_spt"] <= 127, "IP_cat"] = "Class_A"
# #         df.loc[ (df["IP_spt"] > 128) & (df["IP_spt"] <= 191), "IP_cat"] = "Class_B"
# #         df.loc[ (df["IP_spt"] > 192) & (df["IP_spt"] <= 223), "IP_cat"] = "Class_C"
# #         df.loc[ df["IP_spt"] >= 224, "IP_cat"] = "Class_D"         
        
#              # Part of the day
# #         df["part_of_day"] = ""

# #         df.loc[(df["weekday"] == 0) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Sun_Night"
# #         df.loc[(df["weekday"] == 0) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Sun_Morn"

# #         df.loc[(df["weekday"] == 1) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Mon_Night"
# #         df.loc[(df["weekday"] == 1) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Mon_Morn"

# #         df.loc[(df["weekday"] == 2) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Tues_Night"
# #         df.loc[(df["weekday"] == 2) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Tues_Morn"

# #         df.loc[(df["weekday"] == 3) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Wed_Night"
# #         df.loc[(df["weekday"] == 3) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Wed_Morn"

# #         df.loc[(df["weekday"] == 4) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Thur_Night"
# #         df.loc[(df["weekday"] == 4) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Thur_Morn"

# #         df.loc[(df["weekday"] == 5) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Fri_Night"
# #         df.loc[(df["weekday"] == 5) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Fri_Morn"

# #         df.loc[(df["weekday"] == 6) & ((df["hour"] < 8) & (df["hour"] > 17)), "part_of_day"] = "Sat_Night"
# #         df.loc[(df["weekday"] == 6) & ((df["hour"] >= 8) & (df["hour"] <= 17)), "part_of_day"] = "Sat_Morn"
        
#         # Convert numerical to categorical
#         df["weekday"] = df["weekday"].map(lambda x: str(x))
#         df["hour"] = df["hour"].map(lambda x: str(x))
#         df["region"] = df["region"].map(lambda x: str(x))
#         df["city"] = df["city"].map(lambda x: str(x))
#         df["adexchange"] = df["adexchange"].map(lambda x: str(x))
#         df["advertiser"] = df["advertiser"].map(lambda x: str(x))
        
#         # Operating system
#         df["os"] = df["useragent"].map(lambda x: x.split("_")[0])
        
#         # Browser
#         df["browser"] = df["useragent"].map(lambda x: x.split("_")[1])
        
#         #Clean slotvisibility
#         #0
#         # FirstView
#         # 2
#         # 1
#         # OtherView
#         # SecondView
#         # Na
#         # 255
#         # ThirdView
#         # FifthView
#         # FourthView

        
#         columns = ["useragent", "slotprice", "usertag"]
#         df.drop(columns, axis = 1, inplace = True)
        
#         df = pd.get_dummies(df)
  
#         return df

In [6]:
def FeatureEngineering(df, usertags=None):
    """Turn a cleaned frame into a numeric, one-hot encoded feature frame.

    Steps, in order:
      1. add one 0/1 ``usertag_<tag>`` column per tag in ``usertags``;
      2. bin ``slotprice`` into ``slotprice_cat`` (0: <=10, 1: <=50, 2: <=100, 3: >100);
      3. add string copies (``*_cat``) of weekday, hour, region, city,
         adexchange and advertiser so they are encoded as categories;
      4. split ``useragent`` ("<os>_<browser>") into ``os`` and ``browser``;
      5. drop the raw columns that have been replaced and one-hot encode the
         remaining non-numeric columns with ``pd.get_dummies``.

    Side effect: steps 1-5 add and drop columns on ``df`` itself; only the
    final ``get_dummies`` call produces a new frame. Calling this twice on the
    same frame therefore fails, because the source columns are already gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least usertag, slotprice, weekday, hour, region,
        city, adexchange, advertiser, useragent and keypage. ``usertag`` must
        not contain nulls.
    usertags : iterable of str, optional
        Tag vocabulary used for the ``usertag_*`` indicator columns. When
        omitted, the vocabulary is derived from ``df`` itself, which means
        frames processed separately can end up with different ``usertag_*``
        columns. Pass the vocabulary of the training frame to keep the
        indicator columns identical across frames.

    Returns
    -------
    pandas.DataFrame
        The encoded feature frame. ``get_dummies`` only creates columns for
        category values present in ``df``, so the dummy columns of two frames
        can still differ and may need aligning before modelling.
    """
    # Usertags: one indicator column per known tag
    if usertags is None:
        usertags = UsertagCategories(df)
    for tag in usertags:
        df["usertag_" + tag] = df["usertag"].map(lambda x: 1 if tag in x.split(",") else 0)

    # Slotprice binning: start everything in bin 0, then promote rows that
    # exceed each successive lower bound (later bounds overwrite earlier ones).
    df["slotprice_cat"] = 0
    for category, lower_bound in enumerate((10, 50, 100), start=1):
        df.loc[df["slotprice"] > lower_bound, "slotprice_cat"] = category

    # Convert numerical codes to strings so get_dummies treats them as categories
    for source_column in ("weekday", "hour", "region", "city", "adexchange", "advertiser"):
        df[source_column + "_cat"] = df[source_column].map(str)

    # Operating system is the part of useragent before the first underscore
    df["os"] = df["useragent"].map(lambda x: x.split("_")[0])

    # Browser is the part right after the first underscore
    df["browser"] = df["useragent"].map(lambda x: x.split("_")[1])

    # weekday and hour are intentionally kept as numeric columns as well
    columns = ["useragent", "slotprice", "keypage", "usertag", "region", "city", "adexchange", "advertiser"]
    df.drop(columns, axis = 1, inplace = True)

    df = pd.get_dummies(df)

    return df

In [7]:
def _announce_and_engineer(split_name, frame):
    """Print a progress line for the split, then return its engineered features."""
    print("Creating " + split_name + " feature")
    return FeatureEngineering(frame)


traindf_features = _announce_and_engineer("train", traindf)
validationdf_features = _announce_and_engineer("validation", validationdf)
testdf_features = _announce_and_engineer("test", testdf)

# Report the size of the training feature frame and preview its first rows.
print(traindf_features.shape)
traindf_features.head(5)

Creating train feature
Creating validation feature
Creating test feature
(2430981, 583)


Unnamed: 0,click,weekday,hour,bidprice,payprice,usertag_10006,usertag_10024,usertag_10031,usertag_10048,usertag_10052,...,os_windows,browser_chrome,browser_firefox,browser_ie,browser_maxthon,browser_opera,browser_other,browser_safari,browser_sogou,browser_theworld
0,0,5,22,238,5,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,0,1,20,294,23,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,0,3,13,238,24,1,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
3,0,6,23,300,25,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,5,6,277,133,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [8]:
'''Because there is such a huge disparity between non-clicks and clicks, we perform
negative downsampling of the imbalanced data. Different sample sizes can be used in this procedure:
in every sample all the observations from the minority class are kept, and
a different number of observations is taken from the majority class by sampling without replacement.
'''


# Split the engineered training frame by label: non-clicks vs clicks
train_data_majority = traindf_features.loc[traindf_features["click"] == 0]
train_data_minority = traindf_features.loc[traindf_features["click"] == 1]

# Shrink the majority class to a fixed number of rows
df_majority_downsampled = resample(
    train_data_majority,
    replace=False,      # draw without replacement
    n_samples=20000,    # fixed majority sample size (not equal to the minority count)
    random_state=123,   # fixed seed so the draw is reproducible
)

# Stack the downsampled majority rows on top of every minority row
train_downsized = pd.concat([df_majority_downsampled, train_data_minority])

print("Clicked data size ::::" + str(train_data_minority.shape))
print("Not clicked data size ::::" + str(df_majority_downsampled.shape))

# Show the class counts of the combined frame
print("Combined dataset count ::::" + str(train_downsized.click.value_counts()))
    

Clicked data size ::::(1793, 583)
Not clicked data size ::::(20000, 583)
Combined dataset count ::::0    20000
1     1793
Name: click, dtype: int64


In [9]:

# Training inputs: everything except the label and the bidprice/payprice columns.
X_train = train_downsized.drop(columns=["click", "payprice", "bidprice"], errors="ignore")

# Variant of the training inputs that keeps payprice as a feature.
X_train_pay = train_downsized.drop(columns=["click", "bidprice"], errors="ignore")

y_train = train_downsized["click"]

# FeatureEngineering is run on each frame separately, and both its usertag
# vocabulary and pd.get_dummies only create columns for values present in that
# frame. Validation/test can therefore have extra, missing or differently
# ordered columns. Reindexing to the training layout drops columns the model
# never saw and adds the missing indicator columns filled with 0.
X_validation = validationdf_features.reindex(columns=X_train.columns, fill_value=0)
y_validation = validationdf_features["click"]

X_test = testdf_features.reindex(columns=X_train.columns, fill_value=0)


In [10]:
# Save the downsampled training data and the validation/test matrices to disk.
pickle_targets = (
    # Pickled Train
    ("X_train_features.pkl", X_train),
    ("y_train_features.pkl", y_train),
    ("X_train_pay.pkl", X_train_pay),
    # Pickled Validation
    ("X_validation_features.pkl", X_validation),
    ("y_validation_features.pkl", y_validation),
    # Pickled Test
    ("X_test_features.pkl", X_test),
)

for pickle_path, payload in pickle_targets:
    # The context manager flushes and closes each file, even if dumping fails;
    # a bare open(...) passed to pickle.dump leaves the handle open.
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)


In [11]:
# Base estimator handed to the recursive feature elimination below.
clf = LogisticRegression(
    C=0.1,
    random_state=123,
)

In [12]:
# Recursive feature elimination with cross-validation:
# `stepsize` features are removed per iteration and each candidate subset is
# scored with weighted F1 on 2 stratified folds.
stepsize = 10
rfecv = RFECV(
    estimator=clf,
    step=stepsize,
    cv=StratifiedKFold(n_splits=2),
    scoring="f1_weighted",
)

rfecv.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)


RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
   estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
   min_features_to_select=1, n_jobs=None, scoring='f1_weighted', step=10,
   verbose=0)

In [13]:
# Get the list of features that RFECV selected; support_ is its boolean
# selection mask, paired here with X_train's column names.
bestFeatures = list(itertools.compress(X_train.columns.values, rfecv.support_))

print("Selecting the "+ str(len(bestFeatures)) + " best features, which include")
print(bestFeatures, end=", ")


Selecting the 240 best features, which include
['usertag_10006', 'usertag_10031', 'usertag_10059', 'usertag_10063', 'usertag_10076', 'usertag_10079', 'usertag_10093', 'usertag_10111', 'usertag_10114', 'usertag_10115', 'usertag_10116', 'usertag_10117', 'usertag_10120', 'usertag_10125', 'usertag_10129', 'usertag_10131', 'usertag_10133', 'usertag_10140', 'usertag_10146', 'usertag_10147', 'usertag_10148', 'usertag_10149', 'usertag_10684', 'usertag_11092', 'usertag_11278', 'usertag_11423', 'usertag_11576', 'usertag_11632', 'usertag_11724', 'usertag_11944', 'usertag_13042', 'usertag_13403', 'usertag_13496', 'usertag_13678', 'usertag_13776', 'usertag_13800', 'usertag_13866', 'usertag_13874', 'usertag_14273', 'usertag_16593', 'usertag_16617', 'usertag_16661', 'usertag_16706', 'usertag_16753', 'slotprice_cat', 'slotvisibility_0', 'slotvisibility_1', 'slotvisibility_2', 'slotvisibility_255', 'slotvisibility_FirstView', 'slotvisibility_FourthView', 'slotvisibility_OtherView', 'slotvisibility_Seco

In [14]:
# Save the selected feature names to disk
# (only the list of names is stored, not the fitted RFECV object).
with open('bestFeatures.pkl', 'wb') as features_file:
    pickle.dump(bestFeatures, features_file)