# 1. Purpose

 The purpose of this project is to promote brand new laptop to those who are interested in it. However, it is important not to send an email to those who are uninterested in buying the laptop so as not to annoy them. I am looking to use the data about who clicked on similar emails in previous year to help you predict which users may be interested in the promotion.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
import time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 2. Datasets

1. train_users / test_users: information about each user
2. train_logs / test_logs: details about webpages visited by each user
3. train_y / test_y: y = 1 means they clicked the email, y = 0 otherwise

In [2]:
train_users = pd.read_csv("data/train_users.csv")
train_logs = pd.read_csv("data/train_logs.csv")
train_y = pd.read_csv("data/train_y.csv")

test_users = pd.read_csv("data/test1_users.csv")
test_logs = pd.read_csv("data/test1_logs.csv")
test_y = pd.read_csv("data/test1_y.csv")

In [3]:
train_y.head()

Unnamed: 0,user_id,y
0,0,False
1,1,False
2,2,True
3,3,False
4,4,False


In [4]:
test_users.head()

Unnamed: 0,user_id,names,age,past_purchase_amt,badge
0,30000,Craig Thompson,56,2.412409,bronze
1,30001,Blake Davidson,31,71.48807,silver
2,30002,Micheal Tate,74,22.641604,gold
3,30003,Christian Morris,31,1141.450277,silver
4,30004,Carrie Carey,76,146.152877,bronze


In [5]:
test_logs.head()

Unnamed: 0,user_id,date,url,seconds
0,30000,1/11/2021,/keyboard.html,13
1,30000,2/22/2021,/cleats.html,95
2,30000,5/6/2021,/tablet.html,113
3,30000,10/19/2021,/blender.html,87
4,30000,5/17/2021,/tablet.html,108


In [6]:
test_y.head()

Unnamed: 0,user_id,y
0,30000,True
1,30001,True
2,30002,True
3,30003,True
4,30004,True


# 3. Merging Data 

 We have two datasets to refer to: users and logs.  As you can see, each of the users is identified by user_id. 

- user_id: identifying number of each user
- names: name of each user
- age: age of each user
- past_purchase_amt: the money spent on our firms' items
- badge: the badge status

In [7]:
train_users.head()

Unnamed: 0,user_id,names,age,past_purchase_amt,badge
0,0,Adriana Mcclure,26,39.344704,gold
1,1,Stacy Gilmore,67,15.840151,silver
2,2,Joanna Walsh,50,1099.420085,bronze
3,3,Eduardo Moore,65,5.880239,bronze
4,4,Angela Freeman,88,1312.296847,bronze


- date: the date of a user visited the url
- url: the url of a user visited (keyboard, blender,laptop, tablet, cleats)
- seconds: total seconds spent on the url

In [8]:
train_logs.head()

Unnamed: 0,user_id,date,url,seconds
0,0,10/27/2021,/keyboard.html,159
1,0,3/15/2021,/blender.html,15
2,0,7/29/2021,/keyboard.html,11
3,0,1/27/2021,/laptop.html,142
4,1,3/1/2021,/keyboard.html,78


I am going to merge logs data to users data based on user_id. Also, the total seconds spent on url of each items will be another column.  

In [9]:
time0 = time.time()
all_id = train_users["user_id"]

keyboard = [0] * len(all_id)
blender = [0] * len(all_id)
laptop = [0] * len(all_id)
tablet = [0] * len(all_id)
cleats = [0] * len(all_id)

for i in range(len(all_id)):
    member_id = all_id[i]
    if member_id in train_logs["user_id"].values:
        df = train_logs[train_logs["user_id"] == member_id]
        for j in range(len(df)):
            subset_df = df.iloc[j, :]
            url = subset_df["url"]    
            seconds = subset_df["seconds"]

            if url == "/keyboard.html":
                keyboard[i] += seconds 
            elif url == "/blender.html":
                blender[i] += seconds
            elif url == "/laptop.html":
                laptop[i] += seconds
            elif url == "/tablet.html":
                tablet[i] += seconds
            elif url == "/cleats.html":
                cleats[i] += seconds
                
train_users["keyboard"] = keyboard
train_users["blender"] = blender
train_users["laptop"] = laptop
train_users["tablet"] = tablet
train_users["cleats"] = cleats
train_users = train_users.drop(["user_id","names"], axis=1)
print("Elapsed Time:", time.time() - time0)
train_users

Elapsed Time: 119.5639591217041


Unnamed: 0,age,past_purchase_amt,badge,keyboard,blender,laptop,tablet,cleats
0,26,39.344704,gold,170,15,142,0,0
1,67,15.840151,silver,78,0,0,0,0
2,50,1099.420085,bronze,86,208,0,0,138
3,65,5.880239,bronze,0,0,0,0,0
4,88,1312.296847,bronze,0,0,0,0,0
...,...,...,...,...,...,...,...,...
29995,87,108.914205,bronze,171,0,392,0,0
29996,49,291.969273,gold,0,116,0,247,169
29997,46,2.609698,bronze,0,0,0,0,0
29998,23,164.885894,bronze,0,0,0,0,0


Same could be also argued for test data.

In [10]:
time0 = time.time()
all_id = test_users["user_id"]

keyboard = [0] * len(all_id)
blender = [0] * len(all_id)
laptop = [0] * len(all_id)
tablet = [0] * len(all_id)
cleats = [0] * len(all_id)

for i in range(len(all_id)):
    member_id = all_id[i]
    if member_id in test_logs["user_id"].values:
        df = test_logs[test_logs["user_id"] == member_id]
        for j in range(len(df)):
            subset_df = df.iloc[j, :]
            url = subset_df["url"]    
            seconds = subset_df["seconds"]

            if url == "/keyboard.html":
                keyboard[i] += seconds 
            elif url == "/blender.html":
                blender[i] += seconds
            elif url == "/laptop.html":
                laptop[i] += seconds
            elif url == "/tablet.html":
                tablet[i] += seconds
            elif url == "/cleats.html":
                cleats[i] += seconds
                
test_users["keyboard"] = keyboard
test_users["blender"] = blender
test_users["laptop"] = laptop
test_users["tablet"] = tablet
test_users["cleats"] = cleats
test_users = test_users.drop(["user_id","names"], axis=1)
print("Elapsed Time:", time.time() - time0)
test_users

Elapsed Time: 117.73961138725281


Unnamed: 0,age,past_purchase_amt,badge,keyboard,blender,laptop,tablet,cleats
0,56,2.412409,bronze,126,87,10,221,95
1,31,71.488070,silver,0,142,231,66,101
2,74,22.641604,gold,0,0,49,390,0
3,31,1141.450277,silver,231,8,269,178,103
4,76,146.152877,bronze,0,373,0,214,72
...,...,...,...,...,...,...,...,...
29995,65,9.128541,bronze,255,385,0,0,145
29996,53,41.161813,gold,54,0,0,0,0
29997,88,7.126658,silver,0,0,0,29,139
29998,46,30.402432,silver,61,0,31,195,40


As badge column consists of caregorical labels, I implement onehotencoding to convert into float.

In [11]:
time0 = time.time()
oh = OneHotEncoder()
data = oh.fit_transform(train_users[["badge"]])

badge_bronze = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_bronze"]
badge_silver = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_silver"]
badge_gold = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_gold"]

train_users["badge_bronze"] = badge_bronze
train_users["badge_silver"] = badge_silver
train_users["badge_gold"] = badge_gold
train_users = train_users.drop(["badge"], axis=1)
print("Elapsed Time:", time.time() - time0)
train_users

Elapsed Time: 0.0539247989654541


Unnamed: 0,age,past_purchase_amt,keyboard,blender,laptop,tablet,cleats,badge_bronze,badge_silver,badge_gold
0,26,39.344704,170,15,142,0,0,0.0,0.0,1.0
1,67,15.840151,78,0,0,0,0,0.0,1.0,0.0
2,50,1099.420085,86,208,0,0,138,1.0,0.0,0.0
3,65,5.880239,0,0,0,0,0,1.0,0.0,0.0
4,88,1312.296847,0,0,0,0,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
29995,87,108.914205,171,0,392,0,0,1.0,0.0,0.0
29996,49,291.969273,0,116,0,247,169,0.0,0.0,1.0
29997,46,2.609698,0,0,0,0,0,1.0,0.0,0.0
29998,23,164.885894,0,0,0,0,0,1.0,0.0,0.0


In [12]:
time0 = time.time()
oh = OneHotEncoder()
data = oh.fit_transform(test_users[["badge"]])

badge_bronze = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_bronze"]
badge_silver = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_silver"]
badge_gold = pd.DataFrame(data.toarray(), columns=oh.get_feature_names_out())["badge_gold"]

test_users["badge_bronze"] = badge_bronze
test_users["badge_silver"] = badge_silver
test_users["badge_gold"] = badge_gold
test_users = test_users.drop(["badge"], axis=1)
print("Elapsed Time:", time.time() - time0)
test_users

Elapsed Time: 0.04926776885986328


Unnamed: 0,age,past_purchase_amt,keyboard,blender,laptop,tablet,cleats,badge_bronze,badge_silver,badge_gold
0,56,2.412409,126,87,10,221,95,1.0,0.0,0.0
1,31,71.488070,0,142,231,66,101,0.0,1.0,0.0
2,74,22.641604,0,0,49,390,0,0.0,0.0,1.0
3,31,1141.450277,231,8,269,178,103,0.0,1.0,0.0
4,76,146.152877,0,373,0,214,72,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
29995,65,9.128541,255,385,0,0,145,1.0,0.0,0.0
29996,53,41.161813,54,0,0,0,0,0.0,0.0,1.0
29997,88,7.126658,0,0,0,29,139,0.0,1.0,0.0
29998,46,30.402432,61,0,31,195,40,0.0,1.0,0.0


# 4. Results

## Logistic Regression

In [13]:
model = Pipeline([
    ("std", StandardScaler()),
    ("lr", LogisticRegression())
])
train_y = pd.read_csv("data/train_y.csv")
test_y = pd.read_csv("data/test1_y.csv")

train_y = train_y.drop(["user_id"], axis = 1)
test_y = test_y.drop(["user_id"], axis = 1)
model.fit(train_users, train_y)
model.score(test_users, test_y)

  y = column_or_1d(y, warn=True)


0.8007333333333333

## Random Forest

In [14]:
model = Pipeline([
    ("std", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=100))
])
train_y = pd.read_csv("data/train_y.csv")
test_y = pd.read_csv("data/test1_y.csv")

train_y = train_y.drop(["user_id"], axis = 1)
test_y = test_y.drop(["user_id"], axis = 1)
model.fit(train_users, train_y)
model.score(test_users, test_y)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.9282666666666667

## Random Forest without blender & cleats column

In [15]:
model = Pipeline([
    ("std", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=100))
])
train_y = pd.read_csv("data/train_y.csv")
test_y = pd.read_csv("data/test1_y.csv")

train_y = train_y.drop(["user_id"], axis = 1)
test_y = test_y.drop(["user_id"], axis = 1)
train_users_v1 = train_users.drop(["blender", "cleats"], axis = 1)
test_users_v1 = test_users.drop(["blender", "cleats"], axis = 1)
model.fit(train_users_v1, train_y)
model.score(test_users_v1, test_y)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.8857333333333334

## Random Forest without blender, cleats and age

In [16]:
model = Pipeline([
    ("std", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=100))
])
train_y = pd.read_csv("data/train_y.csv")
test_y = pd.read_csv("data/test1_y.csv")

train_y = train_y.drop(["user_id"], axis = 1)
test_y = test_y.drop(["user_id"], axis = 1)
train_users_v2 = train_users_v1.drop(["age"], axis = 1)
test_users_v2 = test_users_v1.drop(["age"], axis = 1)

model.fit(train_users_v2, train_y)
model.score(test_users_v2, test_y)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.7442666666666666