# DESCRIPTION: 
This script develops several machine learning models including Decision Tree, Random Forest, Gradient Descending Classifier, XGBoost.
- Objective: To classify two types of users of a specific online product (Paid Online users vs Organic users) based on different features of users behaviors such as adblock is on/off, total online time per day, country.

- Purpose: Classifying users into correct categories helps the company to measure the effectiveness of Paid Marketing campaign in terms of getting more Paid Online user and Organic users or not as the result of paid campaigns.

- Data: The datasets describe the behaviors of users in the form of different features. Each row equals to one user. The original data frame has a shape of approximately 235.000 rows with 25 distinct features.

- Result: Among the four models, Gradient Descending Classifier and XGBoost appear to be the best model with the accuracy score on the training datasets of 82% and on the test datasets of 73%.

In [1]:
# Importing essential libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from collections import Counter
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
# Reading the data from the datawarehouse - BigQuery
sql = f"""
WITH
  userbase AS (
  SELECT DISTINCT
    small_device_id,
    firstseen as first_seen,
    distribution_channel,
    MIN(major_version) AS version_to_use

  FROM
    `osp-bu-mobile.bi_playground.ofa_client_uplift`
  WHERE
    firstseen = DATE(_PARTITIONTIME) 
    AND country LIKE 'US'            
    AND firstseen < '2022-08-01'
    AND active
  GROUP BY 1,2,3
    ),

day_of_use as (
SELECT
  small_device_id,
  count(distinct DATE(_PARTITIONTIME) ) as DoU
FROM
    `osp-bu-mobile.bi_playground.ofa_client_uplift`
WHERE
  country LIKE 'US'            
  AND firstseen < '2022-08-01'
  AND  DATE(_PARTITIONTIME) BETWEEN firstseen AND DATE_ADD(firstseen, INTERVAL 150 DAY)
  AND active
GROUP BY 1

)

SELECT
  small_device_id,
  CASE 
    WHEN version_to_use BETWEEN 0 AND 10 THEN '10'
    WHEN version_to_use BETWEEN 11 AND 20 THEN '20'
    WHEN version_to_use BETWEEN 21 AND 30 THEN '30'
    WHEN version_to_use BETWEEN 31 AND 40 THEN '40'
    WHEN version_to_use BETWEEN 41 AND 50 THEN '50'
    WHEN version_to_use BETWEEN 51 AND 60 THEN '60'
    WHEN version_to_use BETWEEN 61 AND 70 THEN '70'
    WHEN version_to_use BETWEEN 71 AND 80 THEN '80'
    
  END AS version_to_use,
  DoU as DOU,

  SUM(page_loads_total)/DoU AS page_loads,
  SUM(google_searches)/DoU as google_searches,

  SUM(foreground_duration)/3600000/DoU as foreground_duration_hour,

  SUM(ad_opportunities)/DoU as ad_opportunities,
  SUM(ad_missed_opportunities)/DoU as ad_missed_opportunities,
  SUM(ad_unique_clicks)/DoU as ad_unique_clicks,
  SUM(ad_unique_impressions)/DoU as ad_unique_impressions,

  SUM(sd_user_clicks)/DoU as sd_user_clicks,
  SUM(sd_partner_clicks)/DoU as sd_partner_clicks, 

  SUM(ad_successful_requests)/DoU as ad_successful_requests,
  SUM(ad_no_fill_requests)/DoU as ad_no_fill_requests,
  SUM(ad_failed_requests)/DoU as ad_failed_requests,
  SUM(news_sessions)/DoU as news_sessions, 
  SUM(news_article_interactions)/DoU as news_article_interactions,
  SUM(start_page_views)/DoU as start_page_views,

  MAX (CASE WHEN extended_stats is true then 1 ELSE 0 END) AS extended_stats,
  MAX (CASE WHEN news_notifications_enabled is true then 1 ELSE 0 END) AS news_notifications_enabled,
  MAX (CASE WHEN adblock_enabled is true then 1 ELSE 0 END) AS adblock_enabled,
  MAX (CASE WHEN vpn_enabled is true then 1 ELSE 0 END) AS vpn_enabled, 
  MAX (CASE WHEN is_default_browser is true then 1 ELSE 0 END) AS is_default_browser,
  MAX (CASE WHEN premium is true then 1 ELSE 0 END) AS premium, 

    CASE 
    WHEN userbase.distribution_channel = 'Organic' then 1
    ELSE 0
    END AS user_type

 FROM
  `osp-bu-mobile.bi_playground.ofa_client_uplift`
LEFT JOIN
  userbase
USING
  (small_device_id)
LEFT JOIN day_of_use
USING (small_device_id)
WHERE
  DATE(_PARTITIONTIME) BETWEEN first_seen AND DATE_ADD(first_seen, INTERVAL 150 DAY)
  AND active IS TRUE
  AND public_release is TRUE
GROUP BY 1,2,3,user_type
ORDER BY 2,3

"""
df = pd.read_gbq(sql, project_id='osp-bu-mobile')
df.shape

(233169, 25)

# DATA EXPLORATION AND PREPARATION

In [6]:
# Removing extreme outliers manually (This is performed by drawing scatter plots in another script)

df.drop(df[df['page_loads'] >= 2000].index, inplace = True)
df.drop(df[df['google_searches'] >= 600].index, inplace = True)
df.drop(df[df['ad_opportunities'] >= 500].index, inplace = True)
df.drop(df[df['ad_missed_opportunities'] >= 500].index, inplace = True)
df.drop(df[df['ad_unique_impressions'] >= 250].index, inplace = True)
df.drop(df[df['sd_user_clicks'] >= 2000].index, inplace = True)
df.drop(df[df['sd_partner_clicks'] >= 100].index, inplace = True)
df.drop(df[df['ad_no_fill_requests'] >= 1000].index, inplace = True)
df.drop(df[df['ad_failed_requests'] >= 1000].index, inplace = True)
df.drop(df[df['ad_successful_requests'] >= 600].index, inplace = True)
df.drop(df[df['news_article_interactions'] >= 60].index, inplace = True)
df.drop(df[df['start_page_views'] >= 500].index, inplace = True)

# Filling in missing values with 0
df.fillna(0, inplace = True)

In [5]:
# Then removing outliers using function with 5 standard deviations

# Define a function to remove outliers with 5 stds
def remove_outliers(df, columns, n_std):
    for col in columns:
        mean = df[col].mean()
        sd = df[col].std()
        df = df[(df[col] >= mean-(n_std*sd)) & (df[col] <= mean+(n_std*sd))]
    return df

# Apply the function the the numeric columns in the df
numeric_features = df.columns[3:-7].to_list()

df1 = remove_outliers(df, numeric_features, 5)
df1.drop(columns = ['small_device_id'], inplace = True)
df1.shape

(215788, 25)

In [7]:
# Creating seperate dataframe for each version of the product.
version50 = df1[df1['version_to_use'] == '50']
version60 = df1[df1['version_to_use'] == '60']
version70 = df1[df1['version_to_use'] == '70']

version70.shape

(205390, 24)

In [9]:
# Within the scope of this script, we only build classification model for "Version 70" 
y = version70.pop('user_type')
X = version70.copy(deep=True)

# The original balance of y
print('Original dataset shape: ', Counter (y))

Original dataset shape:  Counter({1: 158754, 0: 46636})


In [10]:
# Spliting training and test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size = .20, stratify = y)

In [11]:
# Standardization dataset
from sklearn.preprocessing import StandardScaler
numeric_features = numeric_features = df1.columns[2:-7].to_list()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Define the scaler
scaler = StandardScaler()

# Fit on the numeric features of training dataset
scaler.fit(X_train_scaled[numeric_features])

StandardScaler()

In [12]:
# scale the training dataset
X_train_scaled[numeric_features] = scaler.transform(X_train_scaled[numeric_features])

# scale the test dataset
X_test_scaled[numeric_features] = scaler.transform(X_test_scaled[numeric_features])

In [13]:
#splitting has preserved the class balance
print('Training dataset shape %s' % Counter(y_train))
print('Testing dataset shape %s' % Counter(y_test))

Training dataset shape Counter({1: 127003, 0: 37309})
Testing dataset shape Counter({1: 31751, 0: 9327})


In [14]:
# Upsampling by creating synthetic observations for Paid users
upsampling = SMOTE()
X_upsample, y_upsample = upsampling.fit_resample(X_train_scaled, y_train)

# MODEL 1: DECISION TREE

In [42]:
# DECISION TREE

# Specify the model
DT = DecisionTreeClassifier()

# Specify parameters range for the model
DT_param = {'max_depth': np.arange(1,30),
            'max_features': np.arange(2,15),
            'min_samples_leaf': np.arange(1,20),
            'min_samples_split': np.arange(2,20),
            'criterion': ['gini', 'entropy']}

# Apply RandomizedSearchCV to find the optimal parameters for the model
DT_cv = RandomizedSearchCV(DT, param_distributions=DT_param, cv=10, n_iter=80, random_state=0)

# DT_cv = GridSearchCV(DT, DT_param, cv=5, return_train_score = True)

# Fitting the model to the rebalanced datasets
DT_cv.fit(X_upsample, y_upsample)

# Printing out the best parameters for the model
DT_cv.best_params_

{'min_samples_split': 17,
 'min_samples_leaf': 3,
 'max_features': 12,
 'max_depth': 28,
 'criterion': 'entropy'}

In [43]:
# DECISION TREE WITH THE BEST PARAMS

# Running the new Decision Tree model with the best parameters gained from above
DT_best = DecisionTreeClassifier(min_samples_leaf = DT_cv.best_params_['min_samples_leaf'],
                                 min_samples_split = DT_cv.best_params_['min_samples_split'],
                                 max_features = DT_cv.best_params_['max_features'],
                                 max_depth = DT_cv.best_params_['max_depth'],
                                 criterion = DT_cv.best_params_['criterion'],
                                 random_state = 0)

# Fitting the model to the datasets
DT_best.fit(X_upsample, y_upsample)

# Printing out the accuracy score of the Decision Tree model
cv_scores = cross_validate(DT_best, X_upsample, y_upsample, scoring='accuracy', cv=10)
score_mean = round(cv_scores['test_score'].mean(), 5)
score_std = round(cv_scores['test_score'].std(), 5)
print(f'Score of Decision Tree: {score_mean} ({score_std})')

Score of Decision Tree: 0.71158 (0.07023)


In [44]:
# Accuracy score when running on test dataset
y_model = DT_best.predict(X_test_scaled)
accuracy_score(y_test, y_model)

# Score of Decision Tree: 0.6358

0.6358391353035688

# MODEL 2 : RANDOM FOREST

In [1]:
# RANDOM FOREST

# Specify the Random Forest model
RF = RandomForestClassifier(random_state=0)

# Specify parameters range for the model
RF_param = {'n_estimators': [20,50,100,150,200],
            'max_depth': np.arange(1,30),
            'max_features': np.arange(2,15),
            'min_samples_leaf': np.arange(1,20),
            'min_samples_split': np.arange(2,20),
            'criterion': ['gini', 'entropy']}

# Apply RandomizedSearchCV to find the optimal parameters for the model
RF_cv = RandomizedSearchCV(RF, param_distributions=RF_param, cv=10, n_iter=40, random_state=0, verbose=3)

# Fitting the model to the rebalanced datasets
RF_cv.fit(X_upsample, y_upsample)

In [46]:
# The best parameters chosen:
RF_cv.best_params_
print('''best score = {:.2f}'''.format(RF_cv.best_score_))

best score = 0.79


In [48]:
# Printing out the best parameters for the model 
RF_cv.best_params_

{'n_estimators': 200,
 'min_samples_split': 8,
 'min_samples_leaf': 9,
 'max_features': 7,
 'max_depth': 29,
 'criterion': 'entropy'}

In [49]:
#  Running the randomforest classifier with the best paras chosen

RF_best = RandomForestClassifier(n_estimators=RF_cv.best_params_['n_estimators'], 
                                 max_features=RF_cv.best_params_['max_features'], 
                                 max_depth=RF_cv.best_params_['max_depth'], 
                                 criterion=RF_cv.best_params_['criterion'],
                                 min_samples_leaf=RF_cv.best_params_['min_samples_leaf'],
                                 min_samples_split=RF_cv.best_params_['min_samples_split'],
                                 random_state = 0)

# Fitting the model to the datasets

RF_best.fit(X_upsample, y_upsample)
cv_scores = cross_validate(RF_best, X_upsample, y_upsample, scoring='accuracy', cv=10) # note that cross validation = 10
score_mean = round(cv_scores['test_score'].mean(), 5)
score_std = round(cv_scores['test_score'].std(), 5)
print(f'Score of Random Forest: {score_mean} ({score_std})')

# Random Forest Classifier give an accuracy score of 0.7857 (0.09678)

Score of Random Forest: 0.7857 (0.09678)


In [51]:
# Predict for test dataset
y_model = RF_best.predict(X_test_scaled)
accuracy_score(y_test, y_model)

# Accuracy score for test dataset: 0.7087492088222406

0.7087492088222406

# MODEL 3: Gradient Boosting Classifier 

In [2]:
# Gradient Boosting Classifier 

# Specify Gradient Boosting model
gbc = GradientBoostingClassifier(random_state=0)

# Specify parameters range for the model
gbc_param = {'n_estimators': [20,50,100,150,200],
             'learning_rate': [0.001, 0.01, 0.1, 1],
             'subsample': np.arange(0.1, 1.1, 0.1),
             'criterion': ['friedman_mse', 'squared_error'],
             'min_samples_split': np.arange(2,20),
             'min_samples_leaf': np.arange(1,20),
             'max_depth': np.arange(1,30),
             'max_features': np.arange(2,15)
            }

# Apply RandomizedSearchCV to find the optimal parameters for the model

gbc_cv = RandomizedSearchCV(gbc, param_distributions=gbc_param, cv=10, n_iter=60, random_state=0, verbose=3)
gbc_cv.fit(X_upsample, y_upsample)
gbc_cv.best_params_

In [18]:
# The best parameters chosen:
gbc_cv.best_params_

{'subsample': 0.7000000000000001,
 'n_estimators': 100,
 'min_samples_split': 16,
 'min_samples_leaf': 19,
 'max_features': 6,
 'max_depth': 29,
 'learning_rate': 0.1,
 'criterion': 'squared_error'}

In [17]:
print('''best score = {:.2f}'''.format(gbc_cv.best_score_))

# best score = 0.82

best score = 0.82


In [19]:
#  Running the randomforest classifier with the best paras chosen

gbc_best = GradientBoostingClassifier(n_estimators=gbc_cv.best_params_['n_estimators'], 
                                      #loss=gbc_cv.best_params_['loss'], 
                                      subsample=gbc_cv.best_params_['subsample'], 
                                      criterion=gbc_cv.best_params_['criterion'],
                                      learning_rate=gbc_cv.best_params_['learning_rate'],
                                      max_features=gbc_cv.best_params_['max_features'], 
                                      max_depth=gbc_cv.best_params_['max_depth'], 
                                      min_samples_leaf=gbc_cv.best_params_['min_samples_leaf'],
                                      min_samples_split=gbc_cv.best_params_['min_samples_split'],
                                      random_state=0)

# Fitting the model to the datasets
gbc_best.fit(X_upsample, y_upsample)

cv_scores = cross_validate(gbc_best, X_upsample, y_upsample, scoring='accuracy', cv=10)
score_mean = round(cv_scores['test_score'].mean(), 5)
score_std = round(cv_scores['test_score'].std(), 5)
print(f'Score of Gradient Boosting: {score_mean} ({score_std})')

# Score of Gradient Boosting: 0.81914 (0.12273)

Score of Gradient Boosting: 0.81914 (0.12273)


In [20]:
# Predict for test dataset
y_model = gbc_best.predict(X_test_scaled)
accuracy_score(y_test, y_model)

# Accuracy score for test dataset: 0.7252787380106139

0.7252787380106139

# MODEL 4: XGBoost 

In [3]:
# !pip install xgboost

In [16]:
from xgboost import XGBClassifier

In [17]:
X_upsample.drop(columns = ['version_to_use'], inplace = True)

In [4]:
# XGBoosting Classifier *try 1 - cv = 10, n_iter = 60

# Specify the XGBClassifier model
XGB = XGBClassifier(random_state=0)

#Specify parameter range for the model
XGB_param = {'n_estimators': [20,50,100,150,200,250],
             'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
             'min_child_weight': np.arange(1,20),
             'max_depth': np.arange(1,30),
             'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
             'colsample_bytree': np.arange(0.08, 0.7)
            }

# Apply RandomizedSearchCV to find the optimal parameters for the model
XGB_cv = RandomizedSearchCV(XGB, param_distributions=XGB_param, cv=10, n_iter=60, random_state=0, verbose=3)
XGB_cv.fit(X_upsample, y_upsample)
XGB_cv.best_params_

In [34]:
print('''best score = {:.2f}'''.format(XGB_cv.best_score_))


best score = 0.79


In [36]:
#  Running the randomforest classifier with the best paras chosen
XGB_best = XGBClassifier(n_estimators=XGB_cv.best_params_['n_estimators'],                                     
                                      learning_rate = XGB_cv.best_params_['learning_rate'],
                                      min_child_weight = XGB_cv.best_params_['min_child_weight'], 
                                      max_depth = XGB_cv.best_params_['max_depth'], 
                                      gamma = XGB_cv.best_params_['gamma'],
                                      colsample_bytree = XGB_cv.best_params_['colsample_bytree'],
                                      random_state=0)

# Fitting the model to the datasets
XGB_best.fit(X_upsample, y_upsample)

cv_scores = cross_validate(XGB_best, X_upsample, y_upsample, scoring='accuracy', cv=10)
score_mean = round(cv_scores['test_score'].mean(), 5)
score_std = round(cv_scores['test_score'].std(), 5)
print(f'Score of XGBoosting: {score_mean} ({score_std})')

# Score of Gradient Boosting: 0.81914 (0.12273)

Score of XGBoosting: 0.78665 (0.16331)


In [39]:
X_test_scaled.drop(columns = ['version_to_use'], inplace = True)

# Predict for test dataset
y_model = XGB_best.predict(X_test_scaled)
accuracy_score(y_test, y_model)

# Accuracy score for test dataset: 0.7252787380106139

0.7163445153123327

In [5]:
# XGBoosting Classifier * try 2 - cv = 10, n_iter = 100

# Specify the XGBClassifier model
XGB = XGBClassifier(random_state=0)

#Specify parameter range for the model
XGB_param = {'n_estimators': [20,50,100,150,200],
             'learning_rate': [0.001, 0.01, 0.1, 1],
             'min_child_weight': np.arange(1,20),
             'max_depth': np.arange(1,30),
             'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
             'colsample_bytree': np.arange(0.1, 0.8)
            }

# Apply RandomizedSearchCV to find the optimal parameters for the model
XGB_cv = RandomizedSearchCV(XGB, param_distributions=XGB_param, cv=10, n_iter=100, random_state=0, verbose=3)
XGB_cv.fit(X_upsample, y_upsample)
XGB_cv.best_params_

In [19]:
print('''best score = {:.2f}'''.format(XGB_cv.best_score_))

best score = 0.80


In [20]:
#  Running the randomforest classifier with the best paras chosen
XGB_best = XGBClassifier(n_estimators=XGB_cv.best_params_['n_estimators'],                                     
                                      learning_rate = XGB_cv.best_params_['learning_rate'],
                                      min_child_weight = XGB_cv.best_params_['min_child_weight'], 
                                      max_depth = XGB_cv.best_params_['max_depth'], 
                                      gamma = XGB_cv.best_params_['gamma'],
                                      colsample_bytree = XGB_cv.best_params_['colsample_bytree'],
                                      random_state=0)

# Fitting the model to the datasets
XGB_best.fit(X_upsample, y_upsample)
cv_scores = cross_validate(XGB_best, X_upsample, y_upsample, scoring='accuracy', cv=10)
score_mean = round(cv_scores['test_score'].mean(), 5)
score_std = round(cv_scores['test_score'].std(), 5)
print(f'Score of XGBoosting: {score_mean} ({score_std})')

# Score of Gradient Boosting: 0.81914 (0.12273)

Score of XGBoosting: 0.7974 (0.16521)


In [22]:
# Predict for test dataset
y_model = XGB_best.predict(X_test_scaled)
accuracy_score(y_test, y_model)

# Accuracy score for test dataset: 0.7252787380106139

0.7247675154583962