<a href="https://colab.research.google.com/github/harshvardhanhub/AllianceWebsite_Project/blob/main/FinalYearProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import os
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# **DEFINE**

In [None]:
# Define additional classifiers
random_forest_classifier = RandomForestClassifier()
svm_classifier = SVC()
logistic_regression_classifier = LogisticRegression()


In [None]:
# Define a dictionary of classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'Random Forest': random_forest_classifier,
    'SVM': svm_classifier,
    'Logistic Regression': logistic_regression_classifier
}


In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Compact unsigned dtypes keep the click log small in memory.
# NOTE(review): uint16 tops out at 65,535 -- confirm that every id column
# (ip in particular) stays below that in the CSV, otherwise the values cannot
# be stored faithfully. 'click_id' is mapped here but is not among the
# columns loaded below.
dtypes = {
    **dict.fromkeys(['ip', 'app', 'device', 'os', 'channel'], 'uint16'),
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

# only these columns are read from the file
colnames = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

train_sample = pd.read_csv(
    '/content/train_sample.csv',
    usecols=colnames,
    dtype=dtypes,
)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
len(train_sample)

In [None]:
train_sample.memory_usage()

In [None]:
# space used by training data
print('Training dataset uses {0} MB'.format(train_sample.memory_usage().sum()/1024**2))

In [None]:
train_sample.head()

# **Exploring the Data - Univariate Analysis**

Let's now understand and explore the data. Let's start with understanding the size and data types of the train_sample data.

In [None]:
# look at non-null values, number of entries etc.
# there are no missing values
train_sample.info()

In [None]:
# Basic exploratory analysis

# Number of unique values in each column

def fraction_unique(x):
    """Return the number of distinct values in column ``x`` of ``train_sample``.

    Despite the name, the result is a count, not a fraction. Missing values
    are included in the count rather than dropped.
    """
    return train_sample[x].nunique(dropna=False)

number_unique_vals = {x : fraction_unique(x) for x in train_sample.columns}
number_unique_vals

In [None]:
# All columns apart from click time are originally int type,
# though note that they are all actually categorical
train_sample.dtypes

There are certain 'apps' which have quite high number of instances/rows (each row is a click). The plot below shows this.

In [None]:
# distribution of 'app'
# some 'apps' have a disproportionately high number of clicks (>15k), and some are very rare (3-4)
plt.figure(figsize=(60,10))
sns.countplot(x="app",data=train_sample)


In [None]:
# distribution of 'device'
# this is expected because a few popular devices are used heavily
plt.figure(figsize=(54, 8))
sns.countplot(x="device", data=train_sample)

In [None]:
# channel: various channels get clicks in comparable quantities
plt.figure(figsize=(100, 8))
sns.countplot(x="channel", data=train_sample)

In [None]:
# os: there are a couple of common OSes (android and ios?), though some are rare and can indicate suspicion
plt.figure(figsize=(14, 8))
sns.countplot(x="os", data=train_sample)

Let's now look at the distribution of the target variable 'is_attributed'.

In [None]:
# target variable distribution
100 * (train_sample['is_attributed'].astype('object').value_counts()/len(train_sample.index))

# **Exploring the Data - Segmented Univariate Analysis**

In [None]:
# plot the average of 'is_attributed', or 'download rate'
# with app (clearly this is non-readable)

app_target = train_sample.groupby('app').is_attributed.agg(['mean','count'])
app_target

This is clearly not readable, so let's first drop the rare apps — keeping only the apps whose click count is above the 80th percentile of per-app click counts (roughly the most-clicked 20% of apps) — and plot the rest.

In [None]:
frequent_apps = train_sample.groupby('app').size().reset_index(name='count')
frequent_apps = frequent_apps[frequent_apps['count']>frequent_apps['count'].quantile(0.80)]
frequent_apps = frequent_apps.merge(train_sample,on='app',how='inner')
frequent_apps.head()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y="app", hue="is_attributed", data=frequent_apps)

We can do lots of other interesting analysis with the existing features. For now, let's create some new features which will probably improve the model.

# **Feature Engineering**

Let's now derive some new features from the existing ones. There are a number of features one can extract from click_time itself, and by grouping combinations of IP with other features.

# **Datetime Based Features**

In [None]:
# Creating datetime variables
# takes in a df, adds date/time based columns to it, and returns the modified df

def timeFeatures(df):
    """Add calendar features parsed from ``click_time`` to ``df``.

    The frame is modified in place and also returned. Columns added, in
    order: ``datetime`` (the parsed timestamp), ``day_of_week``,
    ``day_of_year``, ``month`` and ``hour``.
    """
    stamps = pd.to_datetime(df['click_time'])
    df['datetime'] = stamps

    # (new column name, attribute of the .dt accessor it is read from)
    derived = (
        ('day_of_week', 'dayofweek'),
        ('day_of_year', 'dayofyear'),
        ('month', 'month'),
        ('hour', 'hour'),
    )
    for column, attribute in derived:
        df[column] = getattr(stamps.dt, attribute)

    return df


In [None]:
# creating new datetime variables and dropping the old ones
train_sample = timeFeatures(train_sample)
train_sample.drop(['click_time','datetime'], axis=1, inplace=True)
train_sample.head()


In [None]:
# datatypes
# note that by default the new datetime variables are int64
train_sample.dtypes


In [None]:
# memory used by training data
print('Training dataset uses {0} MB'.format(train_sample.memory_usage().sum()/1024**2))

In [None]:
# lets convert the variables back to lower dtype again
int_vars = ['app', 'device', 'os', 'channel', 'day_of_week','day_of_year', 'month', 'hour']
train_sample[int_vars] = train_sample[int_vars].astype('uint16')

In [None]:
train_sample.dtypes

In [None]:
# space used by training data
print('Training dataset uses {0} MB'.format(train_sample.memory_usage().sum()/1024**2))

# **IP Grouping Based Features**

Let's now create some important features by grouping IP addresses with features such as os, channel, hour, day etc. Also, count of each IP address will also be a feature.

Note that though we are deriving new features by grouping IP addresses, using the IP address itself as a feature is not a good idea. This is because (in the test data) if a new IP address is seen, the model will see a new 'category' and will not be able to make predictions (IP is a categorical variable, it has just been encoded with numbers).

In [None]:
# number of clicks per IP address
# only the aggregated count column is re-encoded to a smaller dtype to save
# memory; the 'ip' key keeps its own dtype (casting the whole frame to
# 'int16' would wrap any ip above 32,767 to a negative number)
ip_count = train_sample.groupby('ip').size().reset_index(name='ip_count')
ip_count['ip_count'] = ip_count['ip_count'].astype('uint16')
ip_count.head()


We can now merge this dataframe with the original training df. Similarly, we can create combinations of various features such as ip_day_hour (count of ip-day-hour combinations), ip_hour_channel, ip_hour_app, etc.

The following function takes in a dataframe and creates these features.

In [None]:
# creates groupings of IP addresses with other features and appends the new features to the df

def _group_counts(frame, keys, name, counted=None):
    """Count the rows of ``frame`` per ``keys`` combination as column ``name``.

    With ``counted=None`` every row of a group is counted; otherwise only the
    non-null entries of the ``counted`` column are. Only the count column is
    down-cast -- the key columns keep the dtype they have in ``frame`` so a
    later merge on them still matches.
    """
    grouped = frame.groupby(keys)
    counts = grouped.size() if counted is None else grouped[counted].count()
    counts = counts.reset_index(name=name)

    # uint16 whenever the largest count fits, uint32 otherwise, so large
    # groups are never wrapped around
    fits_uint16 = counts[name].max() <= np.iinfo(np.uint16).max
    counts[name] = counts[name].astype('uint16' if fits_uint16 else 'uint32')
    return counts


def grouped_features(df):
    """Append IP-based click-count features to ``df`` and return the result.

    For each key combination below, the number of clicks sharing it is
    computed from ``df`` and left-merged back as a new column:

    * ip                    -> ip_count
    * ip, day_of_week, hour -> ip_day_hour
    * ip, hour, channel     -> ip_hour_channel
    * ip, hour, os          -> ip_hour_os
    * ip, hour, app         -> ip_hour_app
    * ip, hour, device      -> ip_hour_device

    The aggregated frames used to be cast to uint16 wholesale, which also
    truncated the merge keys: any key above 65,535 stopped matching and the
    feature came back as NaN. Now only the count column is down-cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ip, app, device, os, channel, day_of_week and hour.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by the six count columns.
        ``df`` itself is not modified.
    """
    # (grouping keys, new column, column whose non-null values are counted)
    specs = [
        (['ip'], 'ip_count', None),
        (['ip', 'day_of_week', 'hour'], 'ip_day_hour', None),
        (['ip', 'hour', 'channel'], 'ip_hour_channel', None),
        (['ip', 'hour', 'os'], 'ip_hour_os', 'channel'),
        (['ip', 'hour', 'app'], 'ip_hour_app', 'channel'),
        (['ip', 'hour', 'device'], 'ip_hour_device', 'channel'),
    ]

    merged = df
    for keys, name, counted in specs:
        # always aggregate the untouched input, then attach to the running result
        counts = _group_counts(df, keys, name, counted)
        merged = pd.merge(merged, counts, on=keys, how='left')
        del counts

    return merged

In [None]:
train_sample = grouped_features(train_sample)


In [None]:
train_sample.head()


In [None]:
print('Training dataset uses {0} MB'.format(train_sample.memory_usage().sum()/1024**2))

In [None]:
import gc
# garbage collect (unused) object
gc.collect()

# **Modelling**

Let's now build models to predict the variable is_attributed (downloaded). We'll try several variants of boosting (AdaBoost, gradient boosting and XGBoost), tune the hyperparameters in each model and choose the one which gives the best performance.

In the Kaggle competition, the metric for model evaluation is area under the ROC curve.

In [None]:
# create x and y train
# y is a 1-D Series (not a one-column DataFrame): scikit-learn estimators
# expect a 1-D target and otherwise have to reshape it with a warning
# NOTE(review): X still contains the raw 'ip' column -- confirm that is intended
X = train_sample.drop('is_attributed', axis=1)
y = train_sample['is_attributed']

# split data into train and test/validation sets
# stratify on y so both parts keep the same share of the rare positive class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=101, stratify=y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# check the average download rates in train and test data, should be comparable
print(y_train.mean())
print(y_test.mean())

# **AdaBoost Classifier**

In [None]:
# AdaBoost classifier with at most 600 decision trees of depth 2
# learning_rate/shrinkage = 1.5

# base estimator
tree = DecisionTreeClassifier(max_depth=2)

# AdaBoost with the tree as base estimator.
# The estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the positional form works before and after the rename.
adaboost_model_1 = AdaBoostClassifier(tree,
                                      n_estimators=600,
                                      learning_rate=1.5,
                                      algorithm="SAMME")

In [None]:
# fit
adaboost_model_1.fit(X_train,y_train)

In [None]:
# predictions
# the second column represents the probability of click resulting in a download

predictions = adaboost_model_1.predict_proba(X_test)
predictions[:10]

In [None]:
# metrics : AUC
metrics.roc_auc_score(y_test,predictions[:,1])

# **AdaBoost - Hyperparameter Tuning**

In [None]:
# parameter grid
param_grid = {"base_estimator__max_depth" : [2,5],
             "n_estimators" : [200,400,600]
             }

In [None]:
# base estimator
tree = DecisionTreeClassifier()

# adaboost with the tree as base estimator
# learning rate is arbitrarily set to 0.6
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator` and
# 1.4 removed the old name. On a newer install this keyword, the
# 'base_estimator__max_depth' grid key and the matching cv_results_ column
# name all have to be renamed together -- confirm the installed version.

ABC = AdaBoostClassifier(base_estimator=tree,
                        learning_rate=0.6,
                        algorithm="SAMME")

In [None]:
# run grid search
folds = 3
grid_search_ABC = GridSearchCV(ABC,
                              cv=folds,
                              param_grid=param_grid,
                              scoring='roc_auc',
                              return_train_score=True,
                              verbose=1)

In [None]:
# fit
grid_search_ABC.fit(X_train, y_train)

In [None]:
# cv results
cv_results = pd.DataFrame(grid_search_ABC.cv_results_)
cv_results

In [None]:
# plotting AUC with hyperparameter combinations

# one panel per max_depth value that was searched
depths = param_grid['base_estimator__max_depth']

plt.figure(figsize=(16,6))
for n, depth in enumerate(depths):

    # subplot n+1 of len(depths): the panel count follows the grid instead of
    # a hard-coded 3, so no empty panel is left over
    plt.subplot(1, len(depths), n+1)
    depth_df = cv_results[cv_results['param_base_estimator__max_depth']==depth]

    plt.plot(depth_df["param_n_estimators"], depth_df["mean_test_score"])
    plt.plot(depth_df["param_n_estimators"], depth_df["mean_train_score"])
    plt.xlabel('n_estimators')
    plt.ylabel('AUC')
    plt.title("max_depth={0}".format(depth))
    plt.ylim([0.60, 1])
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.xscale('log')

The results above show that:

- The ensemble with max_depth=5 is clearly overfitting (training AUC is almost 1, while the test score is much lower).
- At max_depth=2, the model performs slightly better (approx. 95% AUC), with a higher test score.

Thus, we should go ahead with max_depth=2 and n_estimators=200.

Note that we haven't experimented with many other important hyperparameters till now, such as learning rate, subsample etc., and the results might be considerably improved by tuning them. We'll next experiment with these hyperparameters.

In [None]:
# model performance on test data with chosen hyperparameters

# base estimator
tree = DecisionTreeClassifier(max_depth=2)

# AdaBoost with the tree as base estimator
# learning rate is arbitrarily set, we'll discuss learning_rate below
# The estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the positional form works before and after the rename.
ABC = AdaBoostClassifier(
    tree,
    learning_rate=0.6,
    n_estimators=200,
    algorithm="SAMME")

ABC.fit(X_train, y_train)

In [None]:
# predict on test data
predictions = ABC.predict_proba(X_test)
predictions[:10]

In [None]:
# roc auc
metrics.roc_auc_score(y_test, predictions[:, 1])

# **Gradient Boosting Classifier**

In [None]:
# parameter grid
param_grid = {"learning_rate": [0.2, 0.6, 0.9],
              "subsample": [0.3, 0.6, 0.9]
             }

In [None]:
# gradient boosting classifier: 200 boosting stages, trees of max depth 2
GBC = GradientBoostingClassifier(max_depth=2, n_estimators=200)

In [None]:
# run grid search
folds = 3
grid_search_GBC = GridSearchCV(GBC,
                               cv = folds,
                               param_grid=param_grid,
                               scoring = 'roc_auc',
                               return_train_score=True,
                               verbose = 1)

grid_search_GBC.fit(X_train, y_train)

In [None]:
cv_results = pd.DataFrame(grid_search_GBC.cv_results_)
cv_results.head()

In [None]:
# # plotting
plt.figure(figsize=(16,6))


for n, subsample in enumerate(param_grid['subsample']):


    # subplot 1/n
    plt.subplot(1,len(param_grid['subsample']), n+1)
    df = cv_results[cv_results['param_subsample']==subsample]

    plt.plot(df["param_learning_rate"], df["mean_test_score"])
    plt.plot(df["param_learning_rate"], df["mean_train_score"])
    plt.xlabel('learning_rate')
    plt.ylabel('AUC')
    plt.title("subsample={0}".format(subsample))
    plt.ylim([0.60, 1])
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.xscale('log')

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Instantiate the Gradient Boosting Classifier (default hyperparameters)
gb_classifier = GradientBoostingClassifier()

# Fit the model on the training data
gb_classifier.fit(X_train, y_train)

# Hard class labels (for accuracy / classification report) and
# positive-class probabilities (for ROC AUC) on the test data
predictions_gb = gb_classifier.predict(X_test)
probabilities_gb = gb_classifier.predict_proba(X_test)[:, 1]

# Evaluate the model
# ROC AUC is a ranking metric: it must be fed scores, not thresholded 0/1
# labels, otherwise it only measures a single operating point
accuracy_gb = accuracy_score(y_test, predictions_gb)
roc_auc_gb = roc_auc_score(y_test, probabilities_gb)
classification_report_gb = classification_report(y_test, predictions_gb)

# Print the accuracy, ROC AUC, and classification report
print("Gradient Boosting Classifier Accuracy:", accuracy_gb)
print("Gradient Boosting Classifier ROC AUC Score:", roc_auc_gb)
print("Classification Report:")
print(classification_report_gb)

# Plot feature importance
plt.figure(figsize=(10,6))
sns.barplot(x=gb_classifier.feature_importances_, y=X_train.columns)
plt.title('Feature Importance Plot - Gradient Boosting Classifier')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.show()


It is clear from the learning-rate/subsample plots of the grid-search results above that the model with a lower subsample ratio performs better, while those with higher subsamples tend to overfit.

Also, a lower learning rate results in less overfitting.

# **XGBoost (Extreme Gradient Boosting) Classifier**

In [None]:
# fit model on training data with default hyperparameters
model = XGBClassifier()
model.fit(X_train,y_train)

In [None]:
# make predictions for test data
# use predict_proba since we need probabilities to compute auc
y_pred = model.predict_proba(X_test)
y_pred[:10]

In [None]:
# evaluate predictions
roc = metrics.roc_auc_score(y_test,y_pred[:,1])
print("AUC : %.2f%%" %(roc * 100.0))

The ROC AUC in this case is about 0.95 (i.e. 95%) with default hyperparameters. Let's try changing the hyperparameters.

Let's now try tuning the hyperparameters using k-fold CV. We'll then use grid search CV to find the optimal values of hyperparameters.

In [None]:
# hyperparameter tuning with XGBoost
# number of cross-validation folds (a plain integer passed as `cv`, not a KFold object)
folds = 3
# specify range of hyperparameters
param_grid = {'learning_rate' : [0.2,0.6],
             'subsample' : [0.3,0.6,0.9]
             }
# specify model
xgb_model = XGBClassifier(max_depth=2,n_estimators=200)
# set up GridSearchCV(), scored by ROC AUC, keeping train scores for the plots
model_cv = GridSearchCV(estimator = xgb_model,
                       param_grid = param_grid,
                       scoring = 'roc_auc',
                       cv = folds,
                       verbose = 1,
                       return_train_score = True)

In [None]:
# fit the model
model_cv.fit(X_train,y_train)

In [None]:
# cv results
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
# cast learning_rate to float so it can be used as a numeric x-axis when plotting
cv_results['param_learning_rate'] = cv_results['param_learning_rate'].astype('float')
cv_results.head()

In [None]:
# # plotting
plt.figure(figsize=(16,6))

param_grid = {'learning_rate': [0.2, 0.6],
             'subsample': [0.3, 0.6, 0.9]}


for n, subsample in enumerate(param_grid['subsample']):


    # subplot 1/n
    plt.subplot(1,len(param_grid['subsample']), n+1)
    df = cv_results[cv_results['param_subsample']==subsample]

    plt.plot(df["param_learning_rate"], df["mean_test_score"])
    plt.plot(df["param_learning_rate"], df["mean_train_score"])
    plt.xlabel('learning_rate')
    plt.ylabel('AUC')
    plt.title("subsample={0}".format(subsample))
    plt.ylim([0.60, 1])
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.xscale('log')

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Instantiate the XGBoost classifier
xgb_model = XGBClassifier()

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
predictions_xgb = xgb_model.predict(X_test)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
classification_report_xgb = classification_report(y_test, predictions_xgb)

# Print the accuracy and classification report
print("XGBoost Classifier Accuracy:", accuracy_xgb)
print("Classification Report:")
print(classification_report_xgb)


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Instantiate the XGBoost classifier (default hyperparameters)
xgb_model = XGBClassifier()

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Hard class labels (for accuracy / classification report) and
# positive-class probabilities (for ROC AUC) on the test data
predictions_xgb = xgb_model.predict(X_test)
probabilities_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
# ROC AUC is a ranking metric: it must be fed scores, not thresholded 0/1
# labels, otherwise it only measures a single operating point
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
roc_auc_xgb = roc_auc_score(y_test, probabilities_xgb)
classification_report_xgb = classification_report(y_test, predictions_xgb)

# Print the accuracy, ROC AUC, and classification report
print("XGBoost Classifier Accuracy:", accuracy_xgb)
print("XGBoost Classifier ROC AUC Score:", roc_auc_xgb)
print("Classification Report:")
print(classification_report_xgb)

# Plot feature importance
plt.figure(figsize=(10,6))
sns.barplot(x=xgb_model.feature_importances_, y=X_train.columns)
plt.title('Feature Importance Plot - XGBoost')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.show()


# **Random Forest Classifier:**

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Instantiate the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_classifier.fit(X_train, y_train)

# Hard class labels (for accuracy and the confusion matrix) and
# positive-class probabilities (for ROC AUC) on the test data
rf_predictions = rf_classifier.predict(X_test)
rf_probabilities = rf_classifier.predict_proba(X_test)[:, 1]

# Evaluate the model
# ROC AUC is a ranking metric: it must be fed scores, not thresholded 0/1
# labels, otherwise it only measures a single operating point
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_roc_auc = roc_auc_score(y_test, rf_probabilities)

print("Random Forest Classifier Accuracy:", rf_accuracy)
print("Random Forest Classifier ROC AUC Score:", rf_roc_auc)


In [None]:
# Plot feature importances
plt.figure(figsize=(10, 6))
feat_importances = pd.Series(rf_classifier.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title('Top 10 Feature Importances')
plt.xlabel('Relative Importance')
plt.ylabel('Feature')
plt.show()



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, rf_predictions)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


# **Support Vector Machine (SVM) Classifier**

In [None]:
# Instantiate the SVM classifier
# NOTE(review): the features are not scaled before fitting a linear-kernel
# SVC -- confirm this is intended, it can make training slow
svm_classifier = SVC(kernel='linear', random_state=42)

# Fit the model on the training data
svm_classifier.fit(X_train, y_train)

# Hard class labels (for accuracy and the confusion matrix) and signed
# distances to the separating hyperplane (for ROC AUC) on the test data.
# SVC has no predict_proba unless probability=True; decision_function
# provides the continuous score ROC AUC needs.
svm_predictions = svm_classifier.predict(X_test)
svm_scores = svm_classifier.decision_function(X_test)

# Evaluate the model (metric functions come from the `metrics` module
# imported at the top, so this cell does not rely on imports of earlier cells)
svm_accuracy = metrics.accuracy_score(y_test, svm_predictions)
svm_roc_auc = metrics.roc_auc_score(y_test, svm_scores)

print("SVM Classifier Accuracy:", svm_accuracy)
print("SVM Classifier ROC AUC Score:", svm_roc_auc)


Confusion Matrix:

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Calculate the confusion matrix
cm = confusion_matrix(y_test, svm_predictions)

# Plot confusion matrix heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


# **Logistic Regression classifier:**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Instantiate the Logistic Regression classifier
log_reg = LogisticRegression()

# Fit the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the test data
predictions_lr = log_reg.predict(X_test)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, predictions_lr)
classification_report_lr = classification_report(y_test, predictions_lr)

# Print the accuracy and classification report
print("Logistic Regression Classifier Accuracy:", accuracy_lr)
print("Classification Report:")
print(classification_report_lr)


Confusion Matrix:

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_lr)

# Plot confusion matrix heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
