<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Load-and-Prepare-Data" data-toc-modified-id="Load-and-Prepare-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load and Prepare Data</a></span></li><li><span><a href="#Support-Vector-Machines" data-toc-modified-id="Support-Vector-Machines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Support Vector Machines</a></span></li><li><span><a href="#Classification" data-toc-modified-id="Classification-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Classification</a></span></li></ul></div>

# Learning Methodology Implementation

In [None]:
# Import Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import scipy.sparse
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

# Programming tools
import os
import sys
import gc

# Notebook options
%matplotlib inline

## Load and Prepare Data

In [None]:
# Load the training features (SciPy sparse matrix stored as .npz) and the labels (.npy)
X_train = scipy.sparse.load_npz('X_train1.npz')
y_train = np.load('y_train1.npy')

# Check data type
display(type(X_train))
display(type(y_train))

# Prepare Train and Test Data
# Hold out 33% of the rows for evaluation.
# - random_state pins the shuffle so a Restart & Run All reproduces the same
#   split, and therefore the same accuracies in every tuning cell below.
# - stratify keeps the class proportions of y_train in both parts, which
#   matters when one class is rare.
from sklearn.model_selection import train_test_split
trainX, testX, trainy, testy = train_test_split(
    X_train, y_train, test_size=0.33, random_state=42, stratify=y_train
)

## Support Vector Machines

In [None]:
# Import Packages
from sklearn import svm

# Class balance of the full label vector, displayed as {label: count}
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Parameters Preparation
# Candidate grids for the SVC searches below, spaced in powers of two:
#   Cs     - penalty parameter C, from 2**-5 up to 2**15
#   gammas - RBF kernel coefficient gamma, from 2**-15 up to 2**5
Cs = (2**-5,2**-4,2**-3,2**-2,2**-1,1,2,2**3,2**5,2**7,2**9,2**11,2**13,2**15)
gammas = (2**-15,2**-13,2**-11,2**-9,2**-7,2**-5,2**-3,2**-1,1,2,2**3,2**5)
# Only the grids that are actually defined are printed; the previous
# reference to an undefined `degrees` raised NameError.
print(gammas,Cs)

In [None]:
# Linear Kernel: Tune parameters
# Fit one linear-kernel SVC per penalty value in Cs and record its accuracy
# on the held-out split (testX, testy).
# class_weight multiplies the penalty for class 1 by 99773/227
# (presumably the negative/positive count ratio -- confirm against the
# class counts shown above).
accuracy_svm_li = []
for C in Cs:
    svm_linear = svm.SVC(kernel='linear', C=C, class_weight={1:99773/227})
    svm_linear.fit(trainX, trainy)
    y_predict_svm_linear = svm_linear.predict(testX)
    # accuracy_score is documented as (y_true, y_pred)
    accuracy_svm_linear = accuracy_score(testy, y_predict_svm_linear)
    accuracy_svm_li.append(accuracy_svm_linear)

# Plot Linear Kernel: Penalty Parameter vs Accuracy
# The C grid spans 2**-5 .. 2**15, so a log x-axis keeps the small values
# readable; the label gives plt.legend() an entry to show.
plt.plot(Cs, accuracy_svm_li, marker='o', label='Linear kernel')
plt.xscale('log')
plt.ylabel('Accuracy for SVC with Linear Kernel')
plt.xlabel('Penalty Parameter C')
plt.title('SVC with Linear Kernel')
plt.legend()
plt.show()

In [None]:
# RBF Kernel: Tune parameters
# Sweep gamma over the whole `gammas` grid with C fixed at Cs[0], recording
# one held-out accuracy per gamma. The plotting cell below draws `gammas`
# against this list, so it needs exactly len(gammas) entries.
# After the loop, svm_rbf / y_predict_svm_rbf / accuracy_svm_rbf refer to the
# last gamma in the grid.
accuracy_svm_rbf_ = []

for gamma in gammas:
    svm_rbf = svm.SVC(kernel='rbf', gamma=gamma, C=Cs[0], class_weight={1:99773/227})
    svm_rbf.fit(trainX, trainy)
    y_predict_svm_rbf = svm_rbf.predict(testX)
    # accuracy_score is documented as (y_true, y_pred)
    accuracy_svm_rbf = accuracy_score(testy, y_predict_svm_rbf)
    accuracy_svm_rbf_.append(accuracy_svm_rbf)

In [None]:
# Show the held-out accuracies collected by the RBF tuning cell
print(accuracy_svm_rbf_)

In [None]:
# Plot for Gammas and Accuracy
# Requires one accuracy per entry of `gammas` (plt.plot needs equal lengths).
# The gamma grid spans 2**-15 .. 2**5, so a log x-axis is used; the label
# gives plt.legend() an entry to show.
plt.plot(gammas, accuracy_svm_rbf_, marker='o', label='RBF kernel')
plt.xscale('log')
plt.ylabel('Accuracy for SVC with RBF Kernel')
plt.xlabel('Values of Gammas')
plt.title('SVC with RBF Kernel')
plt.legend()
plt.show()

## Classification

In [None]:
# Load Packages and Prepare Parameters
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Search grids for the ensemble models below
estimators_sizes = [10,20,30,40,50,60,80,100,120,140,200]
max_depths = range(1,4)
learn_rate = [0.1,0.3,0.5,0.7,0.9,1.1,1.3,1.5,1.7,1.9]

# Per-row inverse-frequency weights, consistent with the
# class_weight={1: 99773/227} used by the other models: class 1 rows get the
# larger weight 1/227, all other rows get 1/99773. (The previous version had
# the two weights swapped, which up-weighted the majority class.)
# NOTE(review): 227 / 99773 are presumably the class counts of the full
# y_train -- confirm against the class-count cell above.
sample_weight = [1/227 if x==1 else 1/99773 for x in trainy ]

In [None]:
#AdaBoostClassifier: Tune Parameters
# Fit one AdaBoost ensemble of depth-6 decision trees per ensemble size in
# estimators_sizes and record its accuracy on the held-out split.
accuracy_adab_ = []
for n_estimators in estimators_sizes:
    dt = DecisionTreeClassifier(max_depth=6)
    # The base learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while
    # the first positional slot works with both.
    adab = AdaBoostClassifier(dt, n_estimators=n_estimators)
    # No per-sample weights are used (same as the previous sample_weight=None)
    adab.fit(trainX, trainy)
    y_predict_adab = adab.predict(testX)
    # accuracy_score is documented as (y_true, y_pred)
    accuracy_adab = accuracy_score(testy, y_predict_adab)
    accuracy_adab_.append(accuracy_adab)

# Plot Accuracy vs Number of Estimators
# The label gives plt.legend() an entry to show.
plt.plot(estimators_sizes, accuracy_adab_, marker='o', label='AdaBoost (tree depth 6)')
plt.ylabel('Accuracy for AdaBoostClassifier')
plt.xlabel('Estimators_sizes')
plt.title('AdaBoostClassifier')
plt.legend()
plt.show()

In [None]:
# Random Forest
# Candidate values for max_features (features considered at each split)
feature_sizes = range(1,20)
accuracy_rf_=[]

# Tune the parameters
# One 10-tree forest per max_features value, with max_depth fixed at
# max_depths[2] and class 1 weighted by 99773/227; accuracy is measured on
# the held-out split.
for max_features in feature_sizes:
    rf = RandomForestClassifier(
        n_estimators=10,
        max_depth=max_depths[2],
        max_features=max_features,
        class_weight={1:99773/227},
    )
    rf.fit(trainX,trainy)
    y_predict_rf = rf.predict(testX)
    # accuracy_score is documented as (y_true, y_pred)
    accuracy_rf = accuracy_score(testy, y_predict_rf)
    accuracy_rf_.append(accuracy_rf)
print(accuracy_rf_)

# Plot Accuracy vs Maximum Number of Features
# The label gives plt.legend() an entry to show.
plt.plot(feature_sizes, accuracy_rf_, marker='o', label='RandomForest (10 trees)')
plt.ylabel('Accuracy for RandomForest')
plt.xlabel('Feature_sizes')
plt.title('RandomForestClassifier')
plt.legend()
plt.show()

In [None]:
# LightGBM Model
import lightgbm as lgb

# Column names. These are defined before `predictors` is built from them;
# previously they were referenced before assignment, which raised NameError.
cat_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_minute_mod15', 'click_second_mod5']
num_cols = ['click_hour', 'click_minute', 'click_second', 
            'clicks_by_ip', 'downloads_by_ip', 'download_ratio_by_ip', 
            'clicks_by_app', 'downloads_by_app', 'download_ratio_by_app', 
            'clicks_by_device', 'downloads_by_device', 'download_ratio_by_device', 
            'clicks_by_os', 'downloads_by_os', 'download_ratio_by_os', 
            'clicks_by_channel', 'downloads_by_channel', 'download_ratio_by_channel']
target_col = 'is_attributed'

# Prepare Variables and Parameters
# Concatenate into a new list: list.extend() returns None and would also
# mutate cat_cols in place.
# NOTE(review): the number of names must match the number of columns in
# trainX -- confirm against how X_train1.npz was built.
predictors = cat_cols + num_cols
leaf_sizes = range(25,40)
# Learning-rate grid in thousandths (the tuning cell divides by 1000.0).
# This rebinds the `learn_rate` list defined for the ensemble models.
learn_rate = range(1,110)
min_data_in_leaf_sizes = range(30,100,10)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 45
params['min_data_in_leaf'] = min_data_in_leaf_sizes[0]
params['max_depth'] = max_depths[2]

# Implement Model 
# Train a baseline booster for 100 boosting rounds on the training split.
d_train = lgb.Dataset(trainX, feature_name=predictors, label=trainy)
lightgbm = lgb.train(params, d_train, 100)

In [None]:
# Held-out accuracy per learning rate, and the learning rates actually used
accuracy_lightgbm_=[]
learn = []

# Tune Parameters
# Iterate over every entry of learn_rate instead of a hard-coded count, so
# the sweep cannot index past the end of the grid. Each entry is given in
# thousandths and converted to the actual learning rate here.
for rate_in_thousandths in learn_rate:
    params['learning_rate'] = rate_in_thousandths/1000.0
    learn.append(params['learning_rate'])
    lightgbm = lgb.train(params, d_train, 100)
    # predict() yields scores for the binary objective; threshold at .5 to
    # get 0/1 labels (vectorised instead of an element-wise Python loop)
    y_predict_lightgbm = (lightgbm.predict(testX) >= .5).astype(int)
    # accuracy_score is documented as (y_true, y_pred)
    accuracy_lightgbm = accuracy_score(testy, y_predict_lightgbm)
    accuracy_lightgbm_.append(accuracy_lightgbm)

# Plot accuracy against learning rate
# The label gives plt.legend() an entry to show.
plt.plot(learn, accuracy_lightgbm_, label='LightGBM (100 boosting rounds)')
plt.ylabel('Accuracy for LightGBM')
plt.xlabel('Learning_rate')
plt.title('LightGBM')
plt.legend()
plt.show()