<a href="https://colab.research.google.com/github/jajsmith/pml4dc/blob/master/PML4DC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Analysis of Gradient Boosting Models for Startups in Developing Countries

Many startups in developing countries are interested to see if machine learning models can help them solve problems, but have limited computational resources. For startups working on tabular data they may have heard that gradient boosting models are superior for performance. While it has been used to win many competitions and is gaining traction in North American industries, let's see if it will perform as well in resource-constrained environments.

References:
https://arxiv.org/pdf/1708.05070.pdf


## Setup


In [0]:
!pip install pmlb
!pip install xgboost
!pip install lightgbm
!pip install memory_profiler
!pip install line_profiler
!pip install seaborn

Collecting pmlb
  Downloading https://files.pythonhosted.org/packages/0d/38/bb7bf1785add978af1e6c2bd27926405959c71ebaeefa6ff610bf53d8b25/pmlb-0.3.tar.gz
Building wheels for collected packages: pmlb
  Building wheel for pmlb (setup.py) ... [?25l[?25hdone
  Created wheel for pmlb: filename=pmlb-0.3-cp36-none-any.whl size=11918 sha256=9ea9a3ed4e4be9e605e7fd4c31d5c11e580f5f35aa2403f076940330ec92ccc1
  Stored in directory: /root/.cache/pip/wheels/5a/c7/a4/be59e63a2cb56f6c58f068305b95c212e0aac1a930fd77d6b0
Successfully built pmlb
Installing collected packages: pmlb
Successfully installed pmlb-0.3
Collecting memory_profiler
  Downloading https://files.pythonhosted.org/packages/f4/03/175d380294b2333b9b79c2f2aa235eb90ee95e3ddef644497a9455404312/memory_profiler-0.57.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.57.0-cp36-none-any.whl size=28992

In [0]:
%load_ext memory_profiler


In [0]:
%load_ext line_profiler

In [0]:
import numpy as np
import pandas as pd

import seaborn as sb
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import torch
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier


from pmlb import fetch_data, classification_dataset_names, regression_dataset_names

import matplotlib
import matplotlib.pyplot as plt

import math
import time
import cProfile

## Datasets from PMLB

In [0]:
sizes = []
for dataset in classification_dataset_names:
    X, y = fetch_data(dataset, return_X_y=True)
    instances = X.shape[0]
    features = X.shape[1]
    classes = len(np.unique(y))
    size = X.shape[0] * X.shape[1]
    print(size, classes, instances, features, dataset)
    sizes.append([dataset, size, instances, features, classes])
df_s = pd.DataFrame(sizes, columns=['dataset', 'size', 'instances', 'features', 'classes'])

In [0]:
df_s.sort_values('size', ascending=False)

Unnamed: 0,dataset,size,instances,features,classes
108,mnist,54880000,70000,784,10
91,kddcup,20254820,494020,41,23
126,poker,10250090,1025009,10,10
72,fars,2928072,100968,29,8
61,connect-4,2837394,67557,42,3
...,...,...,...,...,...
15,analcatdata_asbestos,249,83,3,2
60,confidence,216,72,3,6
14,analcatdata_aids,200,50,4,2
26,analcatdata_happiness,180,60,3,3


In [0]:
biggest_10 = df_s.sort_values('size', ascending=False).head(10)['dataset'].to_numpy()

## Gradient Boosting Methods Evaluation

In [0]:
def compare(datasets, models=['log','gnb','xgb','lgb', 'dtc'], verbose=False):
    logit_test_scores = []
    gnb_test_scores = []
    xgbm_test_scores = []
    lgbm_test_scores = []
    dtc_test_scores = []


    for i, classification_dataset in enumerate(datasets):
        if verbose:
            print('Checking dataset: ', classification_dataset, ' (',i,'/',len(datasets),')')
            
        X, y = fetch_data(classification_dataset, return_X_y=True)
        le = preprocessing.LabelEncoder()
        le.fit(y)
        y = le.transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
        eval_set = [(X_test, y_test)]
        classes = np.max(np.unique(y)) + 1

        if 'log' in models:
            logit = LogisticRegression(solver='sag', multi_class='auto')
            logit.fit(X_train, y_train)
            logit_test_scores.append(logit.score(X_test, y_test))

        if 'gnb' in models:
            gnb = GaussianNB()
            gnb.fit(X_train, y_train)
            gnb_test_scores.append(gnb.score(X_test, y_test))
            
        if 'dtc' in models:
            dtc = DecisionTreeClassifier()
            dtc.fit(X_train, y_train)
            dtc_test_scores.append(dtc.score(X_test, y_test))

        if 'xgb' in models:
            xgbm_params = {
                'objective' : 'multi:softmax',
                'num_class' : classes
            }
            xgbm = XGBClassifier(**xgbm_params)
            xgbm.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)
            xgbm_test_scores.append(xgbm.score(X_test, y_test))
            
        if 'lgb' in models:
            lgbm_params = {
                'objective' : 'softmax',
                'num_class' : classes
            }
            #lgbm = LGBMClassifier(**lgbm_params)
            #lgbm.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)

            d_train = lgb.Dataset(X_train, label=y_train)
            d_test = lgb.Dataset(X_test, label=y_test)
            evals_result = {}
            
            lgbm = lgb.train(lgbm_params, d_train, valid_sets=[d_test], early_stopping_rounds=10, evals_result=evals_result, verbose_eval=False)
            
            scores = lgbm.predict(X_test)
            preds = [np.argmax(values) for values in scores]
            s = accuracy_score( preds, y_test )
            lgbm_test_scores.append(s)
        
        #lgbm_test_scores.append(lgbm.score(X_test, y_test))
        #print('Scores: ', logit_test_scores[-1], gnb_test_scores[-1], xgbm_test_scores[-1], lgbm_test_scores[-1])
        
    return logit_test_scores, gnb_test_scores, xgbm_test_scores, lgbm_test_scores, dtc_test_scores

In [0]:
#results = compare(classification_dataset_names[50:70])
results = compare(biggest_10, verbose=True)

Checking dataset:  mnist  ( 0 / 10 )




Checking dataset:  kddcup  ( 1 / 10 )




Checking dataset:  poker  ( 2 / 10 )




Checking dataset:  fars  ( 3 / 10 )




Checking dataset:  connect-4  ( 4 / 10 )




Checking dataset:  GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1  ( 5 / 10 )




Checking dataset:  sleep  ( 6 / 10 )
Checking dataset:  clean2  ( 7 / 10 )




Checking dataset:  coil2000  ( 8 / 10 )




Checking dataset:  adult  ( 9 / 10 )




In [0]:
sb.boxplot(data=results)
plt.xticks([0, 1, 2, 3, 4], ['LogisticRegression', 'GaussianNB', 'XGBoost', 'LightGBM', 'DecisionTree'])
plt.ylabel('Test Accuracy')

## Profilers

In [0]:
cProfile.run('results = compare(classification_dataset_names[63:70])')



         138470 function calls (137765 primitive calls) in 5.543 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(all)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amax)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amin)
     1871    0.001    0.000    0.008    0.000 <__array_function__ internals>:2(argmax)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(argsort)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_split)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
       28    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(average)
        7    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(bincount)
        7    0.000    0.000    0.00

        7    0.000    0.000    0.000    0.000 common.py:124(_stringify_path)
        7    0.000    0.000    0.000    0.000 common.py:1284(is_datetime_or_timedelta_dtype)
      141    0.000    0.000    0.000    0.000 common.py:150(cast_scalar_indexer)
        7    0.000    0.000    0.000    0.000 common.py:1553(is_string_like_dtype)
        7    0.000    0.000    0.000    0.000 common.py:1582(<lambda>)
       51    0.000    0.000    0.003    0.000 common.py:1585(is_float_dtype)
       48    0.000    0.000    0.001    0.000 common.py:1619(is_bool_dtype)
      111    0.000    0.000    0.004    0.000 common.py:1684(is_extension_type)
        7    0.000    0.000    4.184    0.598 common.py:169(get_filepath_or_buffer)
      961    0.003    0.000    0.009    0.000 common.py:1743(is_extension_array_dtype)
       14    0.000    0.000    0.000    0.000 common.py:180(ensure_python_int)
       28    0.000    0.000    0.001    0.000 common.py:1825(_is_dtype)
        7    0.000    0.000    0.000    

        7    0.001    0.000    0.239    0.034 logistic.py:1466(fit)
       14    0.000    0.000    0.000    0.000 logistic.py:1598(<genexpr>)
       14    0.000    0.000    0.000    0.000 logistic.py:427(_check_solver)
       14    0.000    0.000    0.000    0.000 logistic.py:463(_check_multi_class)
        7    0.000    0.000    0.231    0.033 logistic.py:658(_logistic_regression_path)
        7    0.000    0.000    0.008    0.001 managers.py:1223(reindex_indexer)
       14    0.000    0.000    0.003    0.000 managers.py:126(__init__)
        7    0.001    0.000    0.006    0.001 managers.py:1273(_slice_take_blocks_ax0)
       14    0.000    0.000    0.000    0.000 managers.py:132(<listcomp>)
       21    0.000    0.000    0.003    0.000 managers.py:1469(__init__)
       56    0.000    0.000    0.000    0.000 managers.py:1522(_block)
        7    0.000    0.000    0.000    0.000 managers.py:1557(dtype)
       14    0.000    0.000    0.000    0.000 managers.py:1581(external_values)
   

        7    0.000    0.000    0.349    0.050 training.py:115(train)
        7    0.004    0.001    0.349    0.050 training.py:15(_train_internal)
        7    0.000    0.000    0.000    0.000 training.py:32(<listcomp>)
        7    0.000    0.000    0.000    0.000 training.py:58(<listcomp>)
        7    0.000    0.000    0.000    0.000 training.py:60(<listcomp>)
      150    0.000    0.000    0.000    0.000 training.py:89(<listcomp>)
      150    0.000    0.000    0.000    0.000 training.py:90(<listcomp>)
      105    0.000    0.000    0.000    0.000 utils.py:51(_has_surrogates)
      273    0.001    0.000    0.002    0.000 validation.py:131(_num_samples)
       91    0.000    0.000    0.005    0.000 validation.py:190(check_consistent_length)
       91    0.000    0.000    0.002    0.000 validation.py:201(<listcomp>)
       14    0.000    0.000    0.001    0.000 validation.py:208(indexable)
       49    0.000    0.000    0.000    0.000 validation.py:325(_ensure_no_complex_data)
      

%lprun -f compare compare(classification_dataset_names[0:1], models='lgb')

In [0]:
%timeit compare(biggest_10, models=['lgb'])

6min 38s ± 2.54 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [0]:
%timeit compare(biggest_10[3:10], models=['xgb'])

KeyboardInterrupt: 

In [0]:
%timeit compare(biggest_10, models=['gnb'])

3.06 s ± 24.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [0]:
%timeit compare(biggest, models=['log'])





3.34 s ± 220 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)




## Scratch Below

In [0]:
np.array(logit_test_scores) - np.array(gnb_test_scores)

array([ 0.00376412,  0.        , -0.03363442,  0.11111111,  0.01872659,
        0.052     ,  0.11538462,  0.09756098, -0.01298701, -0.275     ,
        0.01315789,  0.        ,  0.01470588,  0.23076923, -0.05434783,
        0.02752294,  0.        ,  0.73704172, -0.02272727,  0.        ,
       -0.088     ,  0.01415327,  0.34793492,  0.05674366,  0.13333333])

In [0]:
X, y = fetch_data(classification_dataset_names[89], return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [0]:
y

array([2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [0]:

# XGBoost params
params = {
    'max_depth' : 5,
    'eta' : 0.3,
    'nthread' : 1,
    'objective' : 'multi:softmax',
    'num_class' : 3
}
num_rounds = 500
# specify validation set for performance
#evallist = [(xgb_train, 'train'), (xgb_test, 'test')]
eval_set = [(X_test, y_test)]
model = XGBClassifier(**params)
bst = model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=True)
#bst.save_model('model/05.model')

[0]	validation_0-merror:0.026316
Will train until validation_0-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.026316
[2]	validation_0-merror:0.026316
[3]	validation_0-merror:0.026316
[4]	validation_0-merror:0.026316
[5]	validation_0-merror:0.026316
[6]	validation_0-merror:0.026316
[7]	validation_0-merror:0.026316
[8]	validation_0-merror:0.026316
[9]	validation_0-merror:0.052632
[10]	validation_0-merror:0.052632
Stopping. Best iteration:
[0]	validation_0-merror:0.026316



In [0]:
model.score(X_test, y_test)

0.9736842105263158

In [0]:
le = preprocessing.LabelEncoder()
le.fit(y_train)

LabelEncoder()

In [0]:
le.classes_

array([0, 1, 2, 3, 4, 5, 6, 7])

In [0]:
np.unique(le.transform(y_test))

array([0, 1, 2, 3, 4, 5, 6, 7])

In [0]:
np.unique(le.inverse_transform(y_test))

array([0, 1, 2, 3, 4, 5, 6, 7])

In [0]:

X, y = fetch_data('corral', return_X_y=True)
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
eval_set = [(X_test, y_test)]
classes = np.max(np.unique(y)) + 1

logit = LogisticRegression(solver='sag', multi_class='auto', max_iter=5000)
gnb = GaussianNB()

xgbm_params = {
    'objective' : 'multi:softmax',
    'num_class' : classes
}
xgbm = XGBClassifier(**xgbm_params)

lgbm_params = {
    'objective' : 'softmax',
    'num_class' : classes
}
#lgbm = LGBMClassifier(**lgbm_params)

logit.fit(X_train, y_train)
gnb.fit(X_train, y_train)
xgbm.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)
#lgbm.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)

d_train = lgb.Dataset(X_train, label=y_train)
d_test = lgb.Dataset(X_test, label=y_test)
evals_result = {}
lgbm = lgb.train(lgbm_params, d_train, valid_sets=[d_test], early_stopping_rounds=10, evals_result=evals_result, verbose_eval=False)
scores = lgbm.predict(X_test)
preds = [np.argmax(values) for values in scores]
s = accuracy_score( preds, y_test )
lgbm_test_scores.append(s)

logit_test_scores.append(logit.score(X_test, y_test))
gnb_test_scores.append(gnb.score(X_test, y_test))
xgbm_test_scores.append(xgbm.score(X_test, y_test))
#lgbm_test_scores.append(lgbm.score(X_test, y_test))
print('Scores: ', logit_test_scores[-1], gnb_test_scores[-1], xgbm_test_scores[-1], lgbm_test_scores[-1])


Scores:  0.925 0.9 1.0 1.0


In [0]:
y_train

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0])

In [0]:
p = lgbm.predict(X_test)


[0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0]