# part 3 follow along

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.precision',3)

In [2]:
df = pd.DataFrame({
    "Likes Popcorn" : [1,1,0,1,0,0],
    "Age": [12, 87, 44, 19, 32, 14],
    "Fav Color": ['B','G','B','R','G','B'],
    "Loves Troll": [1,1,0,0,1,1]
})
df

Unnamed: 0,Likes Popcorn,Age,Fav Color,Loves Troll
0,1,12,B,1
1,1,87,G,1
2,0,44,B,0
3,1,19,R,0
4,0,32,G,1
5,0,14,B,1


In [3]:
df = pd.get_dummies(df)
df

Unnamed: 0,Likes Popcorn,Age,Loves Troll,Fav Color_B,Fav Color_G,Fav Color_R
0,1,12,1,True,False,False
1,1,87,1,False,True,False
2,0,44,0,True,False,False
3,1,19,0,False,False,True
4,0,32,1,False,True,False
5,0,14,1,True,False,False


In [4]:
df.columns

Index(['Likes Popcorn', 'Age', 'Loves Troll', 'Fav Color_B', 'Fav Color_G',
       'Fav Color_R'],
      dtype='object')

In [5]:
# rerraange
df = df[['Likes Popcorn', 'Age', 'Fav Color_B', 'Fav Color_G',
       'Fav Color_R', 'Loves Troll']]
df

Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll
0,1,12,True,False,False,1
1,1,87,False,True,False,1
2,0,44,True,False,False,0
3,1,19,False,False,True,0
4,0,32,False,True,False,1
5,0,14,True,False,False,1


## Step:

- log of odd

- convert odds to probability $p = \frac{e^{\text{logodds}}}{ e^{\text{logodds}} + 1}$

In [6]:
## Step 1a
odds = (df['Loves Troll'] == 1).sum()/(df['Loves Troll'] == 0).sum()
logodds = np.log(odds)
df.loc[:,'logodds'] = logodds
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'logodds'] = logodds


Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll,logodds
0,1,12,True,False,False,1,0.693
1,1,87,False,True,False,1,0.693
2,0,44,True,False,False,0,0.693
3,1,19,False,False,True,0,0.693
4,0,32,False,True,False,1,0.693
5,0,14,True,False,False,1,0.693


In [7]:
p = np.exp(logodds)/(1 + np.exp(logodds))
p

df.loc[:,'pred_prob'] = p
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'pred_prob'] = p


Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll,logodds,pred_prob
0,1,12,True,False,False,1,0.693,0.667
1,1,87,False,True,False,1,0.693,0.667
2,0,44,True,False,False,0,0.693,0.667
3,1,19,False,False,True,0,0.693,0.667
4,0,32,False,True,False,1,0.693,0.667
5,0,14,True,False,False,1,0.693,0.667


## Step 2: Calculate residuals

residuals  = observed - prev predictions


In [8]:
df.loc[:,'res_prob'] = df['Loves Troll'] - df['pred_prob']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'res_prob'] = df['Loves Troll'] - df['pred_prob']


Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll,logodds,pred_prob,res_prob
0,1,12,True,False,False,1,0.693,0.667,0.333
1,1,87,False,True,False,1,0.693,0.667,0.333
2,0,44,True,False,False,0,0.693,0.667,-0.667
3,1,19,False,False,True,0,0.693,0.667,-0.667
4,0,32,False,True,False,1,0.693,0.667,0.333
5,0,14,True,False,False,1,0.693,0.667,0.333


## Step 3:

Create a tree to predict residuals


In [9]:
df.columns

Index(['Likes Popcorn', 'Age', 'Fav Color_B', 'Fav Color_G', 'Fav Color_R',
       'Loves Troll', 'logodds', 'pred_prob', 'res_prob'],
      dtype='object')

In [None]:
x = np.array(df[['Likes Popcorn', 'Age', 'Fav Color_B', 'Fav Color_G', 'Fav Color_R']])
y = np.array(df['res_prob'])

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

t1 = DecisionTreeRegressor(max_leaf_nodes=4)
t1.fit(x,y)
tree.plot_tree(t1, feature_names=['Likes Popcorn','Age', 'Fav Color_B', 'Fav Color_G', 'Fav Color_R'])

In [None]:
trs = []
trs.append(t1)

In [None]:
df["leaf"] = t1.apply(x)
df

In [None]:
gp = df.groupby('leaf')

In [None]:
leaf_probRes_to_logoddRes = gp.apply(lambda dummy: dummy["res_prob"].sum()/(dummy["pred_prob"]*(1-dummy["pred_prob"])).sum())
leaf_probRes_to_logoddRes

In [None]:
df['res_logodd'] = [leaf_probRes_to_logoddRes[l] for l in df['leaf']]
df

In [None]:
# update logg odds
alpha = 0.1
df['logodds'] = df['logodds']+ alpha*df['res_logodd']
df



In [None]:
# update prediction probability
df['pred_prob'] = np.exp(df['logodds'])/(np.exp(df['logodds']) + 1)
df

In [None]:
# prediction with probability cutoff 0.5
df['prediction'] = df['pred_prob'] >=0.5
df

In [None]:
# repeat

for i in range(100):

    df['res_prob'] = df['Loves Troll'] - df['pred_prob'] # get pseudo residual

    # fit a tree to predict probability residuals
    x = np.array(df[['Likes Popcorn', 'Age', 'Fav Color_B', 'Fav Color_G', 'Fav Color_R']])
    y = np.array(df['res_prob'])
    t2 = DecisionTreeRegressor(max_leaf_nodes=4)
    t2.fit(x,y)
    trs.append(t2) # save this tree for later use on unseen data


    # transform the probability residuals to log add residuals
    df["leaf"] = t2.apply(x)
    gp = df.groupby('leaf')
    leaf_probRes_to_logoddRes = gp.apply(lambda dummy: dummy["res_prob"].sum()/(dummy["pred_prob"]*(1-dummy["pred_prob"])).sum())
    df['res_logodd'] = [leaf_probRes_to_logoddRes[l] for l in df['leaf']]
    df['pred_prob'] = np.exp(df['logodds'])/(np.exp(df['logodds']) + 1)
    # prediction with probability cutoff 0.5
    df['prediction'] = df['pred_prob'] >=0.5
df


In [39]:
df

Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll,logodds,pred_prob,res_prob,leaf,res_logodd,prediction
0,1,12,True,False,False,1,0.993,0.73,0.27,3,1.37,True
1,1,87,False,True,False,1,0.993,0.73,0.27,6,1.37,True
2,0,44,True,False,False,0,0.093,0.523,-0.523,5,-2.098,True
3,1,19,False,False,True,0,0.093,0.523,-0.523,2,-2.098,True
4,0,32,False,True,False,1,0.993,0.73,0.27,3,1.37,True
5,0,14,True,False,False,1,0.993,0.73,0.27,3,1.37,True


# Gradient boost with sklearn on Toy data

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

clf  = GradientBoostingClassifier(learning_rate=0.1)

x = np.array(df[['Likes Popcorn', 'Age', 'Fav Color_B', 'Fav Color_G', 'Fav Color_R']])
y = np.array(df[['Loves Troll']])

clf.fit(x,y)

  y = column_or_1d(y, warn=True)


In [15]:
df['sklearn_predict'] = clf.predict(x)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sklearn_predict'] = clf.predict(x)


Unnamed: 0,Likes Popcorn,Age,Fav Color_B,Fav Color_G,Fav Color_R,Loves Troll,logodds,pred_prob,res_prob,sklearn_predict
0,1,12,True,False,False,1,0.693,0.667,0.333,1
1,1,87,False,True,False,1,0.693,0.667,0.333,1
2,0,44,True,False,False,0,0.693,0.667,-0.667,0
3,1,19,False,False,True,0,0.693,0.667,-0.667,0
4,0,32,False,True,False,1,0.693,0.667,0.333,1
5,0,14,True,False,False,1,0.693,0.667,0.333,1


# Gradient boost with sklearn on bigger data

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.precision',3)

## Problem: Adult Data set

In [2]:
# for Jupyter-book, we copy data from GitHub, locally, to save Internet traffic,
# you can specify the data/ folder from the root of your cloned
# https://github.com/Yorko/mlcourse.ai repo, to save Internet traffic
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
df = pd.read_csv(DATA_PATH + "adult_train.csv", sep=";")
df.tail()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [4]:
df.drop(columns=['Workclass', 'Occupation', 'Country'],inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   fnlwgt          32561 non-null  int64 
 2   Education       32561 non-null  object
 3   Education_Num   32561 non-null  int64 
 4   Martial_Status  32561 non-null  object
 5   Relationship    32561 non-null  object
 6   Race            32561 non-null  object
 7   Sex             32561 non-null  object
 8   Capital_Gain    32561 non-null  int64 
 9   Capital_Loss    32561 non-null  int64 
 10  Hours_per_week  32561 non-null  int64 
 11  Target          32561 non-null  object
dtypes: int64(6), object(6)
memory usage: 3.0+ MB


In [5]:
df['TargetNew'] = df['Target'].map({' <=50K':0, ' >50K':1 })
df

Unnamed: 0,Age,fnlwgt,Education,Education_Num,Martial_Status,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Target,TargetNew
0,39,77516,Bachelors,13,Never-married,Not-in-family,White,Male,2174,0,40,<=50K,0
1,50,83311,Bachelors,13,Married-civ-spouse,Husband,White,Male,0,0,13,<=50K,0
2,38,215646,HS-grad,9,Divorced,Not-in-family,White,Male,0,0,40,<=50K,0
3,53,234721,11th,7,Married-civ-spouse,Husband,Black,Male,0,0,40,<=50K,0
4,28,338409,Bachelors,13,Married-civ-spouse,Wife,Black,Female,0,0,40,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,Assoc-acdm,12,Married-civ-spouse,Wife,White,Female,0,0,38,<=50K,0
32557,40,154374,HS-grad,9,Married-civ-spouse,Husband,White,Male,0,0,40,>50K,1
32558,58,151910,HS-grad,9,Widowed,Unmarried,White,Female,0,0,40,<=50K,0
32559,22,201490,HS-grad,9,Never-married,Own-child,White,Male,0,0,20,<=50K,0


In [6]:
df['Target'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [8]:
import sklearn.tree as tree
from sklearn.ensemble import GradientBoostingClassifier

featureNames = ['Age', 'Education_Num','Hours_per_week']

x = np.array(df[featureNames])
y = np.array(df['TargetNew'])


In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [10]:
clf = GradientBoostingClassifier()
clf.fit(x_train,y_train)

In [11]:
y_train_predict = clf.predict(x_train)

In [12]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [13]:
confusion_matrix(y_train,y_train_predict)

array([[18549,  1276],
       [ 3813,  2410]], dtype=int64)

In [14]:
accuracy_score(y_train,y_train_predict)

0.804629914004914

In [16]:
# validation data
y_test_predict = clf.predict(x_test)
accuracy_score(y_test,y_test_predict)


0.7956394902502687

# Parameter tuning with Sklearn

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate':[0.01,0.1,0.3]
}

rf = GradientBoostingClassifier()
gs = GridSearchCV(estimator=rf, param_grid=param_grid)
result = gs.fit(x_train,y_train)

In [18]:
#see results details
gs.cv_results_

{'mean_fit_time': array([0.74939036, 0.82639327, 0.72332215]),
 'std_fit_time': array([0.03847864, 0.05713375, 0.02024406]),
 'mean_score_time': array([0.0087213 , 0.00801101, 0.00500703]),
 'std_score_time': array([1.39692173e-03, 1.23275087e-05, 4.10501398e-03]),
 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.3],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.01},
  {'learning_rate': 0.1},
  {'learning_rate': 0.3}],
 'split0_test_score': array([0.77024952, 0.80076775, 0.79961612]),
 'split1_test_score': array([0.76986564, 0.79980806, 0.80038388]),
 'split2_test_score': array([0.77581574, 0.80287908, 0.79827255]),
 'split3_test_score': array([0.77884431, 0.80668074, 0.80514494]),
 'split4_test_score': array([0.77750048, 0.79708197, 0.79401037]),
 'mean_test_score': array([0.77445514, 0.80144352, 0.79948557]),
 'std_test_score': array([0.00371862, 0.00321384, 0.00358613]),
 'rank_test_score': array(

In [19]:
gs.best_estimator_

In [21]:
y_train_predict = gs.predict(x_train)
accuracy_score(y_train,y_train_predict)

0.804629914004914

In [22]:
y_test_predict = gs.predict(x_test)
accuracy_score(y_test,y_test_predict)

0.7956394902502687

In [23]:
for i, j in enumerate(gs.cv_results_['rank_test_score']):
    print(i,j, gs.cv_results_['params'][i])                       

0 3 {'learning_rate': 0.01}
1 1 {'learning_rate': 0.1}
2 2 {'learning_rate': 0.3}
