In [5]:
import pandas as pd
import numpy as np

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import recall_score, f1_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as ppt

In [74]:
data = pd.read_csv('data/Cars-dataset.csv')

In [75]:
data['Gender'] = data['Gender'].apply(lambda x: 0 if x == "Female" else 1)

In [76]:
corr = data.corr()
corr[corr['Opt_service'] >= .7]

Unnamed: 0,Age,Gender,Engineer,MBA,Work Exp,Salary,Distance,license,Opt_service
Work Exp,0.92443,0.085161,0.079911,0.039443,1.0,0.93181,0.395161,0.389882,0.731563
Salary,0.857784,0.103673,0.079428,0.028627,0.93181,1.0,0.475367,0.457207,0.810703
Opt_service,0.683498,0.078684,0.075514,-0.002494,0.731563,0.810703,0.548475,0.487126,1.0


In [77]:

X = data.drop(columns=["Opt_service"])
y = data['Opt_service']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [78]:
y_test.value_counts(normalize=True)

0    0.912698
1    0.087302
Name: Opt_service, dtype: float64

In [79]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)


BaggingClassifier()

In [80]:
pos_examples = y_train[y_train == 1]


In [81]:
bag.score(data.iloc[pos_examples.index].drop(columns=["Opt_service"]), pos_examples)

0.9583333333333334

In [83]:
len(pos_examples) * .95833

22.99992

In [100]:
def display_scores(estimator, X, y):
    print("recall ", recall_score(y, estimator.predict(X)))
    print("precision ", precision_score(y, estimator.predict(X)))
    print("f1 ", f1_score(y, estimator.predict(X)))

In [87]:
random_forest = RandomForestClassifier()

In [91]:
bag = BaggingClassifier(base_estimator=LogisticRegression(max_iter=2000))

In [92]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(max_iter=2000))

In [93]:
random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [107]:
display_scores(random_forest, X_test, y_test)

recall  1.0
precision  1.0
f1  1.0


In [106]:
display_scores(bag, X_test, y_test)

recall  1.0
precision  0.7857142857142857
f1  0.88


In [115]:
for x in range(1,6):
    bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=x, random_state=1))
    bag.fit(X_train, y_train)
    print(f1_score(y_test, bag.predict(X_test)))

1.0
0.9565217391304348
0.88
0.88
0.9565217391304348


In [120]:
ada = AdaBoostClassifier()
grad = GradientBoostingClassifier()
xg = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

In [121]:
ada.fit(X_train, y_train)
grad.fit(X_train, y_train)
xg.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [123]:
display_scores(ada, X_test, y_test)
display_scores(grad, X_test, y_test)
display_scores(xg, X_test, y_test)

recall  1.0
precision  1.0
f1  1.0
recall  1.0
precision  0.9166666666666666
f1  0.9565217391304348
recall  1.0
precision  1.0
f1  1.0


In [124]:
grad.feature_importances_

array([7.83970176e-03, 4.58604307e-04, 7.54293803e-03, 4.67118629e-08,
       5.27197688e-03, 6.95737055e-01, 2.80311456e-01, 2.83822160e-03])

In [134]:
result=list(zip(X_train.columns, grad.feature_importances_))
result.sort(key=lambda x: x[1])
result

[('MBA', 4.671186288916439e-08),
 ('Gender', 0.0004586043066376853),
 ('license', 0.0028382216046108754),
 ('Work Exp', 0.005271976876997903),
 ('Engineer', 0.0075429380331877675),
 ('Age', 0.00783970176465623),
 ('Distance', 0.28031145552263065),
 ('Salary', 0.6957370551794159)]

In [138]:
for x in [50, 100, 400]:
    grad = GradientBoostingClassifier(n_estimators=x, learning_rate=0.01, random_state=1)
    grad.fit(X_train, y_train)
    print(f1_score(y_train, grad.predict(X_train)))

0.0
0.9787234042553191
1.0


In [140]:
help(StackingClassifier)

Help on class StackingClassifier in module sklearn.ensemble._stacking:

class StackingClassifier(sklearn.base.ClassifierMixin, _BaseStacking)
 |  StackingClassifier(estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0)
 |  
 |  Stack of estimators with a final classifier.
 |  
 |  Stacked generalization consists in stacking the output of individual
 |  estimator and use a classifier to compute the final prediction. Stacking
 |  allows to use the strength of each individual estimator by using their
 |  output as input of a final estimator.
 |  
 |  Note that `estimators_` are fitted on the full `X` while `final_estimator_`
 |  is trained using cross-validated predictions of the base estimators using
 |  `cross_val_predict`.
 |  
 |  Read more in the :ref:`User Guide <stacking>`.
 |  
 |  .. versionadded:: 0.22
 |  
 |  Parameters
 |  ----------
 |  estimators : list of (str, estimator)
 |      Base estimators which will be stacked 

In [150]:
dtree = AdaBoostClassifier()
dtree.fit(X_train, y_train)
bag = GradientBoostingClassifier()
bag.fit(X_train, y_train)

GradientBoostingClassifier()

In [151]:
stack = StackingClassifier(estimators=[('tree', dtree), ('bag',bag)], final_estimator=XGBClassifier())

In [155]:
stack.fit(X_train, y_train)





StackingClassifier(estimators=[('tree', AdaBoostClassifier()),
                               ('bag', GradientBoostingClassifier())],
                   final_estimator=XGBClassifier(base_score=None, booster=None,
                                                 colsample_bylevel=None,
                                                 colsample_bynode=None,
                                                 colsample_bytree=None,
                                                 enable_categorical=False,
                                                 gamma=None, gpu_id=None,
                                                 importance_type=None,
                                                 interaction_constraints=None,
                                                 learning_rate=None,
                                                 max_delta_step=None,
                                                 max_depth=None,
                                                 min_child_weight=

In [156]:
display_scores(stack, X_test, y_test)

recall  1.0
precision  0.7857142857142857
f1  0.88
