# Logistic Regression

In [None]:
# Hyper-parameters written out in full; random_state is pinned to 0.
params = dict(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    l1_ratio=None,
    max_iter=100,
    multi_class='auto',
    n_jobs=None,
    penalty='l2',
    random_state=0,
    solver='lbfgs',
    tol=0.0001,
    verbose=0,
    warm_start=False,
)
classifier = LogisticRegression()
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)


# multinomial
softmax_reg = LogisticRegression(C=10, solver="lbfgs", multi_class="multinomial")
softmax_reg.fit(X_train, Y_train)

# SGD Classifier

> SGDClassifier fits a linear classifier (e.g. a linear SVM or logistic regression, depending on the chosen loss) using stochastic gradient descent

In [None]:
# Linear classifier trained with SGD, fitted on the full X / Y data.
clf = SGDClassifier(tol=1e-3, max_iter=1000)
clf.fit(X, Y)

# LinearSVC
- penalty: {‘l1’, ‘l2’}
- loss: {‘hinge’, ‘squared_hinge’}, default=‘squared_hinge’ [the combination penalty='l1' with loss='hinge' is not supported]

In [None]:
# Linear support-vector classifier with random_state pinned to 0.
reg = LinearSVC(tol=1e-5, random_state=0)
reg.fit(X, y)

# Support Vector Classifier

- kernel: linear, rbf, poly, sigmoid

In [None]:
classifier = SVC()

# kernel="rbf"
params = dict(
    C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1,
    probability=False, random_state=0, shrinking=True, tol=0.001, verbose=False,
)

# kernel="linear" -- same settings as above, only the kernel entry is replaced
params = {**params, 'kernel': 'linear'}

classifier.set_params(**params)

# kernel="poly"
classifier = SVC(C=5, coef0=1, degree=3, kernel="poly")

# kernel ='sigmoid'
classifier = SVC(kernel='sigmoid')

# Each assignment above replaces the previous one, so when the cell is run as a
# whole the model fitted here is the sigmoid-kernel SVC.
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

# K-Neighbors Classifier

- weights: {‘uniform’, ‘distance’}
    1. ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally
    2. ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away

- algorithm: {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}. Algorithm used to compute the nearest neighbors

- leaf_size: Leaf size passed to BallTree or KDTree

- p: Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2

In [None]:
# KNN
# Settings written out in full: 5 neighbours, Minkowski metric with p=2,
# uniform weights.
params = dict(
    algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None,
    n_jobs=None, n_neighbors=5, p=2, weights='uniform',
)
classifier = KNeighborsClassifier()
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

# Decision Tree Classifier

- criterion: {“gini”, “entropy”}. The function to measure the quality of a split
- splitter: {“best”, “random”}. The strategy used to choose the split at each node
- max_depth: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
- min_samples_split: The minimum number of samples required to split an internal node
- min_samples_leaf: The minimum number of samples required to be at a leaf node
- max_features: The number of features to consider when looking for the best split

In [None]:
classifier = DecisionTreeClassifier()   # CART cost function
params = {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 
          'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 
          'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)


# Visualize the trained tree
tree.plot_tree(classifier);

# --- OR: export the fitted tree to a Graphviz .dot file ---

fn = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
cn = ['setosa', 'versicolor', 'virginica']

# Export the tree trained above (`classifier`), not an unrelated `clf` from another cell.
# NOTE(review): the two exports below use 4 and 2 feature names respectively; keep the
# one that matches the columns of X_train -- confirm against the training data.
export_graphviz(
    classifier,
    out_file="tree.dot",
    feature_names=fn,
    class_names=cn,
    filled=True,
)

# The keyword is `class_names` (plural), as in the call above.
# NOTE(review): `image_path` and `iris` are not defined in this section -- confirm they
# exist earlier in the notebook.
export_graphviz(
    classifier,
    out_file=image_path("view_tree.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

#> dot -Tpng view_tree.dot -o view_tree.png     [run it to convert from .dot to .png]

# Gaussian Naive Bayes

In [None]:
# GaussianNB with its settings written out
params = dict(priors=None, var_smoothing=1e-09)
classifier = GaussianNB()
classifier.set_params(**params)

# BernoulliNB
classifier = BernoulliNB()

# MultinomialNB
classifier = MultinomialNB()

# CategoricalNB
classifier = CategoricalNB()


# Each assignment above replaces the previous one, so when the cell is run as a
# whole the model fitted here is CategoricalNB.
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

# Random Forest Classifier

In [None]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# 'max_features' is spelled 'sqrt' rather than the legacy alias 'auto': for a forest
# classifier both mean sqrt(n_features), but 'auto' is rejected by newer scikit-learn
# releases while 'sqrt' is accepted by all of them.
params = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 
          'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 
          'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 
          'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)


# The following code creates a Random Forest classifier with 500 trees (each limited to
# at most 16 leaf nodes), using all available CPU cores. It is only constructed here, not fitted.
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

# Bagging Classifier

> performs soft voting

- base_estimator: The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a DecisionTreeClassifier
- n_estimators: The number of base estimators in the ensemble
- max_samples: The number of samples to draw from X to train each base estimator
- max_features: The number of features to draw from X to train each base estimator
- bootstrap: Whether samples are drawn with replacement. If False, sampling without replacement is performed
- oob_score: Whether to use out-of-bag samples to estimate the generalization error. Only available if bootstrap=True

In [None]:
# Bagging ensemble of 10 SVCs. The base estimator is passed positionally (as for
# bag_clf below) so the call does not depend on the keyword's name, which differs
# between scikit-learn releases.
clf = BaggingClassifier(
    SVC(), n_estimators=10,
    random_state=0
)

clf.fit(X, y)
clf.predict(X)  # the ensemble itself exposes predict(); there is no `clf.clf` attribute


# 500 decision trees, each trained on 100 samples drawn with replacement.
# oob_score=True is required: without it fit() never computes `oob_score_` and the
# print below fails with an AttributeError.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, max_samples=100, 
    bootstrap=True, oob_score=True, n_jobs=-1
)

bag_clf.fit(X, y)

print("oob score = ", bag_clf.oob_score_ )

# Voting Classifier

In [None]:
# Example 1
# Hard-voting ensemble of three different model families.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

named_estimators = [('clf1', log_clf), ('clf2', rnd_clf), ('clf3', svm_clf)]
voting_clf = VotingClassifier(estimators=named_estimators, voting='hard')

y_pred = voting_clf.fit(X_train, y_train).predict(X_test)


# Example 2
# The same three members combined with hard voting, soft voting, and weighted soft voting.
clf1 = LogisticRegression(random_state=1, multi_class='multinomial')
clf2 = RandomForestClassifier(random_state=1, n_estimators=50)
clf3 = GaussianNB()
members = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]

eclf1 = VotingClassifier(estimators=list(members), voting='hard').fit(X, y)
print(eclf1.predict(X))

eclf2 = VotingClassifier(estimators=list(members), voting='soft').fit(X, y)
print(eclf2.predict(X))

eclf3 = VotingClassifier(
    estimators=list(members),
    voting='soft',
    weights=[2,1,1],
    flatten_transform=True,
).fit(X, y)
print(eclf3.predict(X))
print(eclf3.transform(X).shape)

# Stacking [Stacked Generalization]

> Stacking (stacked generalization; blending is a closely related variant) is an ensemble machine learning technique that uses a machine learning model to learn how to best combine the predictions from multiple contributing ensemble member models

In [None]:
# Base learners whose predictions feed the final estimator.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

estimators = list(zip(('clf1', 'clf2', 'clf3'), (log_clf, rnd_clf, svm_clf)))

# A logistic regression combines the base learners' predictions.
clf = StackingClassifier(final_estimator=LogisticRegression(), estimators=estimators)

# Fit on the training split and report the score on the test split.
clf.fit(X_train, y_train).score(X_test, y_test)

## Gradient Boosting Classifier

In [None]:
classifier = GradientBoostingClassifier()
# 'loss' is intentionally not listed. The old entry was 'deviance', a name that newer
# scikit-learn releases reject (it was renamed 'log_loss'). Leaving the key out keeps
# the estimator's default loss, which is that same loss on every release.
params = {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 1.0, 'max_depth': 1, 
          'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 
          'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 0, 'subsample': 1.0, 
          'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

# Hist Gradient Boosting Classifier

Histogram-based Gradient Boosting Classification Tree

> This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000)

In [None]:
# Fit with default settings, then report the score on the same data it was trained on.
clf = HistGradientBoostingClassifier().fit(X, y)
clf.score(X, y)

# XGB Classifier [Extreme Gradient Boosting]

In [None]:
# XGBoost classifier with default settings, fitted on the training split.
my_model = XGBClassifier()

my_model.fit(X=X_train, y=y_train)

# Adaptive Boosting

- base_estimator: If None, then the base estimator is DecisionTreeClassifier initialized with max_depth=1
- algorithm: {‘SAMME’, ‘SAMME.R’} 
If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations

In [None]:
# NOTE(review): the 'base_estimator' key and the 'SAMME.R' algorithm value may not be
# accepted by recent scikit-learn releases -- confirm against the installed version.
params = dict(
    algorithm='SAMME.R',
    base_estimator=None,
    learning_rate=1.0,
    n_estimators=50,
    random_state=None,
)
classifier = AdaBoostClassifier()
classifier.set_params(**params)

# Fit on the training split, then predict labels for the test split.
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

# Light GBM [LGBM]

In [None]:
model = LGBMClassifier(random_state=42, max_depth=-5, learning_rate=0.09)

# Log-loss is tracked on both the test and the training data during fitting.
evaluation_sets = [(x_test, y_test), (x_train, y_train)]
model.fit(
    x_train,
    y_train,
    eval_set=evaluation_sets,
    eval_metric='logloss',
    verbose=20,
)


# Extra
lgb.plot_importance(model)
lgb.plot_metric(model)

# CatBoost

In [None]:
# Hold out 20% of the data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Positions of the columns whose dtype is not float; these are passed to CatBoost as
# categorical. The builtin `float` replaces `np.float`, which was only an alias for it
# and no longer exists in current NumPy.
# NOTE(review): assumes X is a pandas DataFrame (it must expose `.dtypes`) -- confirm.
categorical_features_indices = np.where(X.dtypes != float)[0]

cls = CatBoostClassifier(iterations=5, learning_rate=0.1, loss_function='MultiClass')

cls.fit(X_train, y_train, 
        cat_features=categorical_features_indices, 
        eval_set=(X_val, y_val), 
        verbose=False
)

# Score analysis

### Precision vs. Recall
- False Positive (FP) – Type 1 error
>The model's prediction was wrong: the actual value was negative but the model predicted a positive value.<br/>
Also known as a Type 1 error.

- False Negative (FN) – Type 2 error
>The model's prediction was wrong: the actual value was positive but the model predicted a negative value.<br/>
Also known as a Type 2 error.

### Precision
>It tells us how many of the cases predicted as positive actually turned out to be positive. <br/>Precision is a useful metric in cases where False Positives are a higher concern than False Negatives.
```
  precision = TP/(TP+FP) => 0.5
```

### Recall 
>It tells us how many of the actual positive cases we were able to predict correctly with our model. <br/>Recall is a useful metric in cases where False Negative trumps False Positive.
```
  recall = TP/(TP+FN) => 0.75
```

### Precision Recall Tradeoff
>If you increase precision, it will reduce recall and vice versa. This is called the precision/recall tradeoff.

### F1-Score
>When we try to increase the precision of our model, the recall goes down, and vice-versa. The F1-score captures both trends in a single value: it is the harmonic mean of Precision and Recall. <br/>The interpretability of the F1-score is poor. This means that we don’t know what our classifier is maximizing – precision or recall? So, we use it in combination with other evaluation metrics, which together give us a complete picture of the result.
```
  f1_score = 2/((1/recall)+(1/precision))
```

### ROC Curve
>ROC curves typically feature the true positive rate on the Y axis and the false positive rate on the X axis. The ideal classifier reaches the top-left corner (false positive rate = 0, true positive rate = 1)

### AUC (Area Under the ROC Curve)
> The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes


Ref Link: https://www.analyticsvidhya.com/blog/2020/04/confusion-matrix-machine-learning

In [None]:
# confusion matrix
con_matrix = confusion_matrix(Y_test, Y_pred)

# scikit-learn puts actual classes on the rows and predicted classes on the columns,
# with labels in sorted order, so a binary matrix is [[tn, fp], [fn, tp]] and flattens
# to (tn, fp, fn, tp). This unpacking only applies to a two-class problem.
tn, fp, fn, tp = con_matrix.ravel()
print("confusion matrix = ", con_matrix)

In [None]:
# Results are stored under *_value names so the imported metric functions
# (precision_score, recall_score, f1_score, roc_auc_score) are not overwritten and
# this cell can be run more than once.

# precision_score
precision_value = precision_score(Y_test, Y_pred)

# recall_score
recall_value = recall_score(Y_test, Y_pred)

# F1 score
f1_value = f1_score(Y_test, Y_pred)

# classification_report
print(classification_report(Y_test, Y_pred))

# precision_recall_curve
# NOTE(review): this curve and the ROC curve below are normally built from continuous
# scores (predict_proba / decision_function); with hard labels only a few thresholds
# exist -- confirm whether scores should be used instead of Y_pred.
precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)
plt.figure()
plt.plot(thresholds, precision[:-1], "b--", label="Precision")
plt.plot(thresholds, recall[:-1], "g-", label="Recall")
plt.xlabel("Thresholds")
plt.legend()

# ROC
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
plt.figure()  # new figure so the ROC curve is not drawn on top of the plot above
plt.plot(fpr, tpr, lw=2)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

# roc_auc_score
roc_auc_value = roc_auc_score(Y_test, Y_pred)

# score
# Compare against the true labels; scoring against Y_pred (the model's own output)
# would always report 1.0.
score = classifier.score(X_test, Y_test)
print("score = ", score)

# accuracy score
acc_score = accuracy_score(Y_test, Y_pred)
print("accuracy score = ", acc_score)

In [None]:
# zero one loss
#   normalize=True  -> returns the fraction of misclassifications (float)
#   normalize=False -> returns the number of misclassifications (int)
#   The best performance is 0.
zero_one_loss(y_true=Y_test, y_pred=Y_pred, normalize=True)

## Compare between Actual and Predicted

In [None]:
# Put predictions (left column) next to the true labels (right column)
# and show the first five rows.
pred_column = Y_pred.reshape(len(Y_pred), 1)
test_column = Y_test.reshape(len(Y_test), 1)
compare_predict_data = np.concatenate((pred_column, test_column), axis=1)[:5]
print(compare_predict_data)

## K-fold Cross-Validation

In [None]:
# Way 1
# 5-fold cross-validation on the training split; report mean and spread in percent.
accuracies = cross_val_score(classifier, X_train, Y_train, cv=5)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

In [None]:
# Way 2
# 10-fold cross-validation repeated 3 times (fixed seed), run on all CPU cores.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)
accuracies = cross_val_score(classifier, X, y, n_jobs=-1, cv=cv)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

## **Model Improvement**  
### Using GridSearchCV and RandomizedSearchCV

1. https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
# GridSearchCV
# Logistic Regression
#
# [default params]
# params = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1,
#           'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2',
#           'random_state': 0, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
#
# [All scoring values for classification]
# scoring = [
#   accuracy, balanced_accuracy, top_k_accuracy, average_precision, neg_brier_score, f1, f1_micro, f1_macro,
#   f1_weighted, f1_samples, neg_log_loss, roc_auc_ovo_weighted, roc_auc_ovo, roc_auc_ovr, roc_auc, jaccard, recall, precision
# ]
#
# penalty : ['l2', 'l1', 'elasticnet']
# solver : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']

# Settings searched for every solver.
common_grid = {
    'C': [0.25, 0.5, 0.75, 1],
    'fit_intercept': [True, False],
    'intercept_scaling': [1, 2, 3, 4, 5],
    'max_iter': [100],
    'warm_start': [True, False],
}

# One grid per solver, because each solver supports a different set of penalties.
# 'elasticnet' has its own grid: it needs an explicit l1_ratio, otherwise every fit
# with that penalty fails.
parameters = [
    {**common_grid, 'solver': ['lbfgs'], 'penalty': ['l2']},
    {**common_grid, 'solver': ['liblinear'], 'penalty': ['l2', 'l1']},
    {**common_grid, 'solver': ['saga'], 'penalty': ['l2', 'l1']},
    {**common_grid, 'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75]},
]

# Search over a fresh LogisticRegression instead of whatever `classifier` was assigned
# last in an earlier cell: the grid above only contains logistic-regression parameters.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=0),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, Y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [None]:
from sklearn.metrics import zero_one_loss

# Toy labels: the first two positions disagree, the last two agree.
y_true = [2, 0, 3, 4]
y_pred = [1, 2, 3, 4]
zero_one_loss(y_true=y_true, y_pred=y_pred)

In [None]:
# normalize=False returns the number of misclassifications instead of the fraction.
zero_one_loss(y_true=y_true, y_pred=y_pred, normalize=False)