In [1]:
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=10000, n_features=10, n_informative=5,
                          n_redundant=0, n_clusters_per_class=1, 
                          n_classes=2, weights=[0.99, 0.01], 
                          random_state=42)

In [2]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, 
                                             stratify=y, random_state=42)

In [3]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
resampled_X, resampled_y = sm.fit_resample(train_X, train_y)

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, max_features=2, 
                                  random_state=42)
rf_model.fit(resampled_X, resampled_y)

In [5]:
y_pred_rfc = rf_model.predict(test_X)
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, y_pred_rfc)

array([[2957,    1],
       [  29,   13]], dtype=int64)

In [6]:
# from sklearn.metrics import confusion_matrix
# def get_f1score(y, y_pred):
#     cm = confusion_matrix(y, y_pred)
#     precision = cm[1,1] / (cm[0,1]+cm[1,1])
#     recall = cm[1,1] / (cm[1,0]+cm[1,1])
#     f_measure = (2*precision*recall) / (precision+recall)
#     return f_measure

In [7]:
# get_f1score(test_y, y_pred_rfc)

In [8]:
from sklearn.metrics import confusion_matrix
def model_measure(model, train_X, train_y, test_X, test_y) :
    model.fit(train_X, train_y)
    y_pred = model.predict(test_X)
    cm = confusion_matrix(test_y, y_pred)
    precision = cm[1,1] / (cm[0,1]+cm[1,1])
    recall = cm[1,1] / (cm[1,0]+cm[1,1])
    f1score = (2*precision*recall) / (precision+recall)
    print(f"precision: {precision:.6f}, recall: {recall:.6f}, f1score: {f1score:.6f}")
#     return precision, recall, f_measure

In [9]:
from sklearn.ensemble import RandomForestClassifier
model_measure(RandomForestClassifier(n_estimators=100, max_features=2, 
                                     random_state=42), 
              resampled_X, resampled_y, test_X, test_y)

precision: 0.928571, recall: 0.309524, f1score: 0.464286


In [10]:
from sklearn.svm import SVC
model_measure(SVC(random_state=42), 
              resampled_X, resampled_y, test_X, test_y)

precision: 0.214953, recall: 0.547619, f1score: 0.308725


In [11]:
from sklearn.neural_network import MLPClassifier
model_measure(MLPClassifier(hidden_layer_sizes=(50,), max_iter=500),
              resampled_X, resampled_y, test_X, test_y)

precision: 0.462963, recall: 0.595238, f1score: 0.520833


In [12]:
from xgboost import XGBClassifier  # pip install xgboost
model_measure(XGBClassifier(max_depth=10, n_estimators=100, 
                            learning_rate=0.01, use_label_encoder=False),
              resampled_X, resampled_y, test_X, test_y)

precision: 0.227273, recall: 0.476190, f1score: 0.307692


In [13]:
from lightgbm import LGBMClassifier #  # pip install lightgbm
model_measure(LGBMClassifier(n_estimators=100), 
              resampled_X, resampled_y, test_X, test_y)

[LightGBM] [Info] Number of positive: 6902, number of negative: 6902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 13804, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
precision: 0.564103, recall: 0.523810, f1score: 0.543210
