In [1]:
import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

In [2]:
df = pd.read_csv("../data/trainDRUG.csv")

In [3]:
X = df.drop(['再犯註記'], axis=1).values
y = df['再犯註記'].values


In [71]:
steps = [('over', SMOTE()), ('model', RandomForestClassifier(n_estimators=1000))]
pipeline = Pipeline(steps=steps)
for scoring in ["accuracy","roc_auc"]:
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=33)
    scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    print("Model", scoring, "mean=", np.mean(scores), "stddev=", np.std(scores))


Model accuracy mean= 0.8286902200598936 stddev= 0.0023666393598995904
Model roc_auc mean= 0.7627036150016931 stddev= 0.003344826290022118


In [5]:
for i in [0.5, 0.75, 1.0]:
    print("a=", i, end=" ")
    steps = [('over', SMOTE(sampling_strategy=i)), ('model', RandomForestClassifier(n_estimators=1000))]
    pipeline = Pipeline(steps=steps)
    for scoring in ["precision","recall"]: #"accuracy","f1", "roc_auc"]:
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=33)
        scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        print(scoring, np.mean(scores), end=" ")
    print()
        

a= 0.5 precision 0.5480003030859643 recall 0.30326545252819437 
a= 0.75 precision 0.5299737747978971 recall 0.32089486922480476 
a= 1.0 precision 0.5178663962369797 recall 0.33136233144845584 


In [6]:
model = RandomForestClassifier(n_estimators=1000)
for score in ["precision","recall"]: #"accuracy","f1", "roc_auc"]:
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=33)
    scores = cross_val_score(model, X, y, scoring=score, cv=cv, n_jobs=-1)
    print(score, "=", np.mean(scores), end=" ")
print()

precision = 0.5926649772697749 recall = 0.2575391695191987 


In [7]:
model = RandomForestClassifier(n_estimators=1000, class_weight="balanced")
for score in ["precision","recall"]: #"accuracy","f1", "roc_auc"]:
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=33)
    scores = cross_val_score(model, X, y, scoring=score, cv=cv, n_jobs=-1)
    print(score, "=", np.mean(scores), end=" ")
print()

precision = 0.5640592427269188 recall = 0.2621495420188992 


In [31]:
X = df.drop(['再犯註記'], axis=1)
y = df['再犯註記'].values
model = RandomForestClassifier(n_estimators=1000, class_weight="balanced_subsample")
for score in ["precision","recall","accuracy","f1", "roc_auc"]:
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=33)
    scores = cross_val_score(model, X, y, scoring=score, cv=cv, n_jobs=-1)
    print(score, "=", np.mean(scores), end=" ")
print()

precision = 0.5641234331755773 recall = 0.262874394033953 accuracy = 0.8346482638832877 f1 = 0.35889564030858895 roc_auc = 0.7644613777002062 


In [79]:
X = df.drop(['再犯註記'], axis=1).values
y = df['再犯註記'].values
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_ix, test_ix in kfold.split(X, y):
    # select rows
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    # summarize train and test composition
    train_0, train_1 = len(train_y[train_y==0]), len(train_y[train_y==1])
    test_0, test_1 = len(test_y[test_y==0]), len(test_y[test_y==1])
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

>Train: 0=43124, 1=9197, Test: 0=10782, 1=2299
>Train: 0=43125, 1=9196, Test: 0=10781, 1=2300
>Train: 0=43125, 1=9197, Test: 0=10781, 1=2299
>Train: 0=43125, 1=9197, Test: 0=10781, 1=2299
>Train: 0=43125, 1=9197, Test: 0=10781, 1=2299


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
model = RandomForestClassifier(n_estimators=1000)
m = model.fit(X_train, y_train)
y_pred = m.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[17113,   676],
       [ 2845,   949]], dtype=int64)

In [20]:
from collections import Counter
print(Counter(y_test))

Counter({0: 17789, 1: 3794})


In [25]:

from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

In [26]:
df = pd.read_csv("../data/trainDRUG.csv", encoding='utf8')
X = df.drop(['再犯註記'], axis=1)
y = df['再犯註記'].values

In [29]:
model=RandomForestClassifier(n_estimators=1000)
resample=SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
pipeline=Pipeline(steps=[('r', resample), ('m', model)])
cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring=['accuracy','precision_macro','recall_macro', 'f1', 'roc_auc']
scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
print('Mean f1: %.4f' % np.mean(scores['test_f1']))
print('Mean roc: %.4f' % np.mean(scores['test_roc_auc']))

Mean Accuracy: 0.6976
Mean Precision: 0.6160
Mean Recall: 0.6844
Mean f1: 0.4357
Mean roc: 0.7536


0.6484011073515841


In [7]:
#X.shape
#y.shape
#y.value_counts()

(65402, 15)

In [17]:
measures = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
#model = RandomForestClassifier(n_estimators=1000)
model = RandomForestClassifier(class_weight="balanced", n_estimators=1000)
def evModel(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    for m in measures:
        scores = cross_val_score(model, X, y, scoring=m, cv=cv, n_jobs=-1)
        print("mean %s of cv for RF is %.4f" % (m, np.mean(scores)))
evModel(model)

mean accuracy of cv for baseline RF is 0.8351
mean precision of cv for baseline RF is 0.5666
mean recall of cv for baseline RF is 0.2636
mean f1 of cv for baseline RF is 0.3589
mean roc_auc of cv for baseline RF is 0.7645


In [34]:
measures = ['accuracy', 'precision', 'f1', 'roc_auc']
score_list = list()
def evModel(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    for m in measures:
        scores = cross_val_score(model, X, y, scoring=m, cv=cv, n_jobs=-1)
        score_list.append(np.mean(scores))
    print(score_list)

In [35]:
model = RandomForestClassifier(n_estimators=1000)
for a in np.arange(0.3, 1.0, 0.1):
    oversample = SMOTE(sampling_strategy=a)
    xo, yo = oversample.fit_resample(X, y)
    print("a=%.1f" % a, end=" ")
    evModel(model, xo, yo)
    


a=0.3 [0.8243408647426571, 0.6983263978008315, 0.5251406041135837, 0.8244975346051232]
a=0.4 [0.8243408647426571, 0.6983263978008315, 0.5251406041135837, 0.8244975346051232, 0.8214960561948891, 0.7536142842305554, 0.6412898694397433, 0.859945787855499]
a=0.5 [0.8243408647426571, 0.6983263978008315, 0.5251406041135837, 0.8244975346051232, 0.8214960561948891, 0.7536142842305554, 0.6412898694397433, 0.859945787855499, 0.823214929948909, 0.780720789040772, 0.7112714379212814, 0.8790805138846094]
a=0.6 [0.8243408647426571, 0.6983263978008315, 0.5251406041135837, 0.8244975346051232, 0.8214960561948891, 0.7536142842305554, 0.6412898694397433, 0.859945787855499, 0.823214929948909, 0.780720789040772, 0.7112714379212814, 0.8790805138846094, 0.828600937840547, 0.8026759683061326, 0.7590769403902794, 0.8930198229849903]
a=0.7 [0.8243408647426571, 0.6983263978008315, 0.5251406041135837, 0.8244975346051232, 0.8214960561948891, 0.7536142842305554, 0.6412898694397433, 0.859945787855499, 0.823214929948