In [1]:
import pandas as pd
from sklearn.datasets import make_classification

In [14]:
# 0이 99%, 1이 1%인 데이터셋 생성
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy], axis=1)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,1.347439,1.412824,0
2,0.537238,0.372730,0
3,2.134462,1.404819,0
4,2.315827,1.356858,0
...,...,...,...
9995,2.440385,1.695643,0
9996,-0.790502,0.194243,0
9997,1.878130,0.829500,0
9998,2.585933,1.927995,0


In [15]:
X1 = df[['a', 'b']]
y1 = df['y']

In [16]:
df['y'].value_counts()

0    9900
1     100
Name: y, dtype: int64

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=10)

model1 = LogisticRegression(random_state=42)
model1.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [18]:
print("학습용 : ", model1.score(X_train, y_train))
print("검증용 : ", model1.score(X_test, y_test))

학습용 :  0.994125
검증용 :  0.995


In [19]:
pred1 = model1.predict(X_test) # 모형이 분류한 값

In [20]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred1)
cm

# 클래스 1이 20개 있는데 모형에서는 10개밖에 못 찾아냄

array([[1980,    0],
       [  10,   10]], dtype=int64)

In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred1))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1980
           1       1.00      0.50      0.67        20

    accuracy                           0.99      2000
   macro avg       1.00      0.75      0.83      2000
weighted avg       1.00      0.99      0.99      2000



In [22]:
# 균형 데이터

X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, flip_y=0, random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy], axis=1)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,-1.461718,1.556713,1
2,-0.185450,0.170503,1
3,-0.076351,1.745782,1
4,2.315827,1.356858,0
...,...,...,...
9995,-0.295564,2.153372,1
9996,-2.027697,-0.366690,1
9997,0.962237,1.027313,1
9998,-0.642143,2.454120,1


In [23]:
X2 = df[['a', 'b']]
y2 = df['y']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=10)

model2 = LogisticRegression(random_state=42)
model2.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [25]:
print("학습용 : ", model2.score(X_train, y_train))
print("검증용 : ", model2.score(X_test, y_test))

학습용 :  0.896125
검증용 :  0.891


In [26]:
pred2 = model2.predict(X_test)

In [27]:
cm = confusion_matrix(y_test, pred2)
cm

array([[908,  92],
       [126, 874]], dtype=int64)

In [28]:
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1000
           1       0.90      0.87      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [33]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

from imblearn.under_sampling import RandomUnderSampler

X_sample, y_sample = RandomUnderSampler(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

from sklearn.svm import SVC

model3 = SVC(random_state=42)
model3.fit(X_train, y_train)

print("학습용 : ", model3.score(X_train, y_train))
print("검증용 : ", model3.score(X_test, y_test))

pred3 = model3.predict(X_test)

print(classification_report(y_test, pred3))

학습용 :  0.875
검증용 :  0.95
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        22
           1       0.94      0.94      0.94        18

    accuracy                           0.95        40
   macro avg       0.95      0.95      0.95        40
weighted avg       0.95      0.95      0.95        40



  return f(*args, **kwargs)


In [34]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

X_sample, y_sample = TomekLinks(sampling_strategy='majority').fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

# X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

# model3 = SVC(random_state=42)
# model3.fit(X_train, y_train)

# print("학습용 : ", model3.score(X_train, y_train))
# print("검증용 : ", model3.score(X_test, y_test))

# pred3 = model3.predict(X_test)

# print(classification_report(y_test, pred3))

0    9874
1     100
Name: y, dtype: int64

In [35]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour

X_sample, y_sample = CondensedNearestNeighbour(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

# X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

# model3 = SVC(random_state=42)
# model3.fit(X_train, y_train)

# print("학습용 : ", model3.score(X_train, y_train))
# print("검증용 : ", model3.score(X_test, y_test))

# pred3 = model3.predict(X_test)

# print(classification_report(y_test, pred3))

0    187
1    100
Name: y, dtype: int64

In [36]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.under_sampling import TomekLinks
# from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import OneSidedSelection

X_sample, y_sample = OneSidedSelection(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

# X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

# model3 = SVC(random_state=42)
# model3.fit(X_train, y_train)

# print("학습용 : ", model3.score(X_train, y_train))
# print("검증용 : ", model3.score(X_test, y_test))

# pred3 = model3.predict(X_test)

# print(classification_report(y_test, pred3))

0    6593
1     100
Name: y, dtype: int64

In [37]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.under_sampling import TomekLinks
# from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import EditedNearestNeighbours

X_sample, y_sample = EditedNearestNeighbours(kind_sel='all', n_neighbors=5).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

# X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

# model3 = SVC(random_state=42)
# model3.fit(X_train, y_train)

# print("학습용 : ", model3.score(X_train, y_train))
# print("검증용 : ", model3.score(X_test, y_test))

# pred3 = model3.predict(X_test)

# print(classification_report(y_test, pred3))

0    9747
1     100
Name: y, dtype: int64

In [38]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.under_sampling import TomekLinks
# from imblearn.under_sampling import CondensedNearestNeighbour
# from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import NeighbourhoodCleaningRule

X_sample, y_sample = NeighbourhoodCleaningRule(kind_sel='all', n_neighbors=5).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

# X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, random_state=10)

# model3 = SVC(random_state=42)
# model3.fit(X_train, y_train)

# print("학습용 : ", model3.score(X_train, y_train))
# print("검증용 : ", model3.score(X_test, y_test))

# pred3 = model3.predict(X_test)

# print(classification_report(y_test, pred3))

0    9721
1     100
Name: y, dtype: int64

In [None]:
# 오버샘플링

In [40]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

from imblearn.over_sampling import RandomOverSampler

X_sample, y_sample = RandomOverSampler(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

0    9900
1    9900
Name: y, dtype: int64

In [41]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN

X_sample, y_sample = ADASYN(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

0    9900
1    9899
Name: y, dtype: int64

In [42]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

X_sample, y_sample = SMOTE(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

0    9900
1    9900
Name: y, dtype: int64

In [43]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import ADASYN
# from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

X_sample, y_sample = SMOTEENN(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

1    8941
0    8645
Name: y, dtype: int64

In [44]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

# from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import ADASYN
# from imblearn.over_sampling import SMOTE
# from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

X_sample, y_sample = SMOTETomek(random_state=0).fit_resample(X, y)

X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])

y_samp.y.value_counts()

0    9653
1    9653
Name: y, dtype: int64