In [1]:
import pandas as pd
from sklearn.datasets import make_classification
# Generate a two-feature binary dataset in which class 0 is weighted at 0.99.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
# Wrap features and labels in frames, then put them side by side on the
# shared default index.
dfX = pd.DataFrame(data=X, columns=['a', 'b'])
dfy = pd.DataFrame(data=y, columns=['y'])
df = dfX.join(dfy)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,1.347439,1.412824,0
2,0.537238,0.372730,0
3,2.134462,1.404819,0
4,2.315827,1.356858,0
...,...,...,...
9995,2.440385,1.695643,0
9996,-0.790502,0.194243,0
9997,1.878130,0.829500,0
9998,2.585933,1.927995,0


In [2]:
# Separate the feature matrix (independent variables) from the label column.
X1 = df.loc[:, ['a', 'b']]
y1 = df.loc[:, 'y']

In [3]:
# Per-class row counts: shows how imbalanced the dataset is.
df.y.value_counts()

0    9900
1     100
Name: y, dtype: int64

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Stratified 80/20 split so the rare class appears in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    stratify=y1,
    random_state=10,
)
# Baseline model fitted on the imbalanced dataset.
model1 = LogisticRegression(random_state=0)
model1.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of model1 on the training split and on the held-out split.
train_acc = model1.score(X_train, y_train)
test_acc = model1.score(X_test, y_test)
print(" 학습용:", train_acc)
print(" 검증용:", test_acc)

In [5]:
# Predicted class labels for the held-out split.
pred1 = model1.predict(X=X_test)

In [6]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels versus model1 predictions.
cm = confusion_matrix(y_true=y_test, y_pred=pred1)
cm
# The overall score is high (~0.99), but recall for the minority class
# drops to 10/(10+10) = 0.5.

array([[1980,    0],
       [  10,   10]], dtype=int64)

In [7]:
from sklearn.metrics import classification_report
# Check precision, recall and f1-score per class, with a focus on the
# minority class.
report1 = classification_report(y_test, pred1)
print(report1)
# Overall accuracy is high, but minority-class recall is only 0.5.

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1980
           1       1.00      0.50      0.67        20

    accuracy                           0.99      2000
   macro avg       1.00      0.75      0.83      2000
weighted avg       1.00      0.99      0.99      2000



In [8]:
# Balanced data: same generator settings as before, but without `weights`.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    flip_y=0,
    random_state=1,
)
dfX = pd.DataFrame(data=X, columns=['a', 'b'])
dfy = pd.DataFrame(data=y, columns=['y'])
# Both frames carry the same default index, so a plain join lines them up.
df2 = dfX.join(dfy)

In [9]:
# Per-class row counts of the balanced dataset.
df2.y.value_counts()

0    5000
1    5000
Name: y, dtype: int64

In [10]:
# Separate the feature matrix (independent variables) from the label column.
X2 = df2.loc[:, ['a', 'b']]
y2 = df2.loc[:, 'y']


In [11]:
# Stratified 80/20 split of the balanced data; rebinds the shared split names.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    stratify=y2,
    random_state=10,
)
# Model fitted on the balanced dataset.
model2 = LogisticRegression(random_state=42)
model2.fit(X=X_train, y=y_train)

In [12]:
# Mean accuracy of model2 on the training split and on the held-out split.
train_acc = model2.score(X_train, y_train)
test_acc = model2.score(X_test, y_test)
print(" 학습용:", train_acc)
print(" 검증용:", test_acc)

 학습용: 0.896125
 검증용: 0.891


In [13]:
# Predicted class labels for the held-out split of the balanced data.
pred2 = model2.predict(X=X_test)


In [14]:
report2 = classification_report(y_test, pred2)
print(report2)
# Accuracy and recall come out at similar levels here.

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1000
           1       0.90      0.87      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [15]:
# Sizes of the full imbalanced features/labels and of the first test-set predictions.
tuple(len(obj) for obj in (X1, y1, pred1))

(10000, 10000, 2000)

In [16]:
# For imbalanced data, evening out the class ratio with under-sampling,
# over-sampling or combined sampling is expected to improve precision.
# Regenerate the imbalanced arrays used by the resampling cells.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [17]:
# Under-sampling: much of the data is discarded, so samples carrying
# important characteristics can be lost.
# Simple sampling that randomly removes majority-class rows.
from imblearn.under_sampling import RandomUnderSampler
under_sampler = RandomUnderSampler(random_state=0)
X_sample, y_sample = under_sampler.fit_resample(X, y)
X_samp = pd.DataFrame(X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(y_sample, columns=['y'])
y_samp['y'].value_counts()

0    100
1    100
Name: y, dtype: int64

In [18]:
# Stratified 80/20 split of the randomly under-sampled data; rebinds the
# shared split names. y_train / y_test stay single-column DataFrames.
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model3 = LogisticRegression(random_state=42)
# Pass the 'y' column as a 1-D Series: fitting on the single-column DataFrame
# makes scikit-learn flatten it and emit a DataConversionWarning.
model3.fit(X_train, y_train['y'])

  y = column_or_1d(y, warn=True)


In [19]:
# Mean accuracy of model3 on the training split and on the held-out split.
train_acc = model3.score(X_train, y_train)
test_acc = model3.score(X_test, y_test)
print(" 학습용:", train_acc)
print(" 검증용:", test_acc)

 학습용: 0.8625
 검증용: 0.925


In [20]:
# Predicted class labels for the current held-out split.
pred3 = model3.predict(X=X_test)

In [21]:
# Per-class precision / recall / f1 for model3 on the current held-out split.
report3 = classification_report(y_test, pred3)
print(report3)

              precision    recall  f1-score   support

           0       0.95      0.90      0.92        20
           1       0.90      0.95      0.93        20

    accuracy                           0.93        40
   macro avg       0.93      0.93      0.92        40
weighted avg       0.93      0.93      0.92        40



In [29]:
# Over-sampling with SMOTE: the resampled labels are counted at the end of
# the cell to check the resulting class balance.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_sample, y_sample = smote.fit_resample(X, y)
X_samp = pd.DataFrame(X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(y_sample, columns=['y'])
y_samp['y'].value_counts()


0    9900
1    9900
Name: y, dtype: int64

In [30]:
# Stratified 80/20 split of the resampled frames X_samp / y_samp.
# NOTE(review): the split happens after resampling, so the held-out part is
# drawn from resampled data rather than the original distribution; consider
# splitting first and resampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X_samp,
    y_samp,
    test_size=0.2,
    stratify=y_samp,
    random_state=10,
)
# Rebinds `model3`, replacing any model previously stored under that name.
model3 = LogisticRegression(random_state=42)
# Pass the 'y' column as a 1-D Series: fitting on the single-column DataFrame
# makes scikit-learn flatten it and emit a DataConversionWarning.
model3.fit(X_train, y_train['y'])


  y = column_or_1d(y, warn=True)


In [31]:
# Mean accuracy of model3 on the training split and on the held-out split.
train_acc = model3.score(X_train, y_train)
test_acc = model3.score(X_test, y_test)
print(" 학습용:", train_acc)
print(" 검증용:", test_acc)

 학습용: 0.9096590909090909
 검증용: 0.9085858585858586


In [32]:
# Predicted class labels for the current held-out split.
pred3 = model3.predict(X=X_test)

In [33]:
# Per-class precision / recall / f1 for model3 on the current held-out split.
report3 = classification_report(y_test, pred3)
print(report3)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1980
           1       0.92      0.89      0.91      1980

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960



In [None]:
# Tomek links: of the samples that form Tomek links, remove the ones that
# belong to the majority class. This does not balance the classes 1:1.
from imblearn.under_sampling import TomekLinks
X_sample, y_sample = TomekLinks(sampling_strategy='majority').fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
# Printed explicitly: a bare expression in the middle of a cell is not
# displayed, and the frames are overwritten below.
print(y_samp.y.value_counts())

# Condensed nearest neighbour under-sampling; takes a long time to run.
# The import was previously split in the middle of the class name, which is
# a SyntaxError.
from imblearn.under_sampling import CondensedNearestNeighbour
X_sample, y_sample = CondensedNearestNeighbour(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()
# The classes are not balanced 1:1 by this method either.