In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler

diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data['Outcome'].value_counts())
diabetes_data.head(3)

Outcome
0    500
1    268
Name: count, dtype: int64


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


* Pregnancies: 임신 횟수
* Glucose: 포도당 부하 검사 수치
* BloodPressure: 혈압(mm Hg)
* SkinThickness: 팔 삼두근 뒤쪽의 피하지방 측정값(mm)
* Insulin: 혈청 인슐린(mu U/ml)
* BMI: 체질량지수(체중(kg)/(키(m))^2)
* DiabetesPedigreeFunction: 당뇨 내력 가중치 값
* Age: 나이
* Outcome: 클래스 결정 값(0또는 1)

In [51]:
print(diabetes_data.info())
print("\n")

print(diabetes_data.describe())
print("\n")

print('데이터 세트 Null 값 갯수 ',diabetes_data.isnull().sum().sum(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std  

In [56]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    
    accuracy = accuracy_score(y_test , pred)
    
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    
    f1 = f1_score(y_test,pred)
    
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬')
    print(confusion, "\n")
    
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc), "\n")


def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

In [53]:
X = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

def stscaler(df):
    sc = StandardScaler()
    df = sc.fit_transform(df)
    return df

X = stscaler(X)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [32]:
# 다중 회귀
lr = LinearRegression()
lr.fit(X_train, y_train)

print('LinearRegression 정확도: {0:.4f}'.format(lr.score(X_train, y_train)))
print('LinearRegression 정확도: {0:.4f}'.format(lr.score(X_test, y_test)))
print()

pred = lr.predict(X_test)
print(pred)

print(lr.predict(stscaler([[1, 85, 66, 29, 0, 26.6, 0.351, 31]])))

LinearRegression 정확도: 0.3040
LinearRegression 정확도: 0.2875

[ 0.15182454  0.22204731  0.62819444 -0.61155427 -0.05772226  0.1254447
  0.48467238  0.18596931  0.43984491  0.20482871  0.33758539 -0.18683003
 -0.59988364  0.35329657  0.36071883  0.1326454   0.83903015 -0.06682265
  0.61062817  0.66649942  0.63869762  0.20284461  0.44292079  0.19166942
  0.17795893 -0.21514919  0.24332576  0.38084482  0.18801498  0.71720681
  0.20337505 -0.1640005   0.44522016  0.01512445  0.41675192  0.46233264
 -0.00235424  0.35756007  0.63041493  0.95518635  0.4336128   0.20805064
  0.55303849  0.1417843   0.28368432  0.45784956  0.37889061  0.4274339
  0.2941686   0.78674955  0.67855563  0.54119071  0.16431667  0.2183193
  0.73279658 -0.05969463  0.33814218  0.62060285  0.0710649   0.59979353
  0.35362189  0.3903238   0.28029277  0.10793304  0.69838286  0.22523043
  0.65467606  0.59327166  0.24331589  0.36524227  0.34584214  0.03877743
  0.14371825  0.02751084  0.7705387  -0.06659139  0.13357592  0.5203

In [44]:
# 다항 회귀
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures()
poly.fit(X_train)

train_poly = poly.transform(X_train)
test_poly = poly.transform(X_test)

lr = LinearRegression()
lr.fit(train_poly, y_train)

print('LinearRegression 정확도: {0:.4f}'.format(lr.score(train_poly, y_train)))
print('LinearRegression 정확도: {0:.4f}'.format(lr.score(test_poly, y_test)))

print(lr.predict(poly.transform(stscaler([[1, 85, 66, 29, 0, 26.6, 0.351, 31]]))))

LinearRegression 정확도: 0.3861
LinearRegression 정확도: 0.1311
[0.39384981]


In [47]:
# 규제: 릿지
from sklearn.linear_model import Ridge

train = []
test = []

alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_poly, y_train)

    train.append(ridge.score(train_poly, y_train))
    print(f"train ridge =>  {alpha}: ", train)
    
    test.append(ridge.score(test_poly, y_test))
    print(f"test ridge =>  {alpha}: ", test)
    print("\n")

ridge = Ridge(alpha=0.01)
ridge.fit(train_poly, y_train)

print(ridge.score(train_poly, y_train))
print(ridge.score(test_poly, y_test))

train ridge =>  0.001:  [0.38612999158282535]
test ridge =>  0.001:  [0.13111704046514783]


train ridge =>  0.01:  [0.38612999158282535, 0.3861299910779743]
test ridge =>  0.01:  [0.13111704046514783, 0.13117950286860547]


train ridge =>  0.1:  [0.38612999158282535, 0.3861299910779743, 0.3861299407131704]
test ridge =>  0.1:  [0.13111704046514783, 0.13117950286860547, 0.13180274970953798]


train ridge =>  1:  [0.38612999158282535, 0.3861299910779743, 0.3861299407131704, 0.38612502143464533]
test ridge =>  1:  [0.13111704046514783, 0.13117950286860547, 0.13180274970953798, 0.13790049261759596]


train ridge =>  10:  [0.38612999158282535, 0.3861299910779743, 0.3861299407131704, 0.38612502143464533, 0.3857258770474419]
test ridge =>  10:  [0.13111704046514783, 0.13117950286860547, 0.13180274970953798, 0.13790049261759596, 0.187844315888836]


train ridge =>  100:  [0.38612999158282535, 0.3861299910779743, 0.3861299407131704, 0.38612502143464533, 0.3857258770474419, 0.37212478054923037]

In [48]:
# 규제: 라쏘
from sklearn.linear_model import Lasso

train = []
test = []

alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    
    lasso = Lasso(alpha=alpha)
    lasso.fit(train_poly, y_train)

    train.append(lasso.score(train_poly, y_train))
    print(f"train ridge =>  {alpha}: ", train)
    
    test.append(lasso.score(test_poly, y_test))
    print(f"test ridge =>  {alpha}: ", test)
    print("\n")

lasso = Lasso(alpha=0.01)
lasso.fit(train_poly, y_train)

print(lasso.score(train_poly, y_train))
print(lasso.score(test_poly, y_test))

train ridge =>  0.001:  [0.38541734732066624]
test ridge =>  0.001:  [0.20460365710663964]


train ridge =>  0.01:  [0.38541734732066624, 0.3589700485675049]
test ridge =>  0.01:  [0.20460365710663964, 0.3473096268310045]


train ridge =>  0.1:  [0.38541734732066624, 0.3589700485675049, 0.18537742420818526]
test ridge =>  0.1:  [0.20460365710663964, 0.3473096268310045, 0.18418357683161912]


train ridge =>  1:  [0.38541734732066624, 0.3589700485675049, 0.18537742420818526, 0.0]
test ridge =>  1:  [0.20460365710663964, 0.3473096268310045, 0.18418357683161912, -0.0014566278081014072]


train ridge =>  10:  [0.38541734732066624, 0.3589700485675049, 0.18537742420818526, 0.0, 0.0]
test ridge =>  10:  [0.20460365710663964, 0.3473096268310045, 0.18418357683161912, -0.0014566278081014072, -0.0014566278081014072]


train ridge =>  100:  [0.38541734732066624, 0.3589700485675049, 0.18537742420818526, 0.0, 0.0, 0.0]
test ridge =>  100:  [0.20460365710663964, 0.3473096268310045, 0.18418357683161912