In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# 1. 데이터 셋 로드
df = pd.read_csv('StudentsPerformance.csv')

In [13]:
# 데이터셋 : 특성, 타깃
X = df.drop(columns=['math score']) #학습용 입력 데이터
y = df['math score'] #타깃 : 답


In [14]:
# 범주형 데이터 -> 숫자 데이터
ctgy_data = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

In [15]:
# 전처리
preprocessor = ColumnTransformer(
    transformers=[
        ('cool', OneHotEncoder(handle_unknown='ignore'), ctgy_data)
    ],
    remainder='passthrough'
)

In [16]:
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]
)

In [17]:
# 데이터를 8:2로 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [None]:
#학습
model.fit(X_test, y_test)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cool', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [20]:
#예측
y_pred = model.predict(X_test)

In [24]:
#모델 평가
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [28]:
print('mse : ', mse)
print('r2 ; ', r2)

mse :  25.467306127793176
r2 ;  0.8953420170109068


In [29]:
# 필요한 라이브러리 불러오기
from sklearn.datasets import load_breast_cancer # 유방암 데이터셋 로드
from sklearn.model_selection import train_test_split # 학습용/테스트용 데이터 분리
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀 모델
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report # 평가 지표

In [41]:
# 1. 데이터 셋 준비
data = load_breast_cancer()

In [42]:
# 특성과 타깃으로 분류
X = data.data       #특성 데이터(다양한 세포 정보)
y = data.target     #레이블 (0 : 음성, 1 : 양성)


In [43]:
#print(X)
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 

In [44]:
#훈련 데이터와 테스트 데이터 구분
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [45]:
#학습 모델 서낵, 객체 생성, 학습
model = LogisticRegression(max_iter=10000)

In [46]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [47]:
#테스트 데이터로 예측
y_pred = model.predict(X_test)

In [48]:
# 평가 지표
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [50]:
# 5. 결과 출력
print("정확도(Accuracy):", acc) # 정확도 출력
print("정밀도(Precision):", prec) # 정밀도 출력
print("재현율(Recall):", recall) # 재현율 출력
print("F1-Score:", f1) # F1-Score 출력

정확도(Accuracy): 0.956140350877193
정밀도(Precision): 0.9459459459459459
재현율(Recall): 0.9859154929577465
F1-Score: 0.9655172413793104


In [51]:
# 혼동 행렬 출력
print("\n혼동 행렬 (Confusion Matrix):\n", confusion_matrix(y_test, y_pred))
# 혼동 행렬을 통해 TP, TN, FP, FN 값 확인


혼동 행렬 (Confusion Matrix):
 [[39  4]
 [ 1 70]]


In [55]:
# 분류 보고서 출력
print("분류 보고서 출력")
print(classification_report(y_test, y_pred))

분류 보고서 출력
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [57]:
###KNN

In [58]:
# 1. 필요한 라이브러리 임포트
from sklearn.datasets import load_iris # Iris 데이터셋 로드
from sklearn.model_selection import train_test_split # 데이터 분리
from sklearn.neighbors import KNeighborsClassifier # KNN 알고리즘 사용
from sklearn.metrics import classification_report, accuracy_score #성능평가

In [None]:
# 2. 데이터 로드 및 확인
iris = load_iris() # Iris 데이터셋 로드
X = iris.data # 특성 데이터 (꽃잎/꽃받침의 길이와 너비)
y = iris.target # 레이블 데이터 (품종: Setosa, Versicolor, Virginica)