In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

In [None]:
diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
y = pd.DataFrame(diabetes.target)

#--

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.2,
                                                    random_state = 2023,
                                                    )
x_train = pd.DataFrame(x_train.reset_index())
x_test = pd.DataFrame(x_test.reset_index())

y_train = pd.DataFrame(y_train.reset_index())

In [None]:
x_train = x_train.rename(columns ={'index':'cust_id'}, inplace = False)
x_test = x_test.rename(columns ={'index':'cust_id'}, inplace = False)
y_train.columns = ['cust_id', 'target']

# 1. 당뇨병 환자의 질병 진행정도를 예측해보자.
- 라이브러리, 데이터 확인
- 결측치, 이상치, 변수 확인
- 회귀모델을 사용하여 MSE 계산
- 제출은 cust_id, target 변수를 가진 데이터프레임 형식

### 1. 결측치, 이상치, 변수 확인 후 데이터 분리

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(353, 11)
(89, 11)
(353, 2)


In [None]:
#-- 결측치 확인
print(x_train.isnull().sum().sum())
print(x_test.isnull().sum().sum())
print(y_train.isnull().sum().sum())

0
0
0


In [None]:
#-- 이상치 확인
print(x_train.describe())
print("================")
print(x_test.describe())
print("================")
print(y_train.describe())

          cust_id         age         sex         bmi          bp          s1  \
count  353.000000  353.000000  353.000000  353.000000  353.000000  353.000000   
mean   212.634561    0.000804    0.000724    0.000640   -0.000326    0.001179   
std    126.668903    0.047617    0.047673    0.048141    0.046585    0.047891   
min      0.000000   -0.107226   -0.044642   -0.084886   -0.112399   -0.126781   
25%    105.000000   -0.038207   -0.044642   -0.035307   -0.033213   -0.033216   
50%    210.000000    0.005383   -0.044642   -0.006206   -0.005670   -0.002945   
75%    322.000000    0.038076    0.050680    0.030440    0.032201    0.027326   
max    441.000000    0.110727    0.050680    0.170555    0.125158    0.153914   

               s2          s3          s4          s5          s6  
count  353.000000  353.000000  353.000000  353.000000  353.000000  
mean     0.001110   -0.000452    0.000901    0.001446    0.000589  
std      0.048248    0.048600    0.048045    0.047160    0.048122 

In [None]:
#-- 변수 확인
print(y_train.head(5))
#-- 불필요한 cust_id 변수 제거

cust_id = x_test['cust_id'].copy()
x_train = x_train.drop('cust_id', axis = 1)
x_test = x_test.drop('cust_id', axis = 1)

   cust_id  target
0        4   135.0
1      318   109.0
2      301    65.0
3      189    79.0
4      288    80.0


In [None]:
#-- 학습, 검증 데이터로 분리
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train['target'],
                                                  # stratify=y_train['target'],
                                                  test_size = 0.2,
                                                  random_state = 2023
                                                  )

In [None]:
print(x_train.shape)
print(x_val.shape)

print(y_train.shape)
print(y_val.shape)

(282, 10)
(71, 10)
(282,)
(71,)


### 2. 모델 불러오기

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state = 2023)
model.fit(x_train,y_train)

In [None]:
#-- 모델 예측
y_pred = model.predict(x_val)
print(y_pred[:5])

[206.42 207.97 100.51  98.86 126.86]


In [None]:
#-- 모델 성능 평가
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_val, y_pred) #-- 실제값, 예측값
r2 = r2_score(y_val, y_pred)

In [None]:
print(mse) #-- mse
print(r2) #-- re
rmse = mse ** 0.5 #-- 루트
print(rmse)

3620.4645070422534
0.28792015815082017
60.17029588627808


### 3.제출 x_test를 넣었을 때 예측값을 제출해야함

In [None]:
y_result = model.predict(x_test)

result = pd.DataFrame({
    'cust_id' : cust_id,
    "target" : y_result,
})

In [None]:
result[:5]

Unnamed: 0,cust_id,target
0,280,173.93
1,412,246.28
2,68,79.49
3,324,159.0
4,101,91.81


# 2. 레스토랑의 팁 예측하기
- 데이터의 결측치, 이상치, 변수에 대해 처리
- 회귀모델을 사용하여 Rsq, MSE값을 산출하시오

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = sns.load_dataset('tips')

In [3]:
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
x = df.drop(['tip'], axis =1)
y = df['tip']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,
                                                    test_size = 0.2,
                                                    random_state = 2023)


In [14]:
x_test = pd.DataFrame(x_test.reset_index())
x_train = pd.DataFrame(x_train.reset_index())
y_test = pd.DataFrame(y_test.reset_index())
y_train = pd.DataFrame(y_train.reset_index())

In [15]:
x_test.rename(columns = {'index' : 'cust_id'}, inplace = True)
x_train.rename(columns = {'index' : 'cust_id'}, inplace = True)
y_train.columns = ['cust_id', 'target']

In [31]:
#-- 1. 데이터 확인
# print(x_train.shape) (195, 7)
# print(x_test.shape) (49, 7)
# print(y_train.shape) (195, 2)

#-- 결측치 확인
# print(x_train.isnull().sum().sum())
# print(x_test.isnull().sum().sum())
# print(y_train.isnull().sum().sum())

#-- 이상치 확인
print(x_train.describe(include ='category'))
# print(x_test.describe())
# print(y_train.describe())

         sex smoker  day    time
count    195    195  195     195
unique     2      2    4       2
top     Male     No  Sat  Dinner
freq     125    120   71     142


In [33]:
#-- 변수 처리
cust_id = x_test['cust_id'].copy()

x_train = x_train.drop(columns = ['cust_id'])
x_test = x_test.drop(columns = ['cust_id'])

In [35]:
#-- 원핫 인코딩
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

print(x_train.info())
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   195 non-null    float64
 1   size         195 non-null    int64  
 2   sex_Male     195 non-null    uint8  
 3   sex_Female   195 non-null    uint8  
 4   smoker_Yes   195 non-null    uint8  
 5   smoker_No    195 non-null    uint8  
 6   day_Thur     195 non-null    uint8  
 7   day_Fri      195 non-null    uint8  
 8   day_Sat      195 non-null    uint8  
 9   day_Sun      195 non-null    uint8  
 10  time_Lunch   195 non-null    uint8  
 11  time_Dinner  195 non-null    uint8  
dtypes: float64(1), int64(1), uint8(10)
memory usage: 5.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   49 non-null     float64
 1   size   

In [38]:
#-- 데이터 분리
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train['target'],
                                                  test_size = 0.2,
                                                  random_state = 2023,
                                                  # stratify= y_train['target']
                                                  )

In [41]:
print(x_train.shape)
print(y_train.shape)

print(x_val.shape)
print(y_val.shape)

(156, 12)
(156,)
(39, 12)
(39,)


In [62]:
#-- 2. 모델링 및 성능평가

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state = 2023)
model.fit(x_train, y_train)
pred = model.predict(x_val)


In [63]:
#-- 성능 평가

from sklearn.metrics import mean_absolute_error, r2_score

mse = mean_absolute_error(y_val, pred)
r2 = r2_score(y_val, pred)

In [64]:
print(mse)
print(r2)

0.7400974358974362
0.6214234662421672


In [67]:
#-- 3. x_test 제출
y_result = model.predict(x_test)
result = pd.DataFrame({
    'cust_id' : cust_id,
    'target' : y_result,
})

In [68]:
print(result[:5])

   cust_id  target
0      154  3.1114
1        4  2.8938
2       30  1.7715
3       75  1.6389
4       33  3.1873


# 3. 붓꽃 종류 예측

In [113]:
############### 실기환경 복사 영역 ############### import pandas as pd
import numpy as np
# 실기 시험 데이터셋으로 셋팅하기 (수정금지)
from sklearn.datasets import load_iris
# Iris 데이터셋을 로드
iris = load_iris()
x = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y = iris.target # 'setosa'=0, 'versicolor'=1, 'virginica'=2
y = np.where(y>0, 1, 0) # setosa 종은 0, 나머지 종은 1로 변경
# 실기 시험 데이터셋으로 셋팅하기 (수정금지)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    stratify=y,
                                                    random_state = 2023)

x_test = pd.DataFrame(x_test)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
y_train.columns = ['species']
# 결측치 삽입
x_test['sepal_length'].iloc[0] = None
x_train['sepal_length'].iloc[0] = None
# 이상치 삽입
x_train['sepal_width'].iloc[0] = 150 ############### 실기환경 복사 영역 ###############
### 참고사항 ###
# y_test 는 실기 문제상에 주어지지 않음
# ★Tip : X를 대문자로 쓰지말고 소문자 x로 쓰세요. 시험에서 실수하기 쉽습니다.(문제풀기 전에 소문자로 변경!) # (참고 : 보통 X는 2차원 배열(행렬)이기 때문에 대문자로 쓰고, y는 1차원 배열(벡터)이기 때문에 소문자로 씀)
# (참고) 실기시험 데이터 형식 (실제 시험장에서는 다를 수 있으니 반드시 체크) # X_test = pd.read_csv("data/X_test.csv")
# X_train = pd.read_csv("data/X_train.csv")
# y_train = pd.read_csv("data/y_train.csv")

### 1. 데이터 분석

In [107]:
print(x_train.shape) #-- (120, 4)
print(y_train.shape) #-- (120, 1)
# setosa 종은 0, 나머지 종은 1로 변경

(120, 4)
(120, 1)


In [112]:
print(x_train.info())
print(y_train.info()) #-- 범주형으로 변경해야 할듯?

<class 'pandas.core.series.Series'>
Int64Index: 120 entries, 2 to 44
Series name: sepal_length
Non-Null Count  Dtype  
--------------  -----  
120 non-null    float64
dtypes: float64(1)
memory usage: 1.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   species  120 non-null    int64
dtypes: int64(1)
memory usage: 1.1 KB
None


### 2. 데이터 결측치, 이상치, 변수 처리 후 데이터 분리

In [120]:
# print(x_train.isnull().sum()) #-- sepal_length 1개
# print(y_train.isnull().sum()) #-- 없음
# print(x_test.isnull().sum()) #-- sepal_length 1개

#-- 중앙값으로 대체
sepal_length_median = x_train['sepal_length'].median()
# print(sepal_length_median)

x_train['sepal_length'] = x_train['sepal_length'].fillna(sepal_length_median)
x_test['sepal_length'] = x_test['sepal_length'].fillna(sepal_length_median)

In [121]:
#-- 이상치 확인
print(x_train.describe())
print(x_test.describe())
print(y_train.describe())

#-- 연속형 데이터 이므로 이상치를 중앙값으로 대체

       sepal_length  sepal_width  petal_length  petal_width
count    120.000000     120.0000    120.000000   120.000000
mean       5.848333       4.2950      3.816667     1.226667
std        0.747093      13.4191      1.798848     0.780512
min        4.300000       2.2000      1.100000     0.100000
25%        5.175000       2.8000      1.575000     0.300000
50%        6.000000       3.0000      4.400000     1.350000
75%        6.400000       3.4000      5.225000     1.800000
max        7.400000     150.0000      6.900000     2.500000
       sepal_length  sepal_width  petal_length  petal_width
count     30.000000    30.000000     30.000000     30.00000
mean       5.593333     3.000000      3.523333      1.09000
std        0.697252     0.522593      1.631518      0.68549
min        4.600000     2.000000      1.000000      0.10000
25%        5.025000     2.625000      1.600000      0.35000
50%        5.500000     3.000000      4.050000      1.15000
75%        5.900000     3.300000      4.

In [135]:
x_train['sepal_width'] = np.where((x_train['sepal_width'] >= 10),x_test['sepal_width'].max(), x_train['sepal_width'])

print(x_train.describe())
print(x_test.describe())
print(y_train.describe())

       sepal_length  sepal_width  petal_length  petal_width
count    120.000000   120.000000    120.000000   120.000000
mean       5.848333     3.080000      3.816667     1.226667
std        0.747093     0.425174      1.798848     0.780512
min        4.300000     2.200000      1.100000     0.100000
25%        5.175000     2.800000      1.575000     0.300000
50%        6.000000     3.000000      4.400000     1.350000
75%        6.400000     3.400000      5.225000     1.800000
max        7.400000     4.400000      6.900000     2.500000
       sepal_length  sepal_width  petal_length  petal_width
count     30.000000    30.000000     30.000000     30.00000
mean       5.593333     3.000000      3.523333      1.09000
std        0.697252     0.522593      1.631518      0.68549
min        4.600000     2.000000      1.000000      0.10000
25%        5.025000     2.625000      1.600000      0.35000
50%        5.500000     3.000000      4.050000      1.15000
75%        5.900000     3.300000      4.

In [137]:
#-- 데이터 분리
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train['species'],
                                                  random_state = 2023,
                                                  test_size = 0.2,
                                                  stratify= y_train['species']
                                                  )

### 2. 모델링 및 성능평가

In [159]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = 2023)
model.fit(x_train, y_train)

y_pred = model.predict(x_val)

In [160]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
acc = accuracy_score(y_val, y_pred) # (실제값, 예측값)
f1 = f1_score(y_val, y_pred) # (실제값, 예측값)
# 다중분류일 경우 f1 = f1_score(y_val, y_pred, average = 'macro')
auc = roc_auc_score(y_val, y_pred) # (실제값, 예측값)

In [161]:
print(f1)
print(acc)
print(auc)

1.0
1.0
1.0


In [162]:
# 1. 특정 클래스로 분류할 경우 (predict)
y_result = model.predict(x_test)
print(y_result[:5])
# 2. 특정 클래스로 분류될 확률을 구할 경우 (predict_proba)

y_result_prob = model.predict_proba(x_test)
print(y_result_prob[:5])
# 이해해보기
result_prob = pd.DataFrame({
    'result': y_result,
    'prob_0': y_result_prob[:,0]
})
# setosa 일 확률 : y_result_prob[:,0] # 그 외 종일 확률 : y_result_prob[:,1]
print(result_prob[:5])

[1 1 1 0 1]
[[0.05 0.95]
 [0.   1.  ]
 [0.   1.  ]
 [1.   0.  ]
 [0.06 0.94]]
   result  prob_0
0       1    0.05
1       1    0.00
2       1    0.00
3       0    1.00
4       1    0.06


# 4. 타이타닉 생존자 분류 ✅ 데이터 분석 순서
1. 라이브러리 및 데이터 확인 2. 데이터 탐색(EDA)
3. 데이터 전처리 및 분리
4. 모델링 및 성능평가
5. 예측값 제출

타이타닉 생존자 예측 문제
- 데이터의 결측치, 중복 변수값에 대해 처리하고
- 분류모델을 사용하여 Accuracy, F1 score, AUC 값을 산출하시오.


In [289]:
############### 복사 영역 ############### # 실기 시험 데이터셋으로 셋팅하기 (수정금지)
# Seaborn의 내장 타이타닉 데이터셋을 불러옵니다. import seaborn as sns
df = sns.load_dataset('titanic')
x = df.drop('survived', axis=1)
y = df['survived']
# 실기 시험 데이터셋으로 셋팅하기 (수정금지)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify = y,
                                                    test_size=0.2,
                                                    random_state = 2023)
x_test = pd.DataFrame(x_test)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test) # 평가용
x_test.reset_index()
y_train.columns = ['target']
y_test.columns = ['target'] ############### 복사 영역 ###############
### 참고사항 ###
# y_test 는 실기 문제상에 주어지지 않음
# ★Tip : X를 대문자로 쓰지말고 소문자 x로 쓰세요. 시험에서 실수하기 쉽습니다.(문제풀기 전에 소문자로 변경!) # (참고 : 보통 X는 2차원 배열(행렬)이기 때문에 대문자로 쓰고, y는 1차원 배열(벡터)이기 때문에 소문자로 씀)
# (참고) 실기시험 데이터 형식 (실제 시험장에서는 다를 수 있으니 반드시 체크) # X_test = pd.read_csv("data/X_test.csv")
# X_train = pd.read_csv("data/X_train.csv")
# y_train = pd.read_csv("data/y_train.csv")

### 1. 데이터 확인

In [275]:
print(x_train.shape) #-- (712, 14)
print(y_train.shape)#-- (712, 1)
print(x_test.shape)#-- (179, 14)

(712, 14)
(712, 1)
(179, 14)


In [276]:
x_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
517,3,male,,0,0,24.15,Q,Third,man,True,,Queenstown,no,True
861,2,male,21.0,1,0,11.5,S,Second,man,True,,Southampton,no,False
487,1,male,58.0,0,0,29.7,C,First,man,True,B,Cherbourg,no,True
58,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False


In [290]:
# print(x_train.info())
#-- 결측치 제거
# print(x_train.isnull().sum()) #-- age 133, deck 548, embarked 2 , embark_town 2
#-- 덱은 결측치가 너무 많으니 버림
x_train = x_train.drop('deck', axis = 1)
x_test = x_test.drop('deck', axis = 1)

x_train = x_train.drop('class', axis = 1)
x_test = x_test.drop('class', axis = 1)

x_train = x_train.drop('embark_town', axis = 1)
x_test = x_test.drop('embark_town', axis = 1)

x_train = x_train.drop('alive', axis = 1)
x_test = x_test.drop('alive', axis = 1)
# print(x_train['embarked'].value_counts()) #-- S
# print(x_train['embark_town'].value_counts()) #-- Southampton

# print(x_train.isnull().sum()) #-- age 133, embarked 2 , embark_town 2
age_median = x_train['age'].median()

x_train['age'] = x_train['age'].fillna(age_median)
x_train['embarked'] = x_train['embarked'].fillna('S')

x_test['age'] = x_test['age'].fillna(age_median)
x_test['embarked'] = x_test['embarked'].fillna('S')

print(x_train.isnull().sum().sum())
print(x_test.isnull().sum().sum())
print(y_train.isnull().sum().sum())

0
0
0


In [281]:
#-- 이상치 제거
print(x_train.describe())
print(x_test.describe())
print(y_train.describe())

           pclass         age       sibsp       parch        fare
count  712.000000  712.000000  712.000000  712.000000  712.000000
mean     2.307584   29.203188    0.518258    0.372191   31.741836
std      0.834926   12.956053    1.094522    0.792341   45.403910
min      1.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000   22.000000    0.000000    0.000000    7.895800
50%      3.000000   28.000000    0.000000    0.000000   14.454200
75%      3.000000   35.000000    1.000000    0.000000   31.275000
max      3.000000   74.000000    8.000000    6.000000  512.329200
           pclass         age       sibsp       parch        fare
count  179.000000  179.000000  179.000000  179.000000  179.000000
mean     2.312849   29.991620    0.541899    0.418994   34.043364
std      0.842950   13.287917    1.137797    0.859760   64.097184
min      1.000000    1.000000    0.000000    0.000000    0.000000
25%      2.000000   24.000000    0.000000    0.000000    7.925000
50%      3

In [291]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [292]:
#-- 데이터 분리
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train['target'],
                                                  test_size = 0.2,
                                                  random_state = 2023,
                                                  stratify = y_train['target']
                                                  )

### 모델링 및 평가

In [307]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = 2023)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

In [308]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
f1 = f1_score(y_val, y_pred)
acc = accuracy_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred)
# print(help('sklearn.metrics'))

In [309]:
print(f1)
print(acc)
print(auc)

0.8108108108108109
0.8531468531468531
0.8465909090909092


### 3. 제출

In [316]:
y_result = model.predict(x_test)
y_result_prob = model.predict_proba(x_test)

result = pd.DataFrame({
    'result' : y_result
})
result.to_csv('test.csv', index=False)


In [317]:
df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,result
0,1
1,1
2,0
3,0
4,0
