# 빅데이터 분석 기사 실기 - 제2유형

---

## 머신 러닝 - 지도 학습
- 분류(범주 예측)
- 회귀(수치 예측)

## 작업 흐름

#### 1. 데이터 읽기

#### 2. 데이터 탐색
  - 데이터 크기 및 변수 타입 확인
  
#### 3. 데이터 전처리
  - 필요 없는 컬럼 삭제
  - 결측치 처리
  - 범주형 변수 인코딩
  - 수치형 변수 스케일링
  
#### 4. 데이터 분리
  - 학습 데이터, 평가 데이터 분리
  
#### 5. 모델 생성
  - 머신 러닝 모델 선택, 하이퍼 파라미터 설정
  
#### 6. 학습
  - 학습 데이터로 학습
  
#### 7. 예측
  - 평가 데이터로 예측
  
#### 8. 평가
  - 분류와 회귀에 따라 평가
  
#### 9. 최종 결과값 예측 및 제출

## sklearn 패키지 구성

```
sklearn
│
├── 01 preprocessing (전처리)
│   │
│   ├── 스케일러
│   │   ├── MinMaxScaler
│   │   ├── RobustScaler
│   │   └── StandardScaler
│   │
│   └── 인코더
│       ├── LabelEncoder
│       └── OneHotEncoder
│  
├── 02 모델학습
│   │
│   ├── ensemble
│   │   ├── AdaBoostClassifier
│   │   ├── GradientBoostingClassifier
│   │   ├── RandomForestClassifier
│   │   └── RandomForestRegressor
│   │
│   ├── linear_model
│   │   ├── LogisticRegression
│   │   └── RidgeClassifier
│   │
│   ├── svm
│   │   ├── SVC
│   │   └── SVR
│   │
│   └── tree
│       ├── DecisionTreeClassifier
│       ├── DecisionTreeRegressor
│       ├── ExtraTreeClassifier
│       └── ExtraTreeRegressor
│
├── 03 모델평가
│   │
│   ├── metrics
│   │   ├── accuracy_score
│   │   ├── classification_report
│   │   ├── confusion_matrix
│   │   ├── f1_score
│   │   ├── log_loss
│   │   ├── mean_absolute_error
│   │   ├── mean_squared_error
│   │   └── roc_auc_score
│   │
│   └── model (정의된 모델에서 추출)
│       ├── predict
│       └── predict_proba
│
```

## 1. 와인 등급 예측(분류)

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_wine = pd.read_csv('data/wine.csv')
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [3]:
df_wine.shape

(178, 14)

In [4]:
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  targe

In [5]:
df_wine.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [6]:
df_wine['target'].value_counts()

1    71
0    59
2    48
Name: target, dtype: int64

In [7]:
df_wine.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [8]:
# [1] Data Split
from sklearn.model_selection import train_test_split

X_col = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 
         'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 
         'od280/od315_of_diluted_wines', 'proline']

y_col = 'target'

X_train, X_test, y_train, y_test = train_test_split(df_wine[X_col], df_wine[y_col])


# [2] Classifier
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics

# Classification report
print(metrics.classification_report(y_test, pred))

print(round(metrics.accuracy_score(y_test, pred),2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        14

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

1.0


## 2. 타이타닉 생존자 예측(분류)

### Read CSV file

In [9]:
df = pd.read_csv('data/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### 컬럼 삭제: PassengerId, Name, Ticket, Cabin

In [11]:
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

### 상관계수

In [12]:
df[['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare']].corr()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare
Survived,1.0,-0.338481,-0.035322,0.081629,-0.077221,0.257307
Pclass,-0.338481,1.0,0.083081,0.018443,-0.369226,-0.5495
SibSp,-0.035322,0.083081,1.0,0.414838,-0.308247,0.159651
Parch,0.081629,0.018443,0.414838,1.0,-0.189119,0.216225
Age,-0.077221,-0.369226,-0.308247,-0.189119,1.0,0.096067
Fare,0.257307,-0.5495,0.159651,0.216225,0.096067,1.0


### 결측치 확인

In [13]:
df.isnull().sum()

Survived      0
Pclass        0
Gender        0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### Embarked 결측치 채우기

In [14]:
# Embarked 비율(백분율)
df['Embarked'].value_counts() / df.shape[0] * 100

S    72.278339
C    18.855219
Q     8.641975
Name: Embarked, dtype: float64

In [15]:
df['Embarked'] = df['Embarked'].fillna('S')

In [16]:
df['Embarked'].isnull().sum()

0

### Age 결측치 채우기

In [17]:
# Age 결측치 비율(백분율)
df['Age'].isnull().sum() / len(df['Age']) * 100

19.865319865319865

In [18]:
# 나이 평균
df['Age'].mean()

29.69911764705882

#### 성별에 따른 나이 평균

In [19]:
# 남자 나이 평균
df[df['Gender'] == 'male']['Age'].mean()

30.72664459161148

In [20]:
# 여자 나이 평균
df[df['Gender'] == 'female']['Age'].mean()

27.915708812260537

#### 선실 등급('Pclass')별 나이 평균값

In [21]:
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [22]:
# Pclass 1 나이 평균값
df[df['Pclass'] == 1]['Age'].mean()

38.233440860215055

In [23]:
# Pclass 2 나이 평균값
df[df['Pclass']==2]['Age'].mean()

29.87763005780347

In [24]:
# Pclass 3 나이 평균값
df[df['Pclass']==3]['Age'].mean()

25.14061971830986

#### 성별(Gender), 선실 등급(Pclass)별 나이 평균값

In [25]:
# 나이 평균값: 남자, Pclass 1
df[(df['Gender']=='male') & (df['Pclass']==1)]['Age'].mean()

41.28138613861386

In [26]:
# 나이 평균값: 남자, Pclass 2
df[(df['Gender']=='male') & (df['Pclass']==2)]['Age'].mean()

30.74070707070707

In [27]:
# 나이 평균값: 남자, Pclass 3
df[(df['Gender']=='male') & (df['Pclass']==3)]['Age'].mean()

26.507588932806325

In [28]:
# 나이 평균값: 여자, Pclass 1
df[(df['Gender']=='female') & (df['Pclass']==1)]['Age'].mean()

34.61176470588235

In [29]:
# 나이 평균값: 여자, Pclass 2
df[(df['Gender']=='female') & (df['Pclass']==2)]['Age'].mean()

28.722972972972972

In [30]:
# 나이 평균값: 여자, Pclass 3
df[(df['Gender']=='female') & (df['Pclass']==3)]['Age'].mean()

21.75

#### Age 결측치 채우기

In [31]:
med_m_1 = df[(df['Gender']=='male')   & (df['Pclass']==1)]['Age'].mean()
med_m_2 = df[(df['Gender']=='male')   & (df['Pclass']==2)]['Age'].mean()
med_m_3 = df[(df['Gender']=='male')   & (df['Pclass']==3)]['Age'].mean()
med_f_1 = df[(df['Gender']=='female') & (df['Pclass']==1)]['Age'].mean()
med_f_2 = df[(df['Gender']=='female') & (df['Pclass']==2)]['Age'].mean()
med_f_3 = df[(df['Gender']=='female') & (df['Pclass']==3)]['Age'].mean()

In [32]:
df['Age'].isnull().sum()

177

In [33]:
df.loc[(df['Age'].isnull())&(df['Gender']=='male')  &(df['Pclass']==1),'Age'] = med_m_1
df.loc[(df['Age'].isnull())&(df['Gender']=='male')  &(df['Pclass']==2),'Age'] = med_m_2
df.loc[(df['Age'].isnull())&(df['Gender']=='male')  &(df['Pclass']==3),'Age'] = med_m_3
df.loc[(df['Age'].isnull())&(df['Gender']=='female')&(df['Pclass']==1),'Age'] = med_f_1
df.loc[(df['Age'].isnull())&(df['Gender']=='female')&(df['Pclass']==2),'Age'] = med_f_2
df.loc[(df['Age'].isnull())&(df['Gender']=='female')&(df['Pclass']==3),'Age'] = med_f_3

In [34]:
df['Age'].isnull().sum()

0

In [35]:
df.isnull().sum()

Survived    0
Pclass      0
Gender      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [36]:
df

Unnamed: 0,Survived,Pclass,Gender,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.00,1,0,7.2500,S
1,1,1,female,38.00,1,0,71.2833,C
2,1,3,female,26.00,0,0,7.9250,S
3,1,1,female,35.00,1,0,53.1000,S
4,0,3,male,35.00,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.00,0,0,13.0000,S
887,1,1,female,19.00,0,0,30.0000,S
888,0,3,female,21.75,1,2,23.4500,S
889,1,1,male,26.00,0,0,30.0000,C


#### 컬럼 추가: Fsize = SibSp + Parch

In [37]:
df['Fsize'] = df['SibSp'] + df['Parch'] + 1

#### 컬럼 삭제: SibSp, Parch

In [38]:
df.drop(['SibSp','Parch'], axis=1, inplace=True)

In [39]:
df

Unnamed: 0,Survived,Pclass,Gender,Age,Fare,Embarked,Fsize
0,0,3,male,22.00,7.2500,S,2
1,1,1,female,38.00,71.2833,C,2
2,1,3,female,26.00,7.9250,S,1
3,1,1,female,35.00,53.1000,S,2
4,0,3,male,35.00,8.0500,S,1
...,...,...,...,...,...,...,...
886,0,2,male,27.00,13.0000,S,1
887,1,1,female,19.00,30.0000,S,1
888,0,3,female,21.75,23.4500,S,4
889,1,1,male,26.00,30.0000,C,1


### 생존자 예측

### 문자열을 숫자로 치환

In [40]:
df['Gender'] = df['Gender'].replace(['male','female'], [0, 1])
df['Embarked'] = df['Embarked'].replace(['S','C','Q'], [0, 1, 2])

### 생존여부 예측

#### 데이터 분리: 학습 데이터 + 테스트 데이터

In [41]:
X_col = ['Pclass','Gender','Age','Fare','Embarked','Fsize']

y_col = 'Survived'

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[X_col], df[y_col])

#### 1. RandomForest Classifier

In [43]:
# 모델 생성
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=123)

In [44]:
# 모델 학습
model.fit(X_train, y_train)

In [45]:
# 결과 예측
pred = model.predict(X_test)

In [46]:
from sklearn import metrics
print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       131
           1       0.81      0.61      0.70        92

    accuracy                           0.78       223
   macro avg       0.79      0.75      0.76       223
weighted avg       0.78      0.78      0.77       223



In [47]:
# 정확도 확인
print(round(metrics.accuracy_score(pred, y_test),2))

0.78


In [48]:
# ROC-AUC Score 확인
print(round(metrics.roc_auc_score(pred, y_test),2))

0.79


## 3. 당뇨병 진척도 예측(회귀)

In [49]:
df = pd.read_csv('./data/diabetes.csv')
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [50]:
df.shape

(442, 11)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [52]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-7.284269e-18,2.3485490000000002e-17,-2.08732e-16,-4.5715070000000006e-17,-9.293722e-18,4.420798e-17,2.135044e-18,2.913707e-17,9.143013e-17,1.4317360000000002e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


In [53]:
df.isnull().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

In [54]:
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'target'],
      dtype='object')

In [55]:
# [1] Data Split
from sklearn.model_selection import train_test_split

X_col = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

y_col = 'target'

X_train, X_test, y_train, y_test = train_test_split(df[X_col], df[y_col])

# [2] Regressor
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics

# R squared
print(round(metrics.r2_score(y_test, pred),2))

# RMSE(Root Mean Squared Error)
print(round(metrics.mean_squared_error(y_test, pred)**0.5),2)

0.42
57 2


## 4. 집값 예측(회귀)

In [56]:
df = pd.read_csv('./data/boston.csv')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [57]:
df.shape

(506, 14)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [59]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [60]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

In [61]:
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')

In [62]:
# [1] Data Split
from sklearn.model_selection import train_test_split

X_col = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']

y_col = 'target'

X_train, X_test, y_train, y_test = train_test_split(df[X_col], df[y_col])

# [2] Regressor
#from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

#model = RandomForestRegressor()
model = XGBRegressor()

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics

# R squared
print(round(metrics.r2_score(y_test, pred),2))

# RMSE(Root Mean Squared Error)
print(round(metrics.mean_squared_error(y_test, pred)**0.5),2)

0.91
3 2


## (체험) 제 2 유형 예제

#### 데이터 설명

|칼럼영|칼럼 설명|
|---|---|
|회원ID|회원고유번호|
|총구매액|총 구매금액(원)|
|최대구매액|구매건당 최대구매금액(원)|
|환불금액|환불금액(원)|
|주구매상품|주로 구매한 품목(42개 품목 범주)|
|주구매지점|주로 구매한 저점명(24개 저점 범부)|
|방문일수|고객이 방문한 일수(일)|
|방문당구매건수|총구매건수/방문일수|
|주말방문비율|주말에 방문한 비율|
|구매주기|구매 주기(일)|
|성별|고객 성별(0:여자, 1:남자)|

## 데이터 읽기

In [63]:
train = pd.read_csv('./data/customer_train.csv')
train

Unnamed: 0,회원ID,총구매액,최대구매액,환불금액,주구매상품,주구매지점,방문일수,방문당구매건수,주말방문비율,구매주기,성별
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17,0
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1,0
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.000000,0.000000,1,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16,1
4,4,29050000,24000000,,보석,본 점,2,1.500000,0.000000,85,0
...,...,...,...,...,...,...,...,...,...,...,...
3495,3495,3175200,3042900,,골프,본 점,1,2.000000,1.000000,0,1
3496,3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40,1
3497,3497,75000,75000,,주방용품,창원점,1,1.000000,0.000000,0,0
3498,3498,1875000,1000000,,화장품,본 점,2,1.000000,0.000000,39,0


In [64]:
test = pd.read_csv('./data/customer_test.csv')
test

Unnamed: 0,회원ID,총구매액,최대구매액,환불금액,주구매상품,주구매지점,방문일수,방문당구매건수,주말방문비율,구매주기
0,3500,70900400,22000000,4050000.0,골프,부산본점,13,1.461538,0.789474,26
1,3501,310533100,38558000,48034700.0,농산물,잠실점,90,2.433333,0.369863,3
2,3502,305264140,14825000,30521000.0,가공식품,본 점,101,14.623762,0.083277,3
3,3503,7594080,5225000,,주방용품,부산본점,5,2.000000,0.000000,47
4,3504,1795790,1411200,,수산품,청량리점,3,2.666667,0.125000,8
...,...,...,...,...,...,...,...,...,...,...
2477,5977,82581500,23976000,,골프,부산본점,8,1.750000,0.642857,40
2478,5978,480000,480000,,섬유잡화,광주점,1,1.000000,0.000000,0
2479,5979,260003790,25750000,,남성 캐주얼,본 점,19,3.736842,0.915493,18
2480,5980,88991520,18120000,,육류,본 점,5,3.600000,0.444444,60


In [65]:
train.shape

(3500, 11)

In [66]:
test.shape

(2482, 10)

In [67]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   회원ID     3500 non-null   int64  
 1   총구매액     3500 non-null   int64  
 2   최대구매액    3500 non-null   int64  
 3   환불금액     1205 non-null   float64
 4   주구매상품    3500 non-null   object 
 5   주구매지점    3500 non-null   object 
 6   방문일수     3500 non-null   int64  
 7   방문당구매건수  3500 non-null   float64
 8   주말방문비율   3500 non-null   float64
 9   구매주기     3500 non-null   int64  
 10  성별       3500 non-null   int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 300.9+ KB


In [68]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2482 entries, 0 to 2481
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   회원ID     2482 non-null   int64  
 1   총구매액     2482 non-null   int64  
 2   최대구매액    2482 non-null   int64  
 3   환불금액     871 non-null    float64
 4   주구매상품    2482 non-null   object 
 5   주구매지점    2482 non-null   object 
 6   방문일수     2482 non-null   int64  
 7   방문당구매건수  2482 non-null   float64
 8   주말방문비율   2482 non-null   float64
 9   구매주기     2482 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 194.0+ KB


In [69]:
train.describe()

Unnamed: 0,회원ID,총구매액,최대구매액,환불금액,방문일수,방문당구매건수,주말방문비율,구매주기,성별
count,3500.0,3500.0,3500.0,1205.0,3500.0,3500.0,3500.0,3500.0,3500.0
mean,1749.5,91919250.0,19664240.0,24078220.0,19.253714,2.834963,0.307246,20.958286,0.376
std,1010.507298,163506500.0,31992350.0,47464530.0,27.174942,1.912368,0.289752,24.748682,0.484449
min,0.0,-52421520.0,-2992000.0,5600.0,1.0,1.0,0.0,0.0,0.0
25%,874.75,4747050.0,2875000.0,2259000.0,2.0,1.666667,0.027291,4.0,0.0
50%,1749.5,28222700.0,9837000.0,7392000.0,8.0,2.333333,0.25641,13.0,0.0
75%,2624.25,106507900.0,22962500.0,24120000.0,25.0,3.375,0.44898,28.0,1.0
max,3499.0,2323180000.0,706629000.0,563753000.0,285.0,22.083333,1.0,166.0,1.0


In [70]:
test.describe()

Unnamed: 0,회원ID,총구매액,최대구매액,환불금액,방문일수,방문당구매건수,주말방문비율,구매주기
count,2482.0,2482.0,2482.0,871.0,2482.0,2482.0,2482.0,2482.0
mean,4740.5,101027500.0,21770480.0,25547160.0,19.516922,2.819388,0.293812,20.28606
std,716.636007,173213200.0,35049190.0,59440740.0,25.973972,1.75455,0.2826,24.108756
min,3500.0,-37440000.0,-37440000.0,10000.0,1.0,1.0,0.0,0.0
25%,4120.25,5076868.0,2884350.0,2414000.0,2.0,1.75,0.023456,4.0
50%,4740.5,30516860.0,10752500.0,8100000.0,9.0,2.430952,0.25,13.0
75%,5360.75,126425500.0,26277000.0,22280900.0,26.75,3.375,0.423566,27.0
max,5981.0,2861238000.0,593225000.0,871514400.0,222.0,15.875,1.0,177.0


In [71]:
train.isnull().sum()

회원ID          0
총구매액          0
최대구매액         0
환불금액       2295
주구매상품         0
주구매지점         0
방문일수          0
방문당구매건수       0
주말방문비율        0
구매주기          0
성별            0
dtype: int64

In [72]:
test.isnull().sum()

회원ID          0
총구매액          0
최대구매액         0
환불금액       1611
주구매상품         0
주구매지점         0
방문일수          0
방문당구매건수       0
주말방문비율        0
구매주기          0
dtype: int64

In [73]:
# 컬럼 삭제
train.drop('회원ID', axis=1, inplace=True)
test.drop('회원ID', axis=1, inplace=True)

In [74]:
# 결측치 처리
median_tr = train['환불금액'].median()
train['환불금액'] = train['환불금액'].fillna(median_tr)

median_te = test['환불금액'].median()
test['환불금액'] = test['환불금액'].fillna(median_te)

In [75]:
# 피쳐 전처리
from sklearn.preprocessing import LabelEncoder

df_t = pd.concat([train,test])

# 범주형 변수 인코딩
le = LabelEncoder()
df_t['주구매상품'] = le.fit_transform(df_t['주구매상품'])
df_t['주구매지점'] = le.fit_transform(df_t['주구매지점'])

train = df_t.iloc[:3500].copy()
test = df_t.iloc[3500:].copy()

test.drop('성별', axis=1, inplace=True)

In [76]:
# 정규화
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
cols = ['총구매액','최대구매액','환불금액','방문일수','방문당구매건수','주말방문비율','구매주기']

train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

In [77]:
# [1] Data Split
from sklearn.model_selection import train_test_split

X_col = ['총구매액', '최대구매액', '환불금액', '주구매상품', '주구매지점', '방문일수',
         '방문당구매건수', '주말방문비율', '구매주기']

y_col = '성별'

X_train, X_test, y_train, y_test = train_test_split(train[X_col], train[y_col], test_size = 0.2)


# [2] Classifier
#from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

#model = XGBClassifier()
#model = RandomForestClassifier(n_estimators=100, max_depth=4)
model = RandomForestClassifier(random_state=123)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics

# Classification report
print(metrics.classification_report(y_test, pred))

print(round(metrics.roc_auc_score(y_test, pred),2))

              precision    recall  f1-score   support

         0.0       0.66      0.82      0.73       440
         1.0       0.48      0.28      0.36       260

    accuracy                           0.62       700
   macro avg       0.57      0.55      0.54       700
weighted avg       0.59      0.62      0.59       700

0.55


#### 제출 파일 생성

In [78]:
# 예측
pred_test = model.predict(test)

# DataFrame 생성
result = pd.DataFrame({'pred': pred_test })

# 파일 쓰기
result.to_csv('result.csv', index=False) # 인덱스 제거

#### 제출 파일 확인

In [79]:
result_l = pd.read_csv('result.csv')
print(result_l)

      pred
0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
...    ...
2477   1.0
2478   0.0
2479   1.0
2480   0.0
2481   1.0

[2482 rows x 1 columns]


#### [참고] 확률값 예측하기

In [80]:
y_pred_probs = model.predict_proba(X_test)[:,1]

---

In [81]:
# End of file