In [57]:
import pandas as pd
# Load the three College CSV files (training features, test features,
# training labels), all decoded with the cp949 codec.
X_train, X_test, y_train = (
    pd.read_csv(csv_path, encoding="cp949")
    for csv_path in (
        "../data/연습문제/College_X_train.csv",
        "../data/연습문제/College_X_test.csv",
        "../data/연습문제/College_y_train.csv",
    )
)

In [58]:
### Dataset preprocessing

# Set the test-set IDs aside before the column is dropped; they are needed
# again when the submission file is written.
ID = X_test["ID"].copy()

# Remove the identifier columns from both feature frames and from the labels.
X_train = X_train.drop(["ID", "Name"], axis="columns")
X_test = X_test.drop(["ID", "Name"], axis="columns")
y_train = y_train.drop("ID", axis="columns")

In [59]:
# Display (rows, columns) of the training features after ID/Name were dropped.
X_train.shape

(621, 17)

In [60]:
### Missing-value check (TODO: this cell contains no code; e.g. X_train.isna().sum() would perform the check)

In [61]:
### Numeric column preprocessing
## Percentage-valued columns are rescaled from the 0-100 range to fractions.

# 'S.F.Ratio' is intentionally not listed: it is a student/faculty ratio
# (students per faculty member), not a percentage, so dividing it by 100 would
# be a unit error.
# NOTE(review): this relies on the ISLR College data description; confirm that
# these CSVs follow it.
col_per = ['Top10perc', 'Top25perc', 'PhD', 'Terminal', 'Grad.Rate', 'perc.alumni']

# NOTE: not idempotent; re-running this cell divides the columns by 100 again.
X_train[col_per] = X_train[col_per] / 100
X_test[col_per] = X_test[col_per] / 100

In [62]:
# List every column whose dtype is not `object`.
X_train.select_dtypes(exclude=["object"]).columns

Index(['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad',
       'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD',
       'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate'],
      dtype='object')

In [63]:
# Columns removed from both feature frames before modelling.
# NOTE(review): the selection rationale is not recorded in this notebook;
# presumably these overlap with columns that are kept. Confirm.
col_del = ['Apps', 'Accept', 'F.Undergrad', 'Top25perc', 'Terminal']
X_train, X_test = (frame.drop(col_del, axis=1) for frame in (X_train, X_test))

In [64]:
### Train/validation split
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation. The split is stratified on
# the label and seeded so that it is reproducible.
X_TRAIN, X_VAL, y_TRAIN, y_VAL = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=2022
)

# One shape per line, in the order X_TRAIN, X_VAL, y_TRAIN, y_VAL.
print(X_TRAIN.shape, X_VAL.shape, y_TRAIN.shape, y_VAL.shape, sep="\n")

(558, 12)
(63, 12)
(558, 1)
(63, 1)


In [65]:
### Encoding
## Skipped: there are no categorical columns (see the dtypes listed by info() below).

X_TRAIN.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 91 to 23
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Enroll       558 non-null    int64  
 1   Top10perc    558 non-null    float64
 2   P.Undergrad  558 non-null    int64  
 3   Outstate     558 non-null    int64  
 4   Room.Board   558 non-null    int64  
 5   Books        558 non-null    int64  
 6   Personal     558 non-null    int64  
 7   PhD          558 non-null    float64
 8   S.F.Ratio    558 non-null    float64
 9   perc.alumni  558 non-null    float64
 10  Expend       558 non-null    int64  
 11  Grad.Rate    558 non-null    float64
dtypes: float64(5), int64(7)
memory usage: 56.7 KB


In [66]:
### Scaling
## Standardise (z-score) the numeric columns
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training part only, then apply that same fitted
# transform to the training, validation and test features.
scale = StandardScaler()
scale.fit(X_TRAIN)
X_TRAIN, X_VAL, X_TEST = (
    scale.transform(part) for part in (X_TRAIN, X_VAL, X_test)
)

# Flatten the label frames into 1-D arrays before they are passed to the models.
y_TRAIN = y_TRAIN.to_numpy().ravel()
y_VAL = y_VAL.to_numpy().ravel()

print(y_TRAIN.shape, y_VAL.shape, sep="\n")

(558,)
(63,)


In [67]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier 
from sklearn.metrics import roc_curve, auc 

In [68]:
### Random Forest
## roc_curve cannot tell which of the string labels is the positive class, so
## pos_label must be passed explicitly (only 0/1 or -1/1 labels default to 1).

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features="sqrt",
    random_state=2022,
)
rf.fit(X_TRAIN, y_TRAIN)

# Pick the probability column of the "Yes" class by looking it up in
# rf.classes_ instead of hard-coding index 1, so the score always corresponds
# to the pos_label given to roc_curve, however the labels happen to sort.
score_rf = rf.predict_proba(X_VAL)[:, list(rf.classes_).index("Yes")]

# Validation ROC AUC with "Yes" as the positive class.
fpr, tpr, thresholds = roc_curve(y_VAL, score_rf, pos_label="Yes")
auc_rf = auc(fpr, tpr)
print(auc_rf)


0.9411764705882353


In [69]:
### Bagging

# Shallow decision tree used as the base learner of the bagging ensemble.
dtr = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=10,
)
bag = BaggingClassifier(
    estimator=dtr,
    n_estimators=500,
    random_state=2022,
)
bag.fit(X_TRAIN, y_TRAIN)

# Pick the probability column of the "Yes" class by looking it up in
# bag.classes_ instead of hard-coding index 1, so the score always corresponds
# to the pos_label given to roc_curve, however the labels happen to sort.
score_bag = bag.predict_proba(X_VAL)[:, list(bag.classes_).index("Yes")]

# Validation ROC AUC with "Yes" as the positive class.
fpr, tpr, thresholds = roc_curve(y_VAL, score_bag, pos_label="Yes")
auc_bag = auc(fpr, tpr)
print(auc_bag)

0.9360613810741688


In [70]:
### AdaBoost

# Shallow decision tree used as the base learner of the boosting ensemble.
dtr = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=10,
)
# NOTE(review): an explicit algorithm='SAMME' argument had been left commented
# out here, so the library default is used. Which boosting variant that is
# depends on the installed scikit-learn version; confirm the intended one.
ada = AdaBoostClassifier(
    estimator=dtr,
    n_estimators=500,
    learning_rate=0.5,
    random_state=2022,
)
ada.fit(X_TRAIN, y_TRAIN)

# Pick the probability column of the "Yes" class by looking it up in
# ada.classes_ instead of hard-coding index 1, so the score always corresponds
# to the pos_label given to roc_curve, however the labels happen to sort.
score_ada = ada.predict_proba(X_VAL)[:, list(ada.classes_).index("Yes")]

# Validation ROC AUC with "Yes" as the positive class.
fpr, tpr, thresholds = roc_curve(y_VAL, score_ada, pos_label="Yes")
auc_ada = auc(fpr, tpr)
print(auc_ada)





0.959079283887468


In [72]:
### Submit results

# Score the test set with the AdaBoost model (the highest validation AUC of
# the three models printed earlier in this notebook). The "Yes" probability
# column is looked up in ada.classes_ rather than assumed to sit at index 1.
y_score = ada.predict_proba(X_TEST)[:, list(ada.classes_).index("Yes")]

# Pair each saved test ID with its predicted probability and write the CSV
# without the DataFrame index.
obj = {
    "ID": ID,
    "prob_Private": y_score,
}
result = pd.DataFrame(obj)
result.to_csv("연습문제2.csv", index=False)