In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from dmba import classificationSummary
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv("D:/dmba/eBayAuctions.csv")
df = df.rename(columns={'Competitive?' : 'Competitive'})

df.dtypes


Category         object
currency         object
sellerRating      int64
Duration          int64
endDay           object
ClosePrice      float64
OpenPrice       float64
Competitive       int64
dtype: object

In [77]:
# 1 변수 Competitive 를 결과 변수로 하는 데이터셋 학습 검증으로 분할
X = df.drop(columns=['Competitive', 'Category', 'ClosePrice', 'currency', 'endDay'])
y = df['Competitive']

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size= 0.4, random_state=1)


In [78]:
# 1-1 DecisionTreeClassifier 기본값으로 분류 트리 실행, 정확도
d_tree = DecisionTreeClassifier()
d_tree.fit(train_X, train_y)

classificationSummary(valid_y, d_tree.predict(valid_X))

Confusion Matrix (Accuracy 0.7034)

       Prediction
Actual   0   1
     0 261  92
     1 142 294


In [79]:
# 1-2 DecisionTreeClassifier 을 기본 추정량으로 AdaBoostClassifier 실행, 정확도
clf = AdaBoostClassifier(estimator=d_tree,
                         n_estimators=100,
                         random_state=10,
                         learning_rate=0.01)
clf.fit(train_X, train_y)
pred = clf.predict(valid_X)
print('AdaBoost 정확도 : {: .4f}'.format(accuracy_score(valid_y, pred)))




AdaBoost 정확도 :  0.7161


In [105]:
# 1-3. RandomForestClassifier 과 bagging 

rf_clf = RandomForestClassifier(random_state=3)
rf_clf.fit(train_X, train_y)
pred = rf_clf.predict(valid_X)
accuracy = accuracy_score(valid_y, pred)
print('랜덤 포레스트 정확도: {: .4f}'.format(accuracy))

uplift_df = valid_X.copy()
uplift_df.Competitive = 1
predTreatment = rf_clf.predict_proba(uplift_df)
uplift_df.Competitive = 0
predControl = rf_clf.predict_proba(uplift_df)

upliftResult_df = pd.DataFrame({
    'probCompetitive':predTreatment[:, 1],
    'probNonCompetitive':predControl[:, 1],
    'uplift':predTreatment[:,1] - predControl[:, 1],}, index=uplift_df.index)
print(upliftResult_df.head(5))

bagging = BaggingClassifier(DecisionTreeClassifier(random_state=1), 
                            n_estimators=100, random_state=1)
bagging.fit(train_X, train_y)
pred = bagging.predict(valid_X)
accuracy = accuracy_score(valid_y, pred)
classes = d_tree.classes_
classificationSummary(valid_y, bagging.predict(valid_X), class_names=classes)
print('bagging 정확도: {: .4f}'.format(accuracy))

랜덤 포레스트 정확도:  0.8933
      probCompetitive  probNonCompetitive  uplift
1276             0.95                0.95     0.0
1446             1.00                1.00     0.0
335              1.00                1.00     0.0
1458             0.97                0.97     0.0
2038             0.97                0.97     0.0
Confusion Matrix (Accuracy 0.8865)

        Prediction
 Actual delayed  ontime
delayed      92      75
 ontime      25     689
bagging 정확도:  0.8865


In [81]:
# 1-4 GradientBoostingClassifier 실행, 정확도
gbc = GradientBoostingClassifier(random_state=3)
gbc.fit(train_X, train_y)
pred = gbc.predict(valid_X)
accuracy = accuracy_score(valid_y, pred)
print('그래디언트 부스팅 정확도: {: .4f}'.format(accuracy))

그래디언트 부스팅 정확도:  0.7161


In [100]:
#2-1 데이터 전처리: 범주형 변수 변환, 예정된 출발 시간 구간 나누기

df = pd.read_csv("D:/dmba/FlightDelays.csv")

df.columns = [s.strip().replace(' ', '_') for s in df.columns]

df.DAY_WEEK = df.DAY_WEEK.astype('category')
print(df.DAY_WEEK.cat.categories)  # It can take one of three levels
print(df.DAY_WEEK.dtype)

df['new_dep_time'] = pd.cut(df['DEP_TIME'], bins=8)
df['new_dep_time_label'] =pd.cut(df['DEP_TIME'], bins=8, labels=['1', '2', '3', '4', '5', '6', '7', '8'])
print(df.dtypes)
df

Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')
category
CRS_DEP_TIME             int64
CARRIER                 object
DEP_TIME                 int64
DEST                    object
DISTANCE                 int64
FL_DATE                 object
FL_NUM                   int64
ORIGIN                  object
Weather                  int64
DAY_WEEK              category
DAY_OF_MONTH             int64
TAIL_NUM                object
Flight_Status           object
new_dep_time          category
new_dep_time_label    category
dtype: object


Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status,new_dep_time,new_dep_time_label
0,1455,OH,1455,JFK,184,01/01/2004,5935,BWI,0,4,1,N940CA,ontime,"(1170.0, 1460.0]",5
1,1640,DH,1640,JFK,213,01/01/2004,6155,DCA,0,4,1,N405FJ,ontime,"(1460.0, 1750.0]",6
2,1245,DH,1245,LGA,229,01/01/2004,7208,IAD,0,4,1,N695BR,ontime,"(1170.0, 1460.0]",5
3,1715,DH,1709,LGA,229,01/01/2004,7215,IAD,0,4,1,N662BR,ontime,"(1460.0, 1750.0]",6
4,1039,DH,1035,LGA,229,01/01/2004,7792,IAD,0,4,1,N698BR,ontime,"(880.0, 1170.0]",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,645,RU,644,EWR,199,1/31/2004,2761,DCA,0,6,31,N15555,ontime,"(590.0, 880.0]",3
2197,1700,RU,1653,EWR,213,1/31/2004,2497,IAD,0,6,31,N16976,ontime,"(1460.0, 1750.0]",6
2198,1600,RU,1558,EWR,199,1/31/2004,2361,DCA,0,6,31,N14902,ontime,"(1460.0, 1750.0]",6
2199,1359,RU,1403,EWR,199,1/31/2004,2216,DCA,0,6,31,N16961,ontime,"(1170.0, 1460.0]",5


In [101]:
#2-1. 학습 검증 데이터셋 분할, 단일 분류 트리와 adaboost 분류 정확도 비교
X = df.drop(columns=['CARRIER', 'DEST', 'FL_DATE', 'ORIGIN', 'TAIL_NUM','new_dep_time', 'Flight_Status'])
y = df['Flight_Status']


train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size= 0.4, random_state=1)

d_tree = DecisionTreeClassifier()
d_tree.fit(train_X, train_y)

classificationSummary(valid_y, d_tree.predict(valid_X))

clf = AdaBoostClassifier(estimator=d_tree,
                         n_estimators=500,
                         random_state=1)
clf.fit(train_X, train_y)
pred = clf.predict(valid_X)
print('AdaBoost 정확도 : {: .4f}'.format(accuracy_score(valid_y, pred)))


Confusion Matrix (Accuracy 0.8377)

       Prediction
Actual   0   1
     0 100  67
     1  76 638
AdaBoost 정확도 :  0.8343




In [106]:
#2-2 단일 분류 트리와 gbm 

X = df.drop(columns=['CARRIER', 'DEST', 'FL_DATE', 'ORIGIN', 'TAIL_NUM','new_dep_time', 'Flight_Status'])
y = df['Flight_Status']

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size= 0.4, random_state=1)

d_tree = DecisionTreeClassifier()
d_tree.fit(train_X, train_y)

classificationSummary(valid_y, d_tree.predict(valid_X))


gbc = GradientBoostingClassifier(random_state=3)
gbc.fit(train_X, train_y)
pred = gbc.predict(valid_X)
accuracy = accuracy_score(valid_y, pred)
print('그래디언트 부스팅 정확도: {: .4f}'.format(accuracy))

Confusion Matrix (Accuracy 0.8411)

       Prediction
Actual   0   1
     0 100  67
     1  73 641
그래디언트 부스팅 정확도:  0.8933
