In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pylab as plt
from dmba import classificationSummary, gainsChart

In [4]:
delays_df = pd.read_csv("D:/dmba/FlightDelays.csv")
delays_df

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight Status
0,1455,OH,1455,JFK,184,01/01/2004,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,01/01/2004,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,01/01/2004,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,01/01/2004,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,01/01/2004,7792,IAD,0,4,1,N698BR,ontime
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,645,RU,644,EWR,199,1/31/2004,2761,DCA,0,6,31,N15555,ontime
2197,1700,RU,1653,EWR,213,1/31/2004,2497,IAD,0,6,31,N16976,ontime
2198,1600,RU,1558,EWR,199,1/31/2004,2361,DCA,0,6,31,N14902,ontime
2199,1359,RU,1403,EWR,199,1/31/2004,2216,DCA,0,6,31,N16961,ontime


In [5]:
delays_df.DAY_WEEK = delays_df.DAY_WEEK.astype('category')
delays_df.DAY_WEEK

0       4
1       4
2       4
3       4
4       4
       ..
2196    6
2197    6
2198    6
2199    6
2200    6
Name: DAY_WEEK, Length: 2201, dtype: category
Categories (7, int64): [1, 2, 3, 4, 5, 6, 7]

In [6]:
delays_df['Flight Status'] = delays_df['Flight Status'].astype('category')
delays_df['Flight Status']

0       ontime
1       ontime
2       ontime
3       ontime
4       ontime
         ...  
2196    ontime
2197    ontime
2198    ontime
2199    ontime
2200    ontime
Name: Flight Status, Length: 2201, dtype: category
Categories (2, object): ['delayed', 'ontime']

In [7]:
delays_df.CRS_DEP_TIME = [round(t/100) for t in delays_df.CRS_DEP_TIME]
delays_df.CRS_DEP_TIME = delays_df.CRS_DEP_TIME.astype('category')

In [8]:
predictors = ['DAY_WEEK', 'CRS_DEP_TIME', 'ORIGIN', 'DEST', 'CARRIER']
outcome = 'Flight Status'

In [9]:
X = pd.get_dummies(delays_df[predictors])
y = delays_df['Flight Status'].astype('category')
classes = list(y.cat.categories)

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)

In [11]:
delays_nb = MultinomialNB(alpha=0.01)
delays_nb.fit(X_train, y_train)

In [12]:
predProb_train = delays_nb.predict_proba(X_train)
predProb_valid = delays_nb.predict_proba(X_valid)

In [20]:
y_train_pred = delays_nb.predict(X_train)
y_valid_pred = delays_nb.predict(X_valid)

In [21]:
train_df, valid_df = train_test_split(delays_df, test_size=0.4, random_state=1)
print(train_df['Flight Status'].value_counts() / len(train_df))

Flight Status
ontime     0.802273
delayed    0.197727
Name: count, dtype: float64


In [27]:
df = train_df[['Flight Status', 'DAY_WEEK']]
df

Unnamed: 0,Flight Status,DAY_WEEK
1215,ontime,7
1476,ontime,3
1897,ontime,2
83,ontime,5
1172,delayed,6
...,...,...
960,ontime,3
905,ontime,2
1096,ontime,5
235,ontime,1


In [22]:
for predictor in predictors:
    df = train_df[['Flight Status', predictor]]
    freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)
    propTable = freqTable.apply(lambda x: x / sum(x), axis=1)
    print(propTable)


DAY_WEEK              1         2         3         4         5         6  \
Flight Status                                                               
delayed        0.191571  0.149425  0.114943  0.126437  0.187739  0.068966   
ontime         0.124646  0.141643  0.144476  0.179415  0.169027  0.135977   

DAY_WEEK              7  
Flight Status            
delayed        0.160920  
ontime         0.104816  
CRS_DEP_TIME          6         7         8         9        10        11  \
Flight Status                                                               
delayed        0.034483  0.053640  0.065134  0.019157  0.030651  0.011494   
ontime         0.062323  0.063267  0.084986  0.056657  0.051936  0.033994   

CRS_DEP_TIME         12        13        14        15        16        17  \
Flight Status                                                               
delayed        0.049808  0.045977  0.038314  0.203065  0.072797  0.153257   
ontime         0.066100  0.074599  0.057602  0.

  freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)
  freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)
  freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)
  freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)
  freqTable = df.pivot_table(index='Flight Status', columns=predictor, aggfunc=len)


In [23]:
#나이브 베이즈를 사용해 데이터의 점수를 구해보자

In [24]:
df = pd.concat([pd.DataFrame({'actual':y_valid, 'predicted':y_valid_pred}), pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)
mask = ((X_valid.CARRIER_DL == 1) & (X_valid.DAY_WEEK_7 == 1) & (X_valid.CRS_DEP_TIME_10 == 1) & (X_valid.DEST_LGA == 1) & (X_valid.ORIGIN_DCA == 1)) 

df[mask]

Unnamed: 0,actual,predicted,0,1
1225,ontime,ontime,0.057989,0.942011


In [25]:
classificationSummary(y_train, y_train_pred, class_names=classes)
classificationSummary(y_valid, y_valid_pred, class_names=classes)

Confusion Matrix (Accuracy 0.7955)

        Prediction
 Actual delayed  ontime
delayed      52     209
 ontime      61     998
Confusion Matrix (Accuracy 0.7821)

        Prediction
 Actual delayed  ontime
delayed      26     141
 ontime      51     663
