### Use this dataset of airline arrival information to predict how late flights will be. A flight only counts as late if it is more than 30 minutes late.

In [15]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df=pd.read_csv('2007.csv')

In [3]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2007,1,1,1,1232.0,1225,1341.0,1340,WN,2891,...,4,11,0,,0,0,0,0,0,0
1,2007,1,1,1,1918.0,1905,2043.0,2035,WN,462,...,5,6,0,,0,0,0,0,0,0
2,2007,1,1,1,2206.0,2130,2334.0,2300,WN,1229,...,6,9,0,,0,3,0,0,0,31
3,2007,1,1,1,1230.0,1200,1356.0,1330,WN,1355,...,3,8,0,,0,23,0,0,0,3
4,2007,1,1,1,831.0,830,957.0,1000,WN,2278,...,3,9,0,,0,0,0,0,0,0


In [4]:
df.shape

(7453215, 29)

In [5]:
df['on_time'] = np.where(df['ArrDelay']<=30, 1, 0)
df['late'] = np.where(df['ArrDelay']>30, 1, 0)

In [6]:
df.drop(['UniqueCarrier','TailNum','Origin','Dest','Year','CancellationCode'],axis=1, inplace=True)

In [7]:
null_ctr=df.isnull().sum()
null_ctr

Month                     0
DayofMonth                0
DayOfWeek                 0
DepTime              160748
CRSDepTime                0
ArrTime              177927
CRSArrTime                0
FlightNum                 0
ActualElapsedTime    177927
CRSElapsedTime          994
AirTime              177927
ArrDelay             177927
DepDelay             160748
Distance                  0
TaxiIn                    0
TaxiOut                   0
Cancelled                 0
Diverted                  0
CarrierDelay              0
WeatherDelay              0
NASDelay                  0
SecurityDelay             0
LateAircraftDelay         0
on_time                   0
late                      0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(7275288, 25)

In [10]:
y = df['ArrDelay']
X = df.drop('ArrDelay', 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [11]:
# Linear Regression
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print('Training set score:', regr.score(X_train, y_train))
print('\nR-squared:')
print(regr.score(X_test, y_test))

Training set score: 0.999999989810665

R-squared:
0.9999999850116698


In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e9)
y_train_lr = np.where(y_train > 0, 1, 0)
y_test_lr = np.where(y_test > 0, 1, 0)
fit = lr.fit(X_train, y_train_lr)
print('Training set score:', lr.score(X_train, y_train_lr))
print('\nR-squared:')
print(lr.score(X_test, y_test_lr))



Training set score: 1.0

R-squared:
1.0


In [18]:
train_samp = df.sample(1000)
y = train_samp['ArrDelay']
X = train_samp.drop('ArrDelay', 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [20]:
# Ridge Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e9)
y_train_lr = np.where(y_train > 0, 1, 0)
y_test_lr = np.where(y_test > 0, 1, 0)
fit = lr.fit(X_train, y_train_lr)
print('Training set score:', lr.score(X_train, y_train_lr))
print('\nR-squared:')
print(lr.score(X_test, y_test_lr))

Training set score: 1.0

R-squared:
0.992




In [21]:
# Lasso Regression
from sklearn import linear_model
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, y_train_lr)
print('R² for the model with few features:')
print(lass.score(X_train, y_train_lr))
#origparams = np.append(lassfit.coef_, lassfit.intercept_)
#print('\nParameter estimates for the model with few features:')
#print(origparams)
print(lass.score(X_test, y_test))

R² for the model with few features:
0.4322995089558812
-0.04212809454175481


In [22]:
# SVM
from sklearn.svm import SVC
svc=SVC()
svc.fit(X_train, y_train)
print('Training set score:', svc.score(X_train, y_train))
print('\nR-squared:')
print(svc.score(X_test, y_test))



Training set score: 1.0

R-squared:
0.028


In [23]:
# Random Forest
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train, y_train)
print('Training set score:', rfc.score(X_train, y_train))
print('\nR-squared:')
print(rfc.score(X_test, y_test))

Training set score: 0.9973333333333333

R-squared:
0.08




In [24]:
# Gradient Boosting
params = {'n_estimators': 100,
          'max_depth': 4,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
print('Training set score:', clf.score(X_train, y_train))
print('\nR-squared:')
print(clf.score(X_test, y_test))

Training set score: 1.0

R-squared:
0.044


In [13]:
# Xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier(max_depth=100, n_estimators=50, colsample_bytree=0.2, colsample_bylevel=0.1, colsample_bynode=0.1)
xgb.fit(X_train, y_train)
print('Training set score:', xgb.score(X_train, y_train))
print('\nTest set score:', xgb.score(X_test, y_test))



OSError: [WinError -529697949] Windows Error 0xe06d7363