In [1]:
import pandas as pd
import seaborn as sns

## Data processing

In [2]:
# Allow up to 50 columns in rich displays so the preview below is not truncated.
pd.options.display.max_columns = 50

# Load the raw delayed-flights data and preview the first rows.
df = pd.read_csv("DelayedFlights.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,378,N726SW,101.0,115.0,87.0,11.0,25.0,IND,JAX,688,4.0,10.0,0,N,0,,,,,


In [3]:
# Drop the "Unnamed: 0" column, which is only a leftover row index.
# Re-assigning the result (instead of inplace=True) with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Baseline used later to report how much data survives the clean-up.
# Note: DataFrame.size counts cells (rows x columns), not rows.
init_size = df.size
init_size

56165982

In [5]:
# Inspect the dtype pandas inferred for every column, to spot variables that
# were read with the wrong type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 29 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Year               int64  
 1   Month              int64  
 2   DayofMonth         int64  
 3   DayOfWeek          int64  
 4   DepTime            float64
 5   CRSDepTime         int64  
 6   ArrTime            float64
 7   CRSArrTime         int64  
 8   UniqueCarrier      object 
 9   FlightNum          int64  
 10  TailNum            object 
 11  ActualElapsedTime  float64
 12  CRSElapsedTime     float64
 13  AirTime            float64
 14  ArrDelay           float64
 15  DepDelay           float64
 16  Origin             object 
 17  Dest               object 
 18  Distance           int64  
 19  TaxiIn             float64
 20  TaxiOut            float64
 21  Cancelled          int64  
 22  CancellationCode   object 
 23  Diverted           int64  
 24  CarrierDelay       float64
 25  WeatherDelay      

La variable FlightNum se ha leído como numérica (int64), pero es un identificador del vuelo y, por lo tanto, una variable categórica. Lo mismo ocurre con Cancelled y Diverted.

In [6]:
# FlightNum, Cancelled and Diverted are identifiers/flags rather than quantities:
# re-cast them to object so they are handled as categorical variables.
df = df.astype({"FlightNum": object, "Cancelled": object, "Diverted": object})

In [7]:
# Count the missing values in each column.
df.isna().sum()

Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
DepTime                   0
CRSDepTime                0
ArrTime                7110
CRSArrTime                0
UniqueCarrier             0
FlightNum                 0
TailNum                   5
ActualElapsedTime      8387
CRSElapsedTime          198
AirTime                8387
ArrDelay               8387
DepDelay                  0
Origin                    0
Dest                      0
Distance                  0
TaxiIn                 7110
TaxiOut                 455
Cancelled                 0
CancellationCode          0
Diverted                  0
CarrierDelay         689270
WeatherDelay         689270
NASDelay             689270
SecurityDelay        689270
LateAircraftDelay    689270
dtype: int64

Existen varios campos con valores nulos. Dado que partimos de un dataset muy grande, procedemos a eliminar las filas que contengan algún campo con valor nulo.

In [8]:
# Remove every row that has a null in at least one column.
# NOTE(review): nearly all removed rows come from the five *Delay cause columns
# (689,270 nulls each in the count above), so this presumably keeps only flights
# with a reported delay cause -- confirm that is intended for the later prediction.
df = df.dropna()

# Check that no null values remain in any column
df.isna().sum()

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
CancellationCode     0
Diverted             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
dtype: int64

In [9]:
# Number of cells (rows x columns) left after dropping the rows with nulls;
# compared against init_size in the next cell.
final_size = df.size
final_size

36177094

In [10]:
# Report the share of the original cells that survived the null-row removal.
print(f"Nos hemos quedado con un {round(final_size / init_size * 100, 4)}% de los datos")

Nos hemos quedado con un 64.411% de los datos


In [11]:
# Drop fully duplicated rows, then show the remaining number of cells
# (rows x columns) to see how many were removed.
df = df.drop_duplicates()
df.size

36177036

Prácticamente no hay duplicados: solo se han eliminado 2 filas (58 celdas = 2 filas × 29 columnas).

## Análisis de las variables

In [12]:
# Categorical variables: number of distinct values in each object-dtype column.
df.select_dtypes('object').nunique()

UniqueCarrier         20
FlightNum           7481
TailNum             5349
Origin               302
Dest                 301
Cancelled              1
CancellationCode       1
Diverted               1
dtype: int64

Cancelled, CancellationCode y Diverted pueden ser eliminadas, ya que solo toman un único valor y, por lo tanto, no aportan ninguna información.

In [13]:
# Drop the columns that hold a single unique value and so carry no information.
# Re-assigning the result (instead of inplace=True) with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=["Cancelled", "CancellationCode", "Diverted"], errors="ignore")

In [14]:
# Numeric variables: summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0,1247484.0
mean,2008.0,6.065405,15.7254,3.980082,1558.831,1487.948,1616.748,1652.457,135.3779,131.7641,107.4204,63.29107,59.67723,741.5869,7.297229,20.66035,19.17942,3.703361,15.02161,0.09013743,25.29653
std,0.0,3.508936,8.793001,1.993271,454.3301,421.1782,583.9417,461.7373,72.29641,69.53206,68.17459,60.75384,59.86886,559.3648,6.033244,16.67984,43.54626,21.49155,33.83309,2.022717,42.05491
min,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,14.0,-21.0,0.0,15.0,6.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2008.0,3.0,8.0,2.0,1232.0,1150.0,1326.0,1340.0,83.0,80.0,58.0,26.0,24.0,334.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0
50%,2008.0,6.0,16.0,4.0,1618.0,1529.0,1737.0,1722.0,118.0,115.0,90.0,43.0,41.0,595.0,6.0,16.0,2.0,0.0,2.0,0.0,8.0
75%,2008.0,9.0,23.0,6.0,1924.0,1830.0,2048.0,2022.0,167.0,161.0,136.0,79.0,75.0,972.0,8.0,24.0,21.0,0.0,15.0,0.0,33.0
max,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2359.0,1114.0,660.0,1091.0,2461.0,2467.0,4962.0,240.0,422.0,2436.0,1352.0,1357.0,392.0,1316.0


Year también puede ser eliminada, ya que todos los vuelos son del 2008.

In [15]:
# Year is constant (every flight is from 2008), so it carries no information.
# Re-assigning the result (instead of inplace=True) with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns="Year", errors="ignore")

In [16]:
# Pairwise correlation between the numeric variables.
# The frame still holds object columns (UniqueCarrier, FlightNum, TailNum,
# Origin, Dest). Older pandas silently skipped them in DataFrame.corr(), but
# pandas >= 2.0 raises, so the numeric columns are selected explicitly.
corr = df.select_dtypes("number").corr()
# Styler.set_precision() was removed in pandas 2.0; an explicit format string
# renders the same 4 decimals on both old and new pandas versions.
corr.style.background_gradient(cmap='coolwarm').format("{:.4f}")

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Month,1.0,0.067,0.0093,-0.0088,-0.0134,-0.002,-0.0018,0.004,0.0066,0.0001,0.0123,0.0154,0.0058,0.0239,0.0082,0.0004,0.0066,0.0114,-0.0031,0.005
DayofMonth,0.067,1.0,0.026,0.0034,0.0036,0.0034,0.0039,-0.0,0.0006,0.0004,0.0091,0.0099,-0.0002,-0.0014,-0.001,-0.0009,0.0009,0.0056,-0.0003,0.0091
DayOfWeek,0.0093,0.026,1.0,0.0214,0.0278,0.0073,0.0173,0.0045,0.0062,0.0058,0.0119,0.0138,0.009,0.012,-0.0085,0.0102,0.0056,-0.0066,0.0047,0.0088
DepTime,-0.0088,0.0034,0.0214,1.0,0.8403,0.3732,0.7142,-0.0568,-0.0415,-0.0586,0.094,0.1158,-0.0549,-0.0251,0.0024,-0.0519,0.0053,0.0225,-0.0153,0.1695
CRSDepTime,-0.0134,0.0036,0.0278,0.8403,1.0,0.2876,0.7256,-0.0404,-0.0221,-0.0373,0.0171,0.0405,-0.0252,-0.0433,-0.0067,-0.1073,-0.0093,-0.0327,-0.0171,0.1677
ArrTime,-0.002,0.0034,0.0073,0.3732,0.2876,1.0,0.481,-0.0283,-0.0293,-0.0319,-0.0792,-0.0802,-0.0442,0.0431,-0.0078,-0.084,-0.0299,0.0324,-0.0074,-0.0379
CRSArrTime,-0.0018,0.0039,0.0173,0.7142,0.7256,0.481,1.0,0.0345,0.0481,0.0298,0.025,0.0396,0.0323,0.0039,0.0263,-0.0962,-0.005,0.0092,-0.0133,0.1315
ActualElapsedTime,0.004,-0.0,0.0045,-0.0568,-0.0404,-0.0283,0.0345,1.0,0.9647,0.9691,0.0837,-0.0022,0.9444,0.1616,0.3151,0.01,0.0041,0.1771,0.0057,-0.0342
CRSElapsedTime,0.0066,0.0006,0.0062,-0.0415,-0.0221,-0.0293,0.0481,0.9647,1.0,0.9839,0.0316,0.0284,0.9813,0.0902,0.1273,0.0266,-0.0165,0.047,0.0061,-0.0116
AirTime,0.0001,0.0004,0.0058,-0.0586,-0.0373,-0.0319,0.0298,0.9691,0.9839,1.0,0.0275,0.0005,0.9783,0.074,0.0863,0.0195,-0.0183,0.0628,0.0072,-0.022


In [17]:
# TODO: add distribution plots for the categorical and numeric variables.

In [18]:
# Save the preprocessed data so it can be reused in the later prediction step.
# index=False leaves the row index out of the file.
# NOTE(review): CSV does not store dtypes, so the object cast applied to
# FlightNum will presumably have to be re-applied after reading this file back.
df.to_csv("DelayedFlightsPreprocessed.csv", index=False)