Here we implement a model to study the data

In [1]:
import pandas as pd
import numpy as np

df=pd.read_csv('2013.csv')

# Data formatting

Firstly we should drop the unnecessary column

In [2]:
df = df.drop(columns=['Unnamed: 27'], errors='ignore') # Removing the useless column

We standardize numeric formats to int:

In [3]:
for column in df.columns:
    if df[column].dtype == 'float64':
        # Convert float to int by rounding
        df[column] = df[column].round().astype('Int64')


Next we change our two flag attributes from int to boolean format (boolean is usually preferred in ML applications):

In [4]:
# Convert CANCELLED and DIVERTED to boolean
boolean_columns = ['CANCELLED', 'DIVERTED']
for col in boolean_columns:
    df[col] = df[col].astype(bool)

Next we find the attributes that are in the military time format

In [5]:
import pandas as pd
import numpy as np

def identify_military_time_columns(df):
    def is_military_time_format(series):
        
        # Convert to numeric, coercing errors to NaN
        numeric_series = pd.to_numeric(series, errors='coerce')
        
        # Filter out non-numeric columns
        if numeric_series.isnull().all():
            return False
        
        range_values = series[(series >= 60) & (series < 100)]

        if range_values.any():
            return False
        
        return True

    print("Columns in military time format:")
    military_time_columns = []
    
    for column in df.columns:
        if is_military_time_format(df[column]):
            print(f"- {column}")
            military_time_columns.append(column)
    
    if not military_time_columns:
        print("No columns found in military time format.")
    
    return military_time_columns

# Example usage
result = identify_military_time_columns(df)

Columns in military time format:
- CRS_DEP_TIME
- DEP_TIME
- WHEELS_OFF
- WHEELS_ON
- CRS_ARR_TIME
- ARR_TIME
- CANCELLED
- DIVERTED


Next we convert the military time format columns to regular minutes, E.G : 1315 = 13 * 60 + 15, continous numeric features are preferable

In [6]:
def convert_military_time_to_minutes(df, military_time_columns):
    def military_to_minutes(time_val):
        # Convert military time to total minutes
        if pd.isna(time_val):
            return np.nan
        time_str = str(int(time_val)).zfill(4)
        hours = int(time_str[:2])
        minutes = int(time_str[2:])
        return int(hours * 60 + minutes)

    for col in military_time_columns:
        df[col] = df[col].apply(military_to_minutes)
    
    return df

df = convert_military_time_to_minutes(df,['CRS_DEP_TIME','DEP_TIME','WHEELS_OFF','WHEELS_ON','CRS_ARR_TIME','ARR_TIME'])

consider which attributes could be unnecessary (they are expressed through other attributes already.)

In [10]:
for column in df.columns:
    if df[column].dtype == 'float64':
        # Convert float to int by rounding
        df[column] = df[column].round().astype('Int64')


In [12]:
print(df.loc[101])

FL_DATE                2013-01-01
OP_CARRIER                     VX
OP_CARRIER_FL_NUM             784
ORIGIN                        SEA
DEST                          LAX
CRS_DEP_TIME                  765
DEP_TIME                      767
DEP_DELAY                       2
TAXI_OUT                       12
WHEELS_OFF                    779
WHEELS_ON                     898
TAXI_IN                         9
CRS_ARR_TIME                  920
ARR_TIME                      907
ARR_DELAY                     -13
CANCELLED                       0
CANCELLATION_CODE             NaN
DIVERTED                        0
CRS_ELAPSED_TIME              155
ACTUAL_ELAPSED_TIME           140
AIR_TIME                      119
DISTANCE                      954
CARRIER_DELAY                <NA>
WEATHER_DELAY                <NA>
NAS_DELAY                    <NA>
SECURITY_DELAY               <NA>
LATE_AIRCRAFT_DELAY          <NA>
Name: 101, dtype: object


CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY, values of <NA> should be replaced with 0-s?



# Machine learning

Ideas to look into: 
- **Using only the data we know before the flight, to predict something about the flight**
- **Using data about the flight, to predict something about the delays, how long is some delay based on whole delay**



In [9]:

columns_to_int = [
    'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_DELAY',
    'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
    'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'
] # Selecting the columns that should be changed to int64
df[columns_to_int] = df[columns_to_int].astype('Int64') # Applying changes

binary_columns = [
    'CANCELLED', 'DIVERTED'
] # Selecting the columns that should be changed to binary

df[binary_columns] = df[binary_columns].astype('Int64') # Applying changes