In [1]:
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the flights table from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='flights.csv')

In [3]:
# Preview five randomly drawn rows. No random_state is passed, so the rows
# shown differ from run to run.
df.sample(n=5)

Unnamed: 0,year,month,day,dep_delay,arr_delay,origin,dest,flight,tailnum,distance,air_time,carrier,name,date
273082,2013,7,24,0.0,20.0,JFK,DEN,97,N657JB,1626,223.0,B6,JetBlue Airways,2013-07-24
336434,2013,9,30,-3.0,-8.0,JFK,BUF,2302,N273JB,301,56.0,B6,JetBlue Airways,2013-09-30
334015,2013,9,27,-5.0,-23.0,LGA,CLE,3674,N532MQ,419,60.0,MQ,Envoy Air,2013-09-27
160186,2013,3,26,-9.0,-14.0,JFK,DCA,4410,N856MQ,213,39.0,MQ,Envoy Air,2013-03-26
316818,2013,9,9,1.0,-20.0,JFK,SJU,703,N656JB,1598,195.0,B6,JetBlue Airways,2013-09-09


In [4]:
# Identify numerical columns.
# 'number' selects every numeric dtype rather than only int64/float64, so
# narrower or unsigned numeric columns are not silently left out of the
# missing-value check below.
numerical_columns = df.select_dtypes(include='number').columns.tolist()
print("Numerical columns:", numerical_columns)

# Report missing values for the numerical columns only.
# The null counts are computed once for all columns; columns without any
# missing value are not printed.
print("\nMissing values in numerical columns:")
total_rows = len(df)
missing_counts = df[numerical_columns].isnull().sum()
for col, missing_count in missing_counts.items():
    if missing_count > 0:
        print(f"{col}: {missing_count} missing values ({missing_count/total_rows*100:.2f}%)")


Numerical columns: ['year', 'month', 'day', 'dep_delay', 'arr_delay', 'flight', 'distance', 'air_time']

Missing values in numerical columns:
dep_delay: 8255 missing values (2.45%)
arr_delay: 9430 missing values (2.80%)
air_time: 9430 missing values (2.80%)


In [5]:
# Median imputation for the numerical columns.
# Show the per-column null counts of the untouched frame first.
print("Before imputation - Missing values:")
print(df[numerical_columns].isna().sum())

# Work on a copy so that `df` keeps the original (un-imputed) values.
df_imputed = df.copy()

# Fill each numerical column that has gaps with that column's own median;
# columns without missing values are skipped.
for col in numerical_columns:
    if not df_imputed[col].isna().any():
        continue
    median_value = df_imputed[col].median()
    df_imputed[col] = df_imputed[col].fillna(median_value)
    print(f"Imputed {col} with median: {median_value}")

# Show the per-column null counts again, this time for the imputed copy.
print("\nAfter imputation - Missing values:")
print(df_imputed[numerical_columns].isna().sum())


Before imputation - Missing values:
year            0
month           0
day             0
dep_delay    8255
arr_delay    9430
flight          0
distance        0
air_time     9430
dtype: int64
Imputed dep_delay with median: -2.0
Imputed arr_delay with median: -5.0
Imputed air_time with median: 129.0

After imputation - Missing values:
year         0
month        0
day          0
dep_delay    0
arr_delay    0
flight       0
distance     0
air_time     0
dtype: int64
