In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [52]:
train  = pd.read_csv("train.csv")

In [53]:
features = ['Month', 'Estimated_Departure_Time', 'Origin_Airport_ID',
       'Destination_Airport_ID', 'Distance', 'Carrier_ID(DOT)', 'Flight_Time']

In [54]:
train['Delay'] = train['Delay'].replace({'Not_Delayed': 0, 'Delayed': 1})

In [58]:
# reference : https://www.kaggle.com/code/rinnqd/reduce-memory-usage/notebook
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [59]:
train = reduce_mem_usage(train)

Memory usage after optimization is: 14.31 MB
Decreased by 76.6%


In [60]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int8   
 1   Estimated_Departure_Time  1000000 non-null  int16  
 2   Origin_Airport_ID         1000000 non-null  int16  
 3   Destination_Airport_ID    1000000 non-null  int16  
 4   Distance                  1000000 non-null  float16
 5   Carrier_ID(DOT)           1000000 non-null  float16
 6   Flight_Time               1000000 non-null  float16
 7   Delay                     255001 non-null   float16
dtypes: float16(4), int16(3), int8(1)
memory usage: 14.3 MB


In [61]:
semi_train = train.dropna()

In [62]:
semi_test = train[train.isna().any(axis=1)].drop([target_variable], axis=1)


In [63]:
print(train['Delay'].value_counts().get(0., 0),train['Delay'].value_counts().get(1., 0))
# 4 : 1

210001 45000


In [64]:
from sklearn.cluster import MiniBatchKMeans

# Semi-supervised learning
mbk_model = MiniBatchKMeans(n_clusters=2)
mbk_model.fit(semi_train[features], semi_train[target_variable])
predictions = mbk_model.predict(semi_test[features])




In [65]:
print(list(predictions).count(0), list(predictions).count(1))

# 1 : 1

359120 385879


In [69]:
359120 + 385879


744999

In [66]:
from sklearn.mixture import GaussianMixture

# Semi-supervised learning
gm_model = GaussianMixture(n_components=2)
gm_model.fit(semi_train[features], semi_train[target_variable])
predictions3 = gm_model.predict(semi_test[features])


In [67]:
print(list(predictions3).count(0), list(predictions3).count(1))

# 4 : 1

590193 154806


In [68]:
591225 + 153774

744999

In [71]:
semi_test[target_variable] = predictions3

In [72]:
semi_test

Unnamed: 0,Month,Estimated_Departure_Time,Origin_Airport_ID,Destination_Airport_ID,Distance,Carrier_ID(DOT),Flight_Time,Delay
0,4,600,13851,12191,419.0,19392.0,97.625,0
1,8,740,13930,14869,1250.0,20304.0,164.000,1
2,9,1610,11057,12953,544.0,19808.0,115.000,0
3,7,905,12892,11618,2454.0,19984.0,510.000,1
4,1,900,14771,10157,250.0,20304.0,79.000,0
...,...,...,...,...,...,...,...,...
999995,9,936,13930,14100,678.0,19984.0,187.000,0
999996,5,920,11637,13487,223.0,20304.0,68.000,0
999997,6,800,13796,12191,1642.0,19392.0,340.000,1
999998,9,1613,10693,10397,214.0,19792.0,131.000,0


In [73]:
merged_df = pd.concat([semi_train, semi_test], axis=0)


In [75]:
merged_df.reset_index(drop=True, inplace=True)


In [76]:
merged_df

Unnamed: 0,Month,Estimated_Departure_Time,Origin_Airport_ID,Destination_Airport_ID,Distance,Carrier_ID(DOT),Flight_Time,Delay
0,4,1545,11618,11278,199.0,20448.0,46.34375,0.0
1,1,1742,11618,10721,200.0,19984.0,81.00000,0.0
2,6,1420,10821,11057,361.0,19392.0,90.00000,0.0
3,8,1730,11278,14122,204.0,20448.0,74.00000,1.0
4,1,1015,11042,11292,1201.0,19392.0,90.00000,0.0
...,...,...,...,...,...,...,...,...
999995,9,936,13930,14100,678.0,19984.0,187.00000,0.0
999996,5,920,11637,13487,223.0,20304.0,68.00000,0.0
999997,6,800,13796,12191,1642.0,19392.0,340.00000,1.0
999998,9,1613,10693,10397,214.0,19792.0,131.00000,0.0


In [77]:
merged_df.to_csv("train.csv", index=False)