Here we train our initial machine learning model and then implement gradual training.

In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
import random
import numpy as np
import gc

In [2]:
# Carrier codes the airline encoder is fitted on. Some codes appear more than
# once; the list is kept exactly as-is so the printed transform is unchanged.
airline = [
    'TZ', 'FL', 'AS', 'AQ', 'HP', 'AA', 'MQ', 'DH', 'EV', 'CO', 'DL', '9E', 'MQ', 'EV',
    'XE', 'F9', 'HA', 'DH', 'B6', 'YV', 'NW', '9E', 'OO', 'WN', 'NK', 'US', 'UA', 'VX',
]

# LabelEncoder.fit returns the encoder itself, so construction and fitting chain.
airlineEncoder = LabelEncoder().fit(airline)

# Persist the fitted encoder (pickle protocol 2) and show the integer code
# assigned to each entry of the list above.
joblib.dump(airlineEncoder, "Airline_Encoder.pkl", protocol=2)
airline_codes = airlineEncoder.transform(airline)
print(airline_codes)

[17 10  3  2 12  1 13  6  8  5  7  0 13  8 22  9 11  6  4 23 15  0 16 21 14
 19 18 20]


In [3]:
#Things to Label Encode... All Airports, All Tail numbers
airports = pd.read_csv("Airport_to_Weather_Stations.txt")["LocationID"].tolist() + ['FCA', 'PFN', 'BKG','CLD', 'HDN', 'HHH', 'MQT', 'AZA', 'ROP', 'SPN', 
                'UST', 'SCE', 'UTM', 'YAP', 'YUM']
airportEncoder = LabelEncoder()
airportEncoder.fit(airports)
joblib.dump(airportEncoder, "Airport_Encoder.pkl", protocol = 2)
print(airports)


['ADK', 'ANC', 'ANI', 'BRW', 'BET', 'CDV', 'SCC', 'DLG', 'FAI', 'GST', 'JNU', 'KTN', 'AKN', 'ADQ', 'OTZ', 'OME', 'PSG', 'SIT', 'KSM', 'DUT', 'WRG', 'YAK', 'BHM', 'DHN', 'HSV', 'MOB', 'MGM', 'XNA', 'FSM', 'LIT', 'TXK', 'PPG', 'IFP', 'FLG', 'GCN', 'PHX', 'TUS', 'ACV', 'BFL', 'BUR', 'CIC', 'CCR', 'CEC', 'FAT', 'IPL', 'IYK', 'LGB', 'LAX', 'MMH', 'MOD', 'MRY', 'OAK', 'ONT', 'OXR', 'PSP', 'PMD', 'RDD', 'SMF', 'SAN', 'SFO', 'SJC', 'SBP', 'SNA', 'SBA', 'SMX', 'STS', 'TVL', 'SCK', 'VIS', 'ASE', 'COS', 'DEN', 'DRO', 'EGE', 'GJT', 'GUC', 'MTJ', 'PUB', 'TEX', 'HVN', 'BDL', 'DCA', 'IAD', 'ILG', 'DAB', 'FLL', 'RSW', 'GNV', 'JAX', 'EYW', 'MTH', 'MLB', 'MIA', 'APF', 'MCO', 'ECP', 'PNS', 'PGD', 'SRQ', 'PIE', 'TLH', 'TPA', 'VPS', 'PBI', 'ABY', 'ATL', 'AGS', 'BQK', 'CSG', 'MCN', 'SAV', 'VLD', 'GUM', 'ITO', 'HNL', 'OGG', 'KOA', 'MKK', 'LNY', 'LIH', 'CID', 'DSM', 'DBQ', 'SUX', 'ALO', 'BOI', 'SUN', 'IDA', 'LWS', 'PIH', 'TWF', 'BMI', 'CMI', 'MDW', 'ORD', 'RFD', 'MLI', 'PIA', 'SPI', 'EVV', 'FWA', 'IND', 'SBN'

In [4]:
#Encode TailNumbers
TailNumbers = pd.read_csv("Final_Tail_Number_List.txt", header = None)[0].tolist()
TailEncoder = LabelEncoder()
TailEncoder.fit(TailNumbers)
joblib.dump(TailEncoder, "Tail_Number_Encoder.pkl", protocol = 2)
print(TailEncoder.transform(TailNumbers))

[7332 8265 6797 ..., 7758 4970 3054]


In [20]:
# Draw a uniform random sample of `s` rows from the historical data file and
# save it as a CSV for later use.
filename = "Final_historical_data.csv"

# The row count is hard-coded; it can be recomputed with:
#   with open(filename) as f:
#       n = sum(1 for _ in f)
n = 34962162  # number of rows in the file
s = 1000000   # desired sample size

# Column names are supplied explicitly via `names=` (no header row is read),
# so the data rows are indexed 0 .. n-1.
sample_columns = ["Airline", "Year", "Month", "Day",
                  "Tail", "Source", "Destination", "Departure_hour", "Departure_minute",
                  "Delay", "Station", "Previous_Delay", 'PRCP', 'SNOW', 'TMAX', 'TMIN']

# Choose the n - s row indices to drop so that exactly s rows are loaded.
# Sampling from range(n) rather than range(0, n + 1) matters: index n does not
# exist in an n-row file, and including it left s + 1 candidate rows instead of s.
skip = sorted(random.sample(range(n), n - s))
df = pd.read_csv(filename, skiprows=skip, names=sample_columns)

df.to_csv("Random_Sample_data.csv")

In [19]:
# Replace each categorical column of the sample with the integer codes produced
# by its fitted encoder (airports share one encoder for both endpoints).
for column, encoder in (
    ("Airline", airlineEncoder),
    ("Source", airportEncoder),
    ("Destination", airportEncoder),
    ("Tail", TailEncoder),
):
    df[column] = encoder.transform(df[column])

# Binary target: True where the recorded Delay value is greater than 10.
df['Y'] = df['Delay'].gt(10)


In [25]:
# Baseline model: logistic regression on the encoded sample, predicting the
# boolean target column 'Y'.
lr_features = [
    "Airline", "Year", "Month", "Day", "Source",
    "Destination", "Departure_hour", "Departure_minute",
    "Previous_Delay", 'PRCP', 'SNOW', 'TMAX', 'TMIN',
]
LR = LogisticRegression()
LR.fit(df[lr_features], df['Y'])

# Persist the fitted model; as the last expression, the list of written files is displayed.
joblib.dump(LR, "LR.pkl")

['LR.pkl']

In [5]:
# Train an SGD logistic-regression model over the full historical file in
# randomly drawn batches, saving the model to disk after every batch.

# Fixed lower/upper bounds used for min-max scaling. Note that some keys differ
# from the dataframe column names (Tail_Number/depAirport/arrAirport); the
# mapping is given in `scaling_keys` below.
minimums = {"Airline": 0.0, "Year": 2005, "Month": 1, "Tail_Number": 0, "depAirport": 0, "arrAirport": 0,
            "Departure_hour": 0, "Departure_minute": 0, 'PRCP': 0, 'SNOW': 0, 'TMAX': -367, 'TMIN': -439}
maximums = {"Airline": 23, "Year": 2017, "Month": 12, "Tail_Number": 9128, "depAirport": 395, "arrAirport": 395,
            "Departure_hour": 23, "Departure_minute": 59, 'PRCP': 3335, 'SNOW': 465, 'TMAX': 578, 'TMIN': 556}

# dataframe column -> key of its bounds in minimums/maximums
scaling_keys = {
    "Airline": "Airline", "Year": "Year", "Month": "Month",
    "Tail": "Tail_Number", "Source": "depAirport", "Destination": "arrAirport",
    "Departure_hour": "Departure_hour", "Departure_minute": "Departure_minute",
    'PRCP': 'PRCP', 'SNOW': 'SNOW', 'TMAX': 'TMAX', 'TMIN': 'TMIN',
}

filename = "Final_historical_data.csv"
n_rows = 34962162      # number of rows in the file
batch_size = 500000    # rows loaded and fitted per iteration
column_names = ["Airline", "Year", "Month", "Day",
                "Tail", "Source", "Destination", "Departure_hour", "Departure_minute",
                "Delay", "Station", "Previous_Delay", 'PRCP', 'SNOW', 'TMAX', 'TMIN']
# Columns fed to the model. "Tail" is scaled below but is not part of this list.
feature_columns = ["Airline", "Year", "Month", "Source",
                   "Destination", "Departure_hour", "Departure_minute", 'PRCP', 'SNOW', 'TMAX', 'TMIN']

shuffle = np.random.permutation(n_rows)  # random visiting order over all row indices
print("Randomization order complete.")

# warm_start=True makes every fit() call start from the coefficients left by
# the previous call instead of re-initialising them.
model = SGDClassifier(loss = 'log', n_jobs = -1, warm_start = True)

# keep[r] is True only for the rows belonging to the current batch.
keep = np.zeros(n_rows, dtype=bool)
n_batches = (n_rows + batch_size - 1) // batch_size  # ceiling division; last batch may be short

for i in range(n_batches):
    batch_rows = shuffle[i * batch_size:(i + 1) * batch_size]
    keep[:] = False
    keep[batch_rows] = True

    # `skiprows` names the rows to EXCLUDE. Passing the batch's own indices (as
    # this cell used to) loaded everything except the batch, i.e. ~n_rows -
    # batch_size rows per iteration. The callable skips every row that is not
    # in the current batch; indices past n_rows are skipped defensively.
    df = pd.read_csv(filename,
                     skiprows=lambda row: row >= n_rows or not keep[row],
                     names=column_names)
    print(i, "loaded pd dataframe")

    # Encode categoricals with the previously fitted encoders and build the target.
    df["Airline"] = airlineEncoder.transform(df["Airline"])
    df['Source'] = airportEncoder.transform(df['Source'])
    df['Destination'] = airportEncoder.transform(df['Destination'])
    df['Tail'] = TailEncoder.transform(df['Tail'])
    df['Y'] = df['Delay'] > 10

    # Min-max normalisation with the fixed bounds above.
    for column, key in scaling_keys.items():
        df[column] = (df[column] - minimums[key]) / (maximums[key] - minimums[key])

    model.fit(df[feature_columns], df['Y'])
    # Checkpoint after each batch so an interrupted run keeps its progress.
    joblib.dump(model, "Batch_trained_model3.pkl", protocol = 2)
    print("Fitted lines", min((i + 1) * batch_size, n_rows))

    # Release the batch before loading the next one.
    del df
    gc.collect()

    

Randomization order complete.
0 loaded pd dataframe
Fitted lines 500000
1 loaded pd dataframe
Fitted lines 1000000
2 loaded pd dataframe
Fitted lines 1500000
3 loaded pd dataframe
Fitted lines 2000000
4 loaded pd dataframe
Fitted lines 2500000
5 loaded pd dataframe
Fitted lines 3000000
6 loaded pd dataframe
Fitted lines 3500000
7 loaded pd dataframe
Fitted lines 4000000
8 loaded pd dataframe
Fitted lines 4500000
9 loaded pd dataframe
Fitted lines 5000000
10 loaded pd dataframe
Fitted lines 5500000
11 loaded pd dataframe
Fitted lines 6000000
12 loaded pd dataframe
Fitted lines 6500000
13 loaded pd dataframe
Fitted lines 7000000
14 loaded pd dataframe
Fitted lines 7500000
15 loaded pd dataframe
Fitted lines 8000000
16 loaded pd dataframe
Fitted lines 8500000
17 loaded pd dataframe
Fitted lines 9000000
18 loaded pd dataframe
Fitted lines 9500000
19 loaded pd dataframe
Fitted lines 10000000
20 loaded pd dataframe
Fitted lines 10500000
21 loaded pd dataframe
Fitted lines 11000000
22 loaded 

In [None]:
# Column-wise minimum and maximum over the numeric columns of `df`.
# NOTE(review): `df` is not defined in this cell and the batch-training loop
# deletes it at the end of every iteration, so this relies on a dataframe
# still being present in the kernel -- confirm which one is intended.
gc.collect()
minimum, maximum = df.min(numeric_only=True), df.max(numeric_only=True)
print(minimum, maximum)

In [8]:
# Load a previously saved batch-trained model and write it back out under a
# second file name using pickle protocol 2. As the last expression, the list
# of written files is displayed.
source_pkl, target_pkl = "Batch_trained_model.pkl", "Batch_train_model2.pkl"
model = joblib.load(source_pkl)
joblib.dump(model, target_pkl, protocol=2)

['Batch_train_model2.pkl']

In [6]:

# Pair each feature name with its learned coefficient (first row of coef_),
# in the same order as the feature list.
keys = [
    "Airline", "Year", "Month", "Source", "Destination",
    "Departure_hour", "Departure_minute",
    'PRCP', 'SNOW', 'TMAX', 'TMIN',
]
values = model.coef_[0]
d = {name: weight for name, weight in zip(keys, values)}
print(d)

{'Airline': 0.057053980414945346, 'Year': -0.049124588862196866, 'Month': -0.014941928825090905, 'Source': -0.16963852127306547, 'Destination': -0.029645540205650391, 'Departure_hour': 2.0954838630434307, 'Departure_minute': 0.19009012930179284, 'PRCP': 3.6506510700692814, 'SNOW': 1.586307027576932, 'TMAX': -1.7795258342986453, 'TMIN': 1.2848025970450985}
