In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from keras.layers import Dense, Activation
from keras.models import Sequential
import tensorflow as tf
import keras
from keras.regularizers import l2
from tensorflow.keras import regularizers

In [None]:
# Load both data sets from CSV files in the working directory.
# Per the original note on the training file: extract the zip file first.
flights_train = pd.read_csv('flights_train.csv')
flights_test = pd.read_csv('flights_test.csv')

In [None]:
# Remove the 'Unnamed: 0' column from both frames (presumably a stale index
# column written by an earlier `to_csv` -- confirm against the data source).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
flights_train = flights_train.drop(columns=['Unnamed: 0'], errors='ignore')
flights_test = flights_test.drop(columns=['Unnamed: 0'], errors='ignore')

# Linear Regression (baseline)

In [None]:
# Baseline: fit an ordinary least-squares model on several resamples of the
# training data and collect train/test MSE and test R^2 for each round.
MSE_arr_test = []
MSE_arr_train = []
r2score = []
for x in range(10):  # Increase number of loops to test the regressor on different splits
    # Seed the resample with the loop index: every iteration still sees a
    # different sample, but a Restart & Run All reproduces the same numbers.
    # NOTE(review): sampling with replacement *before* the split means a
    # duplicated row can end up in both train and test, which can make the test
    # scores optimistic -- confirm this is intended.
    df = flights_train.sample(frac=0.5, replace=True, random_state=x)
    y = df.ARRIVAL_DELAY
    X = df.drop(columns=['ARRIVAL_DELAY'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    reg = LinearRegression()
    reg = reg.fit(X_train, y_train)

    # scikit-learn metrics expect (y_true, y_pred) in that order.
    MSE_arr_test.append(mean_squared_error(y_test, reg.predict(X_test)))
    MSE_arr_train.append(mean_squared_error(y_train, reg.predict(X_train)))
    r2score.append(reg.score(X_test, y_test))

    print(x)  # progress indicator

In [None]:
# Summarise the linear-regression rounds by averaging each collected metric.
# Original author's remark on these numbers: "Not linear data".
for label, scores in (
    ("Average train MSE of Linear Regression: ", MSE_arr_train),
    ("Average test MSE of Linear Regression: ", MSE_arr_test),
    ("Average R2 score of Linear Regression: ", r2score),
):
    print(label, np.mean(scores))

# Random Forest Regressor

In [None]:
# Random forest: same evaluation protocol as the baseline, on smaller (10%)
# resamples. The metric lists are reset so they only hold forest scores.
MSE_arr_test = []
MSE_arr_train = []
r2score = []

for x in range(10):  # Increase number of loops to test the regressor on different splits
    # Seed the resample with the loop index: every iteration still sees a
    # different sample, but a Restart & Run All reproduces the same numbers.
    # NOTE(review): sampling with replacement *before* the split means a
    # duplicated row can end up in both train and test, which can make the test
    # scores optimistic -- confirm this is intended.
    df = flights_train.sample(frac=0.1, replace=True, random_state=x)
    y = df.ARRIVAL_DELAY
    X = df.drop(columns=['ARRIVAL_DELAY'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    reg = RandomForestRegressor(n_estimators=30, max_depth=10, min_samples_split=7, random_state=0, n_jobs=-1)
    reg = reg.fit(X_train, y_train)

    # scikit-learn metrics expect (y_true, y_pred) in that order.
    MSE_arr_test.append(mean_squared_error(y_test, reg.predict(X_test)))
    MSE_arr_train.append(mean_squared_error(y_train, reg.predict(X_train)))
    r2score.append(reg.score(X_test, y_test))

    print(x)  # progress indicator

# `reg` now holds the forest fitted in the final iteration.

In [None]:
# Average the metrics collected in the random-forest loop.
# The labels previously said "Linear Regression" (copy-paste from the baseline
# section), which mislabelled the forest's scores.
print("Average train MSE of Random Forest: ", np.mean(MSE_arr_train))
print("Average test MSE of Random Forest: ", np.mean(MSE_arr_test))
print("Average R2 score of Random Forest: ", np.mean(r2score))

### Random Forest Prediction

In [None]:
# Predict on the test frame with whatever estimator `reg` currently refers to,
# then echo the resulting predictions as the cell output.
res = reg.predict(X=flights_test)
res

In [None]:
# Flatten the predictions to 1-D and write them out. reshape(-1) infers the
# length, so the export no longer breaks when the test set is not exactly
# 514384 rows long.
pd.Series(res.reshape(-1)).to_csv('answer_sheet_forest_best.csv')

# Neural Network

In [None]:
# Seeded 25% resample (with replacement) of the training data, separated into
# features and the ARRIVAL_DELAY target, then an 80/20 train/validation split.
df = flights_train.sample(frac=0.25, replace=True, random_state=1)
y = df['ARRIVAL_DELAY']
X = df.drop(columns=['ARRIVAL_DELAY'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fully connected regressor: seven identical 32-unit ReLU hidden layers, each
# with L1/L2 kernel regularisation and L2 activity regularisation, followed by
# a single linear output unit.
N_HIDDEN_LAYERS = 7
HIDDEN_UNITS = 32

model = Sequential()

for layer_idx in range(N_HIDDEN_LAYERS):
    # Only the first layer declares the input width. It is taken from the
    # training matrix instead of the previous hard-coded 341, so the cell keeps
    # working if the feature set changes.
    input_kwargs = {'input_dim': X_train.shape[1]} if layer_idx == 0 else {}
    model.add(Dense(
        units=HIDDEN_UNITS,
        kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
        activity_regularizer=regularizers.l2(1e-5),
        activation='relu',
        use_bias=True,
        bias_initializer='zeros',
        **input_kwargs,
    ))

# Output layer: one unit with the default (linear) activation.
model.add(Dense(units=1))

In [None]:
# Configure training: Adam optimiser minimising mean squared error.
model.compile(loss='mean_squared_error', optimizer='adam')

# Train for 500 epochs in mini-batches of 2500 rows.
# Original author's tip: use a GPU to train faster.
model.fit(X_train, y_train, batch_size=2500, epochs=500)

In [None]:
# Validation MSE of the network on the held-out split.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order.
mean_squared_error(y_test, model.predict(X_test))

###  Neural Network Predictions

In [None]:
# Predict with the neural network. This cell previously called `reg.predict`,
# i.e. the random forest from the earlier section, so the "NN" answer sheet
# did not contain the network's predictions.
res = model.predict(flights_test)
res

In [None]:
# Flatten the predictions to 1-D and write them out. reshape(-1) infers the
# length, so the export no longer breaks when the test set is not exactly
# 514384 rows long.
pd.Series(res.reshape(-1)).to_csv('answer_sheet_NN_best.csv')