In [142]:
# If you do not already have pytorch installed, first run:
# !pip install torch
import torch


In [143]:
from torch.utils.data import Dataset, DataLoader


class TabularDataset(Dataset):
  def __init__(self, data, cat_cols=None, output_col=None):
    """
    Characterizes a Dataset for PyTorch

    Parameters
    ----------

    data: pandas data frame
      The data frame object for the input data. It must
      contain all the continuous, categorical and the
      output columns to be used.

    cat_cols: List of strings
      The names of the categorical columns in the data.
      These columns will be passed through the embedding
      layers in the model. These columns must be
      label encoded beforehand. 

    output_col: string
      The name of the output variable column in the data
      provided.
    """

    self.n = data.shape[0]

    if output_col:
      self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    else:
      self.y =  np.zeros((self.n, 1))

    self.cat_cols = cat_cols if cat_cols else []
    self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]

    if self.cont_cols:
      self.cont_X = data[self.cont_cols].astype(np.float32).values
    else:
      self.cont_X = np.zeros((self.n, 1))

    if self.cat_cols:
      self.cat_X = data[cat_cols].astype(np.int64).values
    else:
      self.cat_X =  np.zeros((self.n, 1))

  def __len__(self):
    """
    Denotes the total number of samples.
    """
    return self.n

  def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
    return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [144]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class FeedForwardNN(nn.Module):

  def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

    """
    Parameters
    ----------

    emb_dims: List of two element tuples
      This list will contain a two element tuple for each
      categorical feature. The first element of a tuple will
      denote the number of unique values of the categorical
      feature. The second element will denote the embedding
      dimension to be used for that feature.

    no_of_cont: Integer
      The number of continuous features in the data.

    lin_layer_sizes: List of integers.
      The size of each linear layer. The length will be equal
      to the total number
      of linear layers in the network.

    output_size: Integer
      The size of the final output.

    emb_dropout: Float
      The dropout to be used after the embedding layers.

    lin_layer_dropouts: List of floats
      The dropouts to be used after each linear layer.
    """

    super().__init__()

    # Embedding layers
    self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                     for x, y in emb_dims])

    no_of_embs = sum([y for x, y in emb_dims])
    self.no_of_embs = no_of_embs
    self.no_of_cont = no_of_cont

    # Linear Layers
    first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                lin_layer_sizes[0])

    self.lin_layers =\
     nn.ModuleList([first_lin_layer] +\
          [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
           for i in range(len(lin_layer_sizes) - 1)])
    
    for lin_layer in self.lin_layers:
      nn.init.kaiming_normal_(lin_layer.weight.data)

    # Output Layer
    self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                  output_size)
    nn.init.kaiming_normal_(self.output_layer.weight.data)

    # Batch Norm Layers
    self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
    self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                    for size in lin_layer_sizes])

    # Dropout Layers
    self.emb_dropout_layer = nn.Dropout(emb_dropout)
    self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                  for size in lin_layer_dropouts])

  def forward(self, cont_data, cat_data):

    if self.no_of_embs != 0:
      x = [emb_layer(cat_data[:, i])
           for i,emb_layer in enumerate(self.emb_layers)]
      x = torch.cat(x, 1)
      x = self.emb_dropout_layer(x)

    if self.no_of_cont != 0:
      normalized_cont_data = self.first_bn_layer(cont_data)

      if self.no_of_embs != 0:
        x = torch.cat([x, normalized_cont_data], 1) 
      else:
        x = normalized_cont_data

    for lin_layer, dropout_layer, bn_layer in\
        zip(self.lin_layers, self.droput_layers, self.bn_layers):
      
      x = F.relu(lin_layer(x))
      x = bn_layer(x)
      x = dropout_layer(x)

    x = self.output_layer(x)

    return x

In [146]:

import pandas as pd
import numpy as np


(39286, 85)
(39286,)
(4910, 85)
(4910,)
(4912, 85)
(4912,)


(None, None)

In [84]:
#pip install fast_ml

Note: you may need to restart the kernel to use updated packages.


In [85]:
#Splitting the dataset randomly
#from fast_ml.model_development import train_valid_test_split

#X_train2, y_train2, X_valid2, y_valid2, X_test2, y_test2 = train_valid_test_split(df, target = 'target', 
#                                                                            method='sorted', sort_by_col='ev_date',
#                                                                            train_size=0.8, valid_size=0.1, test_size=0.1)

#print(X_train2.shape), print(y_train2.shape)
#print(X_valid2.shape), print(y_valid2.shape)
#print(X_test2.shape), print(y_test2.shape)

(39286, 85)
(39286,)
(4910, 85)
(4910,)
(4912, 85)
(4912,)


(None, None)

In [123]:
df.columns

Index(['Unnamed: 0', 'aircraft_ev_id', 'far_part', 'flight_plan_activated',
       'damage', 'acft_make', 'acft_model', 'cert_max_gr_wt', 'num_eng',
       'type_last_insp', 'date_last_insp', 'afm_hrs_last_insp', 'afm_hrs',
       'type_fly', 'dprt_apt_id', 'dprt_city', 'dprt_state', 'dprt_time',
       'dest_same_local', 'dest_apt_id', 'dest_city', 'dest_state',
       'phase_flt_spec', 'afm_hrs_since', 'rwy_num', 'rwy_len', 'rwy_width',
       'ifr_equipped_cert', 'eng_no', 'eng_type', 'eng_mfgr', 'hp_or_lbs',
       'carb_fuel_injection', 'eng_time_total', 'eng_time_last_insp',
       'ntsb_no', 'ev_type', 'ev_date', 'ev_dow', 'ev_time', 'ev_city',
       'ev_state', 'ev_year', 'ev_month', 'latitude', 'longitude', 'apt_name',
       'apt_dist', 'apt_dir', 'apt_elev', 'wx_src_iic', 'wx_obs_time',
       'wx_obs_dir', 'wx_obs_fac_id', 'wx_obs_elev', 'wx_obs_dist',
       'light_cond', 'sky_cond_nonceil', 'sky_nonceil_ht', 'sky_ceil_ht',
       'sky_cond_ceil', 'vis_rvr', 'vis_sm', 'wx

In [87]:
categorical_columns = ['flight_plan_activated', 'damage', 'acft_make', 'acft_model', 'cert_max_gr_wt', 'type_last_insp', 'type_fly', 'dprt_apt_id', 'dprt_city', 'dprt_state', 
                       'phase_flt_spec', 'afm_hrs_since', 'rwy_num', 'ifr_equipped_cert', 'eng_type', 'eng_mfgr', 'hp_or_lbs','carb_fuel_injection', 'ev_dow','ev_time','rwy_len', 'rwy_width', 'ev_state', 'ev_year', 'ev_month', 'latitude', 
                       'longitude', 'apt_name','wx_src_iic','wx_obs_fac_id','light_cond', 'sky_cond_nonceil','sky_cond_ceil','wind_dir_ind','wind_vel_ind',
                       'gust_ind', 'wx_int_precip','wx_cond_basic','pilot_flying']
                       
                       

In [147]:
pd.set_option('display.max_rows', 100)
df.dtypes

Unnamed: 0                        int64
aircraft_ev_id                   object
far_part                          int64
flight_plan_activated            object
damage                           object
acft_make                        object
acft_model                       object
cert_max_gr_wt                   object
num_eng                           int64
type_last_insp                   object
date_last_insp                   object
afm_hrs_last_insp               float64
afm_hrs                           int64
type_fly                         object
dprt_apt_id                      object
dprt_city                        object
dprt_state                       object
dprt_time                        object
dest_same_local                  object
dest_apt_id                      object
dest_city                        object
dest_state                       object
phase_flt_spec                  float64
afm_hrs_since                    object
rwy_num                          object


In [88]:
numerical_columns = ['afm_hrs_last_insp', 'afm_hrs', 'eng_no', 'eng_time_total', 'eng_time_last_insp', 'apt_dist', 'apt_dir', 
                     'wx_obs_time', 'wx_obs_dir', 'wx_obs_elev', 'wx_obs_dist', 'sky_nonceil_ht', 'sky_ceil_ht', 'vis_rvr', 'vis_sm', 'wx_temp', 'wx_dew_pt',
                   'wind_dir_deg','wind_vel_kts','gust_kts', 'altimeter', 'wx_dens_alt']
                     

In [89]:
outputs = ['target']

In [139]:
df = pd.read_csv('labeled_clean_ntsb.csv', parse_dates=['ev_date'], low_memory=False)
pd.set_option('display.max_rows', 100)
df.dtypes

Unnamed: 0                        int64
aircraft_ev_id                   object
far_part                          int64
flight_plan_activated            object
damage                           object
acft_make                        object
acft_model                       object
cert_max_gr_wt                   object
num_eng                           int64
type_last_insp                   object
date_last_insp                   object
afm_hrs_last_insp               float64
afm_hrs                           int64
type_fly                         object
dprt_apt_id                      object
dprt_city                        object
dprt_state                       object
dprt_time                        object
dest_same_local                  object
dest_apt_id                      object
dest_city                        object
dest_state                       object
phase_flt_spec                  float64
afm_hrs_since                    object
rwy_num                          object


In [157]:
#Dropping columns. wx_obs_fac_id used for joining with weather data, needed for analysis? last two for class, not needed for model?
df = df.drop(['far_part', 'date_last_insp', 'dest_same_local', 'dest_apt_id', 'dest_city',  'dest_state',  
              'phase_flt_spec',  'eng_time_last_insp', 'ev_time',  'apt_dist',  'apt_dir',  'apt_name', 
              'wx_obs_dir', 'wx_obs_fac_id', 'wx_obs_elev','wx_obs_dist', 'ev_highest_injury', 'inj_tot_t', 
              'crew_inj_level','flight_plan_activated'], axis=1)

In [158]:
df.dtypes

Unnamed: 0                      int64
aircraft_ev_id                 object
damage                         object
acft_make                      object
acft_model                     object
cert_max_gr_wt                 object
num_eng                         int64
type_last_insp                 object
afm_hrs_last_insp             float64
afm_hrs                         int64
type_fly                       object
dprt_apt_id                    object
dprt_city                      object
dprt_state                     object
dprt_time                      object
afm_hrs_since                  object
rwy_num                        object
rwy_len                       float64
rwy_width                     float64
ifr_equipped_cert                bool
eng_no                        float64
eng_type                       object
eng_mfgr                       object
hp_or_lbs                      object
carb_fuel_injection            object
eng_time_total                float64
ntsb_no     

In [163]:
# Correcting floats to integer data types, and dropping any rows with NA in these colums
df2_formatted = df.astype({"afm_hrs_last_insp":'int64', 
                "phase_flt_spec":'int64', "rwy_len":'category', 
                "rwy_width":'category', "eng_no":'int64', "eng_time_total":'int64', 
                "eng_time_last_insp":'int64', "ev_time":'int64', "apt_dist":'int64', 
                "apt_dir":'int64', "apt_elev":'int64',  "wx_obs_time":'int64', 
                "wx_obs_dir":'int64', "wx_obs_elev":'int64', "wx_obs_dist":'int64', 
                "sky_nonceil_ht":'int64', "sky_ceil_ht":'int64', "vis_rvr":'int64', 
                "vis_sm":'int64', "wx_temp":'int64', "wx_dew_pt":'int64', "wind_dir_deg":'int64', 
                "wind_vel_kts":'int64', "gust_kts":'int64', "altimeter":'int64', "wx_dens_alt":'int64', 
                "inj_tot_t":'int64', "crew_no":'int64', "crew_age":'int64', "secondary_eng_mfgr":'category'})


KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'phase_flt_spec' not found in columns."

In [159]:
df.isna().sum()                      

Unnamed: 0                 0
aircraft_ev_id             0
damage                     0
acft_make                  0
acft_model                 0
cert_max_gr_wt             0
num_eng                    0
type_last_insp             0
afm_hrs_last_insp      11578
afm_hrs                    0
type_fly                   0
dprt_apt_id            11991
dprt_city              11809
dprt_state             11254
dprt_time               4135
afm_hrs_since          32668
rwy_num                 7580
rwy_len                23148
rwy_width              24450
ifr_equipped_cert          0
eng_no                   706
eng_type                1726
eng_mfgr                1747
hp_or_lbs               2702
carb_fuel_injection     5987
eng_time_total         46841
ntsb_no                    0
ev_type                    0
ev_date                    0
ev_dow                     0
ev_city                    2
ev_state                  32
ev_year                    0
ev_month                   0
latitude      

In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()


Unnamed: 0,aircraft_ev_id,latitude,longitude


In [162]:
df2_formatted = df.astype({"damage":'category', "acft_make":'category',
                                      "acft_model":'category', "cert_max_gr_wt":'category',  "type_last_insp":'category',  
                                      "type_fly":'category', "dprt_apt_id":'category',  "dprt_city":'category',  
                                      "dprt_state":'category',  "dprt_time":'category',  "afm_hrs_since":'category',  
                                      "rwy_num":'category',  "ifr_equipped_cert":'category',  "eng_type":'category', 
                                      "afm_hrs":'category',  "dprt_time":'category',  "eng_mfgr":'category',
                                      "hp_or_lbs":'category',  "carb_fuel_injection":'category',  "eng_time_last_insp":'category',  
                                      "eng_time_total":'category', "ev_type":'category', "ev_dow":'category',
                                      "ev_city":'category', "ev_state":'category', "ev_month":'category', "rwy_len":'category', 
                                      "rwy_width":'category', "secondary_eng_mfgr":'category',"wx_src_iic":'category', "light_cond":'category',
                                      "sky_cond_nonceil":'category', "sky_cond_ceil":'category', "wind_dir_ind":'category', "wind_vel_ind": 'category', 
                                      "gust_ind":'category', "wx_int_precip":'category', "wx_cond_basic":'category', "crew_category":'category', "crew_sex":'category',
                                       "med_certf":'category', "pilot_flying":'category', "secondary_eng_mfgr":'category'})
                                 

KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'eng_time_last_insp' not found in columns."

In [121]:
df2_formatted

Unnamed: 0.1,Unnamed: 0,aircraft_ev_id,far_part,flight_plan_activated,damage,acft_make,acft_model,cert_max_gr_wt,num_eng,type_last_insp,...,wx_cond_basic,crew_no,crew_category,crew_age,crew_sex,med_certf,pilot_flying,crew_inj_level,secondary_eng_mfgr,target


In [None]:
    
                                      
                                      

                                
df2_formatted['flight_plan_activated'].fillna('Unknown', inplace=True)
df2_formatted['afm_hrs_since'].fillna('Unknown', inplace=True)                                   
df2_formatted['hp_or_lbs'].fillna('Unknown', inplace=True)
df2_formatted['carb_fuel_injection'].fillna('Unknown', inplace=True)
df2_formatted['damage'].fillna('Unknown', inplace=True)
df2_formatted['acft_make'].fillna('Unknown', inplace=True)
df2_formatted['acft_model'].fillna('Unknown', inplace=True)                                      
df2_formatted['cert_max_gr_wt'].fillna('Unknown', inplace=True)
df2_formatted['type_last_insp'].fillna('Unknown', inplace=True)                                      
df2_formatted['type_fly'].fillna('Unknown', inplace=True)                                      
df2_formatted['dprt_apt_id'].fillna('Unknown', inplace=True)                                      
df2_formatted['dprt_city'].fillna('Unknown', inplace=True)                                      
df2_formatted['dprt_state'].fillna('Unknown', inplace=True)                                      
df2_formatted['dprt_time'].fillna('Unknown', inplace=True)                                      
df2_formatted['rwy_num'].fillna('Unknown', inplace=True)                                      
df2_formatted['ifr_equipped_cert'].fillna('Unknown', inplace=True)                                      
df2_formatted['eng_type'].fillna('Unknown', inplace=True)
df2_formatted['afm_hrs'].fillna('Unknown', inplace=True)                                      
df2_formatted['eng_mfgr'].fillna('Unknown', inplace=True) 
df2_formatted['eng_time_last_insp'].fillna('Unknown', inplace=True)                                       
df2_formatted['eng_time_total'].fillna('Unknown', inplace=True)                                       
df2_formatted['ev_type'].fillna('Unknown', inplace=True)                                       
df2_formatted['ev_dow'].fillna('Unknown', inplace=True)                                       
df2_formatted['ev_city'].fillna('Unknown', inplace=True)                                           
df2_formatted['ev_state'].fillna('Unknown', inplace=True)      
df2_formatted['ev_month'].fillna('Unknown', inplace=True)
df2_formatted['rwy_len'].fillna('Unknown', inplace=True)
df2_formatted['rwy_width'].fillna('Unknown', inplace=True)
df2_formatted['secondary_eng_mfgr'].fillna('Unknown', inplace=True)
df2_formatted['crew_no'].fillna('Unknown', inplace=True)                                    
                                                                           

# Clustering Lat/Long per https://levelup.gitconnected.com/clustering-gps-co-ordinates-forming-regions-4f50caa7e4a1

df2_formatted = df2_formatted.dropna(axis=0,how='any',subset=['latitude','longitude'],inplace=True)
# Variable with the Longitude and Latitude
X=df2_formatted.loc[:,['aircraft_ev_id','latitude','longitude']]
X.head(10)    

                                      
df2_formatted['date_last_insp'] = df2_formatted['date_last_insp'].to_datetime()                                     
#Bin: afm_hrs, afm_hrs_last_insp, dprt_time ['1201-1800', '1801-2400', '0001-0600', '0601-1200', nan],  "rwy_len":'category'
#wx_obs_time, sky_nonceil_ht, sky_ceil_ht, vis_rvr, vis_sm, wx_temp, wx_dew_pt, wind_dir_deg  
#gust_kts, altimeter, wx_dens_alt, crew_age                                    
                                      

df2_formatted = df2_formatted.astype({"afm_hrs_last_insp":'int64', "phase_flt_spec":'int64', "eng_no":'int64', "eng_time_total":'int64', "eng_time_last_insp":'int64', "apt_elev":'int64', "wx_obs_time":'int64', "wx_obs_dir":'int64', "wx_obs_elev":'int64', "wx_obs_dist":'int64', "sky_nonceil_ht":'int64', "sky_ceil_ht":'int64', "vis_rvr":'int64', "vis_sm":'int64', "wx_temp":'int64', "wx_dew_pt":'int64', "wind_dir_deg":'int64', "wind_vel_kts":'int64', "gust_kts":'int64', "altimeter":'int64', "wx_dens_alt":'int64', "inj_tot_t":'int64', "crew_no":'int64', "crew_age":'int64'})
df2_formatted=df.dropna(subset=['afm_hrs_last_insp','phase_flt_spec', 'eng_no', 'eng_time_total', 'eng_time_last_insp', 'apt_elev', 'wx_obs_time', 'wx_obs_dir', 'wx_obs_elev', 'wx_obs_dist', 'sky_nonceil_ht', 'sky_ceil_ht', 'vis_rvr', 'vis_sm', 'wx_temp', 'wx_dew_pt', 'wind_dir_deg', 'wind_vel_kts', 'gust_kts', 'altimeter', 'wx_dens_alt', 'inj_tot_t', 'crew_no', 'crew_age'])
                                        
                                      
                            

In [101]:
print(df.latitude)

28888        NaN
38654    365520n
31333        NaN
31334        NaN
31335        NaN
          ...   
40212    321926n
40207    384327n
40211    405957n
40205    332754n
40208    282825n
Name: latitude, Length: 49108, dtype: object


In [165]:
nan_count = df['latitude'].isna().sum()
print(nan_count)

df['phase_flt_spec'].describe(include='all') 

33088


KeyError: 'phase_flt_spec'

In [164]:
df['phase_flt_spec'].unique()

KeyError: 'phase_flt_spec'

In [109]:
# Declaring a value that we need to find
val = '0nan'

# Finding our value in dataframe
res = df[df.eq(val).any(1)]

# Display result
print("Result:\n",res)

Result:
        Unnamed: 0  aircraft_ev_id  far_part flight_plan_activated damage  \
26265       26265  20001214x42125       135                     n   subs   
36022       36022  20040218x00198        91                     n   subs   
35431       35431  20030721x01158        91                     n   dest   
12200       12200  20001212x21263        91                     n   subs   
12560       12560  20001212x22111        91                     n   dest   
...           ...             ...       ...                   ...    ...   
40209       40209  20221128106368        91                   NaN   subs   
40214       40214  20221128106373        91                     n   subs   
40206       40206  20221206106424        91                     n   subs   
40212       40212  20221205106408        91                     y   dest   
40207       40207  20221206106422        91                   NaN   subs   

               acft_make   acft_model  cert_max_gr_wt  num_eng type_last_insp 

In [170]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [171]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [173]:
# example of learned embedding encoding for a neural network
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import concatenate
from keras.utils import plot_model
 
# load the dataset
def load_dataset(filename):
 # load the dataset as a pandas DataFrame
 data = df
 # retrieve numpy array
 dataset = data.values
 # split into input (X) and output (y) variables
 X = dataset[:, :-1]
 y = dataset[:,-1]
 # format all fields as string
 X = X.astype(str)
 # reshape target to be a 2d array
 y = y.reshape((len(y), 1))
 return X, y
 
# prepare input data
def prepare_inputs(X_train, X_test):
 X_train_enc, X_test_enc = list(), list()
 # label encode each column
 for i in range(X_train.shape[1]):
     le = LabelEncoder()
 le.fit(X_train[:, i])
 # encode
 train_enc = le.transform(X_train[:, i])
 test_enc = le.transform(X_test[:, i])
 # store
 X_train_enc.append(train_enc)
 X_test_enc.append(test_enc)
 return X_train_enc, X_test_enc
 
# prepare target
def prepare_targets(y_train, y_test):
 le = LabelEncoder()
 le.fit(y_train)
 y_train_enc = le.transform(y_train)
 y_test_enc = le.transform(y_test)
 return y_train_enc, y_test_enc
 
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# make output 3d
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))
# prepare each input head
in_layers = list()
em_layers = list()
for i in range(len(X_train_enc)):
 # calculate the number of unique inputs
 n_labels = len(unique(X_train_enc[i]))
 # define input layer
 in_layer = Input(shape=(1,))
 # define embedding layer
 em_layer = Embedding(n_labels, 10)(in_layer)
 # store layers
 in_layers.append(in_layer)
 em_layers.append(em_layer)
# concat all embeddings
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# plot graph
plot_model(model, show_shapes=True, to_file='embeddings.png')
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


2023-04-24 01:36:48.657859: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/20
2057/2057 - 2s - loss: -5.8413e+02 - accuracy: 0.1697 - 2s/epoch - 1ms/step
Epoch 2/20
2057/2057 - 2s - loss: -6.4713e+03 - accuracy: 0.1703 - 2s/epoch - 816us/step
Epoch 3/20
2057/2057 - 2s - loss: -2.2003e+04 - accuracy: 0.1703 - 2s/epoch - 847us/step
Epoch 4/20
2057/2057 - 2s - loss: -4.9395e+04 - accuracy: 0.1703 - 2s/epoch - 784us/step
Epoch 5/20
2057/2057 - 2s - loss: -9.0746e+04 - accuracy: 0.1703 - 2s/epoch - 829us/step
Epoch 6/20
2057/2057 - 2s - loss: -1.4841e+05 - accuracy: 0.1703 - 2s/epoch - 851us/step
Epoch 7/20
2057/2057 - 2s - loss: -2.2460e+05 - accuracy: 0.1703 - 2s/epoch - 796us/step
Epoch 8/20
2057/2057 - 2s - loss: -3.2135e+05 - accuracy: 0.1703 - 2s/epoch - 880us/step
Epoch 9/20
2057/2057 - 2s - loss: -4.4092e+05 - accuracy: 0.1703 - 2s/epoch - 781us/step
Epoch 10/20
2057/2057 - 2s - loss: -5.8574e+05 - accuracy: 0.1703 - 2s/epoch - 820us/step
Epoch 11/20
2057/2057 - 2s - loss: -7.5751e+05 - accuracy: 0.1703 - 2s/epoch - 1ms/step
Epoch 12/20
2057/2057 -

In [174]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [175]:
#Splitting the dataset into Test/Train/Validate based on time 

df = pd.read_csv('labeled_clean_ntsb.csv', parse_dates=['ev_date'], low_memory=False)

# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
train_size = 0.8
valid_size=0.1

train_index = int(len(df)*train_size)

# First we need to sort the dataset by the desired column 
df.sort_values(by = 'ev_date', ascending=True, inplace=True)

df_train = df[0:train_index]
df_rem = df[train_index:]

valid_index = int(len(df)*valid_size)

df_valid = df[train_index:train_index+valid_index]
df_test = df[train_index+valid_index:]

X_train, y_train = df_train.drop(columns='target').copy(), df_train['target'].copy()
X_valid, y_valid = df_valid.drop(columns='target').copy(), df_valid['target'].copy()
X_test, y_test = df_test.drop(columns='target').copy(), df_test['target'].copy()
        
print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(39286, 85)
(39286,)
(4910, 85)
(4910,)
(4912, 85)
(4912,)


(None, None)

In [176]:
#https://www.tensorflow.org/tutorials/structured_data/feature_columns

# In the original dataset "4" indicates the pet was not adopted.
df['target'] = np.where(dataframe['AdoptionSpeed']==4, 0, 1)

# Drop un-used columns.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])