# NTSB Notebook #1: data joining and initial cleaning

### Import pandas, data and config

In [1]:
import pandas as pd

In [2]:
# configure pandas settings
pd.options.display.max_info_columns = 120

In [3]:
# read in post 2008 data
df_new = pd.read_csv('ntsb_post_2008.csv')

  df_new = pd.read_csv('ntsb_post_2008.csv')


In [4]:
df_new.shape

(65534, 109)

In [5]:
# read in pre-2008 data
df_old = pd.read_csv('NTSB_pre_2008.csv')

  df_old = pd.read_csv('NTSB_pre_2008.csv')


In [6]:
df_old.shape

(66857, 109)

### Join both dataframes

In [7]:
# check to see if the columns from both dataframes are identical
col_compare = list(zip(df_old.columns, df_new.columns))

for x in col_compare:
    if x[0] != x[1]:
        print(x)


('aircraft_ev_id', 'aircraft.ev_id')
('engines_ev_id', 'engines.ev_id')
('events_ev_id', 'events.ev_id')
('Findings_ev_id', 'Findings.ev_id')
('Flight_Crew_ev_id', 'Flight_Crew.ev_id')


In [8]:
# fix the column labels
new_labels = [x.replace('.', '_') for x in list(df_new.columns)]

# create mapper:
label_map = dict(zip(df_new.columns, new_labels))

In [9]:
df_new = df_new.rename(columns=label_map)

In [10]:
# validate column rename worked
col_compare = list(zip(df_old.columns, df_new.columns))

for x in col_compare:
    if x[0] != x[1]:
        print(x)

In [11]:
# concatenate both data sets
df = pd.concat([df_old, df_new])

In [12]:
# concatenated df shape
df.shape

(132391, 109)

### Filter by far_part

In [13]:
# view all part data in df
df['far_part'].value_counts()

091     103229
121       7654
135       6551
137       5800
NUSN      2230
PUBU      1615
NUSC      1553
129       1399
133        583
UNK        514
103        207
091K       124
125         77
ARMF        73
091F        40
437         14
107          5
Name: far_part, dtype: int64

In [14]:
# filter out part 121, 125, 129, UNK, 133, , NUSN, PUBU, NUSC, 107
mask = ['121', '125', '129', 'UNK', '133', 'NUSN', 'PUBU', 'NUSC', '107']
df = df[~df['far_part'].isin(mask)]

In [15]:
# drop irrelevant columns or very high null counts
df = df.drop(['metar', 'fuel_on_board', 'dest_country', 'dprt_country', 'crew_res_state', 'crew_city', 'Findings_ev_id',
              'Flight_Crew_ev_id', 'Flight_Crew_Aircraft_Key', 'Findings_Aircraft_Key', 'vis_rvv', 'wx_obs_tmzn', 
              'inj_f_grnd', 'inj_m_grnd', 'inj_s_grnd', 'inj_tot_f', 'inj_tot_m', 'inj_tot_n','inj_tot_s', 'dprt_timezn',
              'wx_brief_comp'], axis=1)

In [16]:
#filter out records where the phase was not taxt or inflight
phase_mask = [0.0, 610.0, 600.0, 500.0, 501.0, 502.0, 503.0, 504.0, 505.0] 
df = df[~df['phase_flt_spec'].isin(phase_mask)]

In [22]:
# drop all records that do not have at least 60 columns worth of data in them
df1 = df.dropna(thresh=60)

In [23]:
df1.shape

(104120, 88)

In [28]:
df1['ev_highest_injury'].value_counts()

NONE    54732
FATL    23655
MINR    14807
SERS    10839
Name: ev_highest_injury, dtype: int64

In [25]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104120 entries, 0 to 65533
Data columns (total 88 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   aircraft_ev_id         104120 non-null  object 
 1   far_part               104120 non-null  object 
 2   flight_plan_activated  45517 non-null   object 
 3   damage                 103326 non-null  object 
 4   acft_make              104108 non-null  object 
 5   acft_model             104096 non-null  object 
 6   cert_max_gr_wt         93685 non-null   float64
 7   num_eng                102746 non-null  float64
 8   type_last_insp         99030 non-null   object 
 9   date_last_insp         83764 non-null   object 
 10  afm_hrs_last_insp      58888 non-null   float64
 11  afm_hrs                84919 non-null   float64
 12  type_fly               100532 non-null  object 
 13  dprt_apt_id            85958 non-null   object 
 14  dprt_city              88866 non-null

In [18]:
# save this version of the data
df1.to_csv('NTSB_for_cleaning.csv')

### Outcome columns for labeling purposes

In [27]:
# the columns to be used to create the labels are: inj_tot_t, damage, crew_inj_level, ev_highest_injury