<a href="https://colab.research.google.com/github/kd365/faafall22/blob/TEST_BRANCH/cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NTSB Cleaning Notebook

## Steps:
1. Import libraries and raw data
2. Initial column drops
3. Cleaning steps that wil apply to entire dataframe
4. Cleaning each column

**Cleaning Plan:**
Hunter:
- cols 0-22
Kathleen:
- col 23-45
Eric:
- col 46-68
Chi:
- col 69-85


test cell

### 1. Import libraries

In [1]:
from google.colab import files
import pandas as pd
 
uploaded = files.upload()

Saving NTSB_for_cleaning.csv to NTSB_for_cleaning.csv


In [4]:
# setting pandas display options
pd.set_option('display.max_rows', 150)

# reading in NTSB csv file
ntsb_raw = pd.read_csv("NTSB_for_cleaning.csv", encoding='latin-1', low_memory=False)
ntsb_raw = ntsb_raw.drop(['engines_ev_id', 'events_ev_id', 'Unnamed: 0'], axis=1)
# view the dataframe info
ntsb_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58688 entries, 0 to 58687
Data columns (total 86 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   aircraft_ev_id         58688 non-null  object 
 1   far_part               58688 non-null  object 
 2   flight_plan_activated  37159 non-null  object 
 3   damage                 58197 non-null  object 
 4   acft_make              58682 non-null  object 
 5   acft_model             58680 non-null  object 
 6   cert_max_gr_wt         51883 non-null  float64
 7   num_eng                57954 non-null  float64
 8   type_last_insp         55899 non-null  object 
 9   date_last_insp         50765 non-null  object 
 10  afm_hrs_last_insp      28970 non-null  float64
 11  afm_hrs                51868 non-null  float64
 12  type_fly               55905 non-null  object 
 13  dprt_apt_id            55148 non-null  object 
 14  dprt_city              57714 non-null  object 
 15  dp

### 2. Initial Column Drop

In [5]:
# view data
ntsb_raw.head()

Unnamed: 0,aircraft_ev_id,far_part,flight_plan_activated,damage,acft_make,acft_model,cert_max_gr_wt,num_eng,type_last_insp,date_last_insp,...,inj_tot_t,wx_cond_basic,Cause_Factor,crew_no,crew_category,crew_age,crew_sex,med_certf,pilot_flying,crew_inj_level
0,20001204X00006,91,,DEST,Beech,300,14100.0,2.0,AAIP,1/14/1999,...,2.0,IMC,,,,,,,False,
1,20001204X00008,91,,DEST,Aero Commander,560A,6000.0,2.0,ANNL,10/1/1998,...,4.0,VMC,,,,,,,False,
2,20001204X00016,91,,DEST,Beech,95-C55,5300.0,2.0,ANNL,6/22/1998,...,3.0,IMC,,,,,,,False,
3,20001204X00017,91,,DEST,Beech,BE-55,5121.0,2.0,ANNL,11/4/1998,...,3.0,IMC,,,,,,,False,
4,20001204X00031,91,Y,SUBS,Piper,PA-46-350P,4300.0,1.0,COAW,11/24/1998,...,1.0,IMC,,,,,,,False,


In [6]:
# drop any columns that have more than 40% null values
total_rows = ntsb_raw.shape[0]

def drop_cols(df, df_size):
    # create a list of columns to drop
    drop = []
    # iterate over each column
    for x in df.columns:
        #determine if the ratio of nulls is greater than 30%
        ratio = (df[x].isna().sum()) / df_size
        if ratio > 0.4:
            drop.append(x)
    # drop the columns in the drop list
    df.drop(labels=drop, axis=1)
    print('Cols dropped from df:', drop)
    print(df.shape)
    return df

ntsb_dropped_cols = drop_cols(ntsb_raw, total_rows)      


Cols dropped from df: ['afm_hrs_last_insp', 'dest_same_local', 'phase_flt_spec', 'apt_dir', 'vis_rvr', 'wx_dens_alt', 'wx_int_precip', 'crew_sex']
(58688, 86)


In [7]:
print(ntsb_dropped_cols.shape)

(58688, 86)


-Add step to lower case entire dataframe

### 3. Cleaning Each Column

In [8]:
# function to show all the metrics you will want to see for each column
def col_metrics(df, col='none'):
  print('Unique data: ', df[col].unique())
  print('Value counts: ', df[col].value_counts())
  print('Total nulls: ', df[col].isna().sum())
  print('Data type: ', df[col].type())

## HUNTER'S SECTION

In [None]:
# column 1: far part
col_metrics(ntsb_dropped_cols, col='far_part')      

# making a copy of the data set as ntsb_1
ntsb_1 = pd.DataFrame.copy(ntsb_dropped_cols)

# remove the K and F from part 091
ntsb_1['far_part'] = ntsb_1['far_part'].str.replace('K', '').str.replace('F', '')
print(ntsb_1['far_part'].unique())

Unique data:  ['091' '135' '137' 'ARM' '103']
Value counts:  091    53791
135     3322
137     1537
ARM       28
103       10
Name: far_part, dtype: int64
Total nulls:  0
['091' '135' '137' 'ARM' '103']


In [None]:
# column 2: 'flight_plan_activated'
col_metrics(ntsb_1, 'flight_plan_activated')

# fill na with "U"
ntsb_1['flight_plan_activated'] = ntsb_1['flight_plan_activated'].fillna('U')
print(ntsb_1['flight_plan_activated'].isna().sum())

Unique data:  ['U' 'Y' 'N']
Value counts:  N    26101
U    22763
Y     9824
Name: flight_plan_activated, dtype: int64
Total nulls:  0
0


In [None]:
# column 3: 'damage'
col_metrics(ntsb_1, 'damage')

# change the one Unk to None
ntsb_1['damage'] = ntsb_1['damage'].str.replace('UNK', 'NONE')

Unique data:  ['DEST' 'SUBS' 'MINR' 'NONE' 'UNK' nan]
Value counts:  SUBS    47583
DEST     8494
MINR     2065
NONE       54
UNK         1
Name: damage, dtype: int64
Total nulls:  491


In [None]:
# column 4: 'acft_make'
col_metrics(ntsb_1, 'acft_make')

# there are 6 rosw with nulls, will drop those
ntsb_1 = ntsb_1.dropna(subset=['acft_make'])
print(ntsb_1['acft_make'].isna().sum())

# force all strings to lower to promote uniformity
ntsb_1['acft_make'] = ntsb_1['acft_make'].str.lower()
print(ntsb_1['acft_make'].unique())

Unique data:  ['beech' 'aero commander' 'piper' ... 'gonzalez manuel a' 'brandt'
 'harris-runyan']
Value counts:  cessna            17442
piper             10125
beech              4798
mooney              975
bell                756
                  ...  
arnet pereyra         1
empson                1
purvis/thorpe         1
barackman vans        1
teal harry h          1
Name: acft_make, Length: 2811, dtype: int64
Total nulls:  0
0
['beech' 'aero commander' 'piper' ... 'gonzalez manuel a' 'brandt'
 'harris-runyan']


In [None]:
# column 5: 'acft_model'
col_metrics(ntsb_1, 'acft_model')

# drop the 5 rows that have nulls
ntsb_1 = ntsb_1.dropna(subset=['acft_model'])

Unique data:  ['300' '560A' '95-C55' ... 'RV 7' 'F4U 5' 'Skybolt 300']
Value counts:  172                    988
152                    927
172S                   836
172N                   714
172M                   704
                      ... 
JODEL F-9                1
305 A                    1
STOLP STARLET SA500      1
VM-1 Esqual              1
Searey Amphibian         1
Name: acft_model, Length: 4466, dtype: int64
Total nulls:  5


In [None]:
# column 6: 'cert_max_gr_wt'
col_metrics(ntsb_1, 'cert_max_gr_wt')
print(ntsb_1['cert_max_gr_wt'].max(), ntsb_1['cert_max_gr_wt'].min())

def weight_bins(df, col):
    avg_wt = avg(df[col])
    for x in list(df[col]):
        if x <= 12500:
            x = 'small'
        elif x > 12500 and x < 41000:
            x = 'medium'
        elif x == 'nan':
            x = 
            x = 'large commuter'
    return df

Unique data:  [14100.  6000.  5300. ...  1830.  3305.  2251.]
Value counts:  2300.0     1910
2550.0     1573
3600.0     1371
1600.0     1364
3400.0     1266
           ... 
10294.0       1
615.0         1
10775.0       1
1332.0        1
7952.0        1
Name: cert_max_gr_wt, Length: 1382, dtype: int64
Total nulls:  6805
503500.0 0.0


## KATHLEEN'S SECITON

In [None]:
ntsb_2 = weight_bins(ntsb_1, 'cert_max_gr_wt')

In [None]:
ntsb_2['cert_max_gr_wt']

0        14100.0
1         6000.0
2         5300.0
3         5121.0
4         4300.0
          ...   
58683        NaN
58684        NaN
58685        NaN
58686        NaN
58687        NaN
Name: cert_max_gr_wt, Length: 58677, dtype: float64

In [None]:
ntsb_1.columns

Index(['aircraft_ev_id', 'far_part', 'flight_plan_activated', 'damage',
       'acft_make', 'acft_model', 'cert_max_gr_wt', 'num_eng',
       'type_last_insp', 'date_last_insp', 'afm_hrs_last_insp', 'afm_hrs',
       'type_fly', 'dprt_apt_id', 'dprt_city', 'dprt_state', 'dprt_time',
       'dest_same_local', 'dest_apt_id', 'dest_city', 'dest_state',
       'phase_flt_spec', 'afm_hrs_since', 'rwy_num', 'rwy_len', 'rwy_width',
       'ifr_equipped_cert', 'eng_no', 'eng_type', 'eng_mfgr', 'hp_or_lbs',
       'carb_fuel_injection', 'eng_time_total', 'eng_time_last_insp',
       'ntsb_no', 'ev_type', 'ev_date', 'ev_dow', 'ev_time', 'ev_city',
       'ev_state', 'ev_year', 'ev_month', 'latitude', 'longitude', 'apt_name',
       'ev_nr_apt_id', 'ev_nr_apt_loc', 'apt_dist', 'apt_dir', 'apt_elev',
       'wx_src_iic', 'wx_obs_time', 'wx_obs_dir', 'wx_obs_fac_id',
       'wx_obs_elev', 'wx_obs_dist', 'light_cond', 'sky_cond_nonceil',
       'sky_nonceil_ht', 'sky_ceil_ht', 'sky_cond_ceil', 'vis_

In [None]:
ntsb_raw = ntsb_raw.loc[:, ~ntsb_raw.columns.str.contains('^Unnamed')]

In [None]:
print(ntsb_raw['damage'].unique())

In [None]:
print(ntsb_raw['type_last_insp'].unique())

In [None]:
ntsb_rv1 = ntsb_raw.dropna(axis=1, how='all')
#removed one column
ntsb_rv1.describe(include='all') 

In [None]:
ntsb_rv1.info()

In [None]:

print(ntsb_rv1.dtypes)

In [None]:
ntsb_rv1.head()

In [None]:
print(ntsb_rv1[['ev_time']].to_string(index=False)) 

In [None]:
ntsb_rv1['ev_time'] = pd.to_numeric(ntsb_rv1['ev_time'])

In [None]:
import numpy as np
ntsb_rv1['ev_time'] = ntsb_rv1['ev_time'].astype(np.int64)

In [None]:
nan_count = ntsb_rv1['ev_time'].isna().sum()
print(nan_count)

In [None]:
print((ntsb_rv1['ev_time'] == 'NaN').sum())

In [None]:
#adding leading zeros for military time format
ntsb_rv1['ev_time'] = ntsb_rv1['ev_time'].apply(lambda x: '{0:0>4}'.format(x))
print(ntsb_rv1[['ev_time']].to_string(index=False)) 


In [None]:
ntsb_rv1['ev_time'] = pd.to_datetime(ntsb_rv1['ev_time'], format = '%H%m')

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
ntsb_rv1['date_last_insp'] =  pd.to_datetime(ntsb_rv1['date_last_insp'], infer_datetime_format=True)


In [None]:
#ntsb_rv2= ntsb_rv1.drop(['Aircraft_Key'], axis=1)

In [None]:
print(ntsb_rv1.dtypes)

In [None]:
ntsb_rv1['wx_obs_time'] =  pd.to_datetime(ntsb_rv1['wx_obs_time'], format = "%H%M")

In [None]:
#Walkthrough at https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html
#preprocessing different for categorical vs numerical columns

from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(ntsb_raw)
categorical_columns = categorical_columns_selector(ntsb_raw)

In [9]:
# the columns to be used to create the labels are: inj_tot_t, damage, crew_inj_level, ev_highest_injury, damage

## ERICK'S SECITON

### CHI'S SECTION