<a href="https://colab.research.google.com/github/kd365/faafall22/blob/hunter_19mar/cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NTSB Cleaning Notebook

## Steps:
1. Import libraries and raw data
2. Initial column drops
3. Cleaning steps that will apply to the entire dataframe
4. Cleaning each column

**Cleaning Plan:**
Hunter:
- cols 0-22
Kathleen:
- col 23-45
Eric:
- col 46-68
Chi:
- col 69-85


### 1. Import libraries

In [None]:
from google.colab import files
import pandas as pd
 
# upload the raw CSV through google.colab's file-upload helper; the result is kept in `uploaded`
uploaded = files.upload()

Saving NTSB_for_cleaning (1).csv to NTSB_for_cleaning (1).csv


In [None]:
# let pandas print up to 150 rows before truncating
pd.set_option('display.max_rows', 150)

# load the NTSB csv file and discard the duplicate id / index columns in one step
ntsb_raw = pd.read_csv(
    "NTSB_for_cleaning (1).csv",
    encoding='latin-1',
    low_memory=False,
).drop(columns=['engines_ev_id', 'events_ev_id', 'Unnamed: 0'])

# summary of columns, non-null counts and dtypes
ntsb_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104120 entries, 0 to 104119
Data columns (total 86 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   aircraft_ev_id         104120 non-null  object 
 1   far_part               104120 non-null  object 
 2   flight_plan_activated  45517 non-null   object 
 3   damage                 103326 non-null  object 
 4   acft_make              104108 non-null  object 
 5   acft_model             104096 non-null  object 
 6   cert_max_gr_wt         93685 non-null   float64
 7   num_eng                102746 non-null  float64
 8   type_last_insp         99030 non-null   object 
 9   date_last_insp         83764 non-null   object 
 10  afm_hrs_last_insp      58888 non-null   float64
 11  afm_hrs                84919 non-null   float64
 12  type_fly               100532 non-null  object 
 13  dprt_apt_id            85958 non-null   object 
 14  dprt_city              88866 non-nul

### 2. Initial Drop

In [None]:
# drop all records that do not have at least 50 columns worth of data in them
# (dropna's `thresh` is the minimum number of non-null values a row needs to be kept)
ntsb_raw = ntsb_raw.dropna(thresh=50)

In [None]:
# view data: preview the first rows of ntsb_raw
ntsb_raw.head()

Unnamed: 0,aircraft_ev_id,far_part,flight_plan_activated,damage,acft_make,acft_model,cert_max_gr_wt,num_eng,type_last_insp,date_last_insp,...,inj_tot_t,wx_cond_basic,Cause_Factor,crew_no,crew_category,crew_age,crew_sex,med_certf,pilot_flying,crew_inj_level
0,20001204X00000,135,,SUBS,Cessna,207,3800.0,1.0,100H,12/16/1998,...,1.0,VMC,,,,,,,False,
1,20001204X00004,135,,SUBS,Cessna,207,3800.0,1.0,AAIP,12/14/1998,...,,VMC,,,,,,,False,
2,20001204X00005,91,,SUBS,Piper,PA-22-160,1840.0,1.0,ANNL,3/14/1998,...,,VMC,,,,,,,False,
3,20001204X00006,91,,DEST,Beech,300,14100.0,2.0,AAIP,1/14/1999,...,2.0,IMC,,,,,,,False,
4,20001204X00007,91,,DEST,Piper,PA-28-181,2550.0,1.0,UNK,,...,1.0,VMC,,,,,,,False,


In [None]:
# drop any columns that have more than 40% null values
# total_rows is the row count used as the denominator of each column's null ratio
total_rows = ntsb_raw.shape[0]

def drop_cols(df, df_size, threshold=0.4):
    '''
    drop every column whose share of null values exceeds `threshold`

    args:
        df: dataframe to inspect
        df_size: number of rows used as the denominator of the null ratio
        threshold: maximum tolerated null ratio; columns strictly above it
            are dropped (defaults to 0.4, i.e. 40%)

    returns: a new dataframe without the dropped columns; `df` itself is
        left unchanged
    '''
    # ratio of nulls per column, computed for all columns at once
    null_ratio = df.isna().sum() / df_size
    # create a list of columns to drop
    drop = null_ratio[null_ratio > threshold].index.tolist()
    # DataFrame.drop returns a new frame, so the result has to be captured;
    # discarding it leaves every column in place
    trimmed = df.drop(columns=drop)
    print('Cols dropped from df:', drop)
    print(trimmed.shape)
    return trimmed

# run drop_cols on the raw frame and keep the returned frame as ntsb_dropped_cols
ntsb_dropped_cols = drop_cols(ntsb_raw, total_rows)


Cols dropped from df: ['flight_plan_activated', 'afm_hrs_last_insp', 'dest_same_local', 'phase_flt_spec', 'afm_hrs_since', 'eng_time_total', 'eng_time_last_insp', 'latitude', 'longitude', 'apt_dir', 'vis_rvr', 'wx_dens_alt', 'wx_int_precip', 'Cause_Factor', 'crew_no', 'crew_category', 'crew_age', 'crew_sex', 'med_certf', 'crew_inj_level']
(104120, 86)


In [None]:
# check the (rows, columns) dimensions of the frame returned by drop_cols
print(ntsb_dropped_cols.shape)

(104120, 86)


## 3. Cleaning that applies to entire data set

In [None]:
# function to initially clean up all strings in df
def df_string_transform(df):
    ''' 
    force all text data to lowercase and strip left and right side of strings
    in all string columns

    non-string values (numbers, nulls, ...) are passed through unchanged and
    the input dataframe is not modified

    args: dataframe

    returns: dataframe
    '''
    def _clean(value):
        # isinstance also covers str subclasses, which `type(x) == str` misses
        return value.lower().strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map on newer
    # pandas; fall back to applymap where map does not exist yet
    if hasattr(pd.DataFrame, 'map'):
        return df.map(_clean)
    return df.applymap(_clean)

In [None]:
# applying the function from above: lowercase and strip every string in the frame
ntsb_1 = df_string_transform(ntsb_dropped_cols)

**NOTE**: for the column-by-column cleaning below, use the dataframe `ntsb_1`

### 4. Cleaning Each Column

In [None]:
# function to show all the metrics you will want to see for each column
def col_metrics(df, col='none'):
    '''
    print the unique values, value counts, null count and dtype of one column

    args:
        df: dataframe holding the column
        col: name of the column to summarise

    returns: None (the metrics are printed)
    '''
    series = df[col]
    report = (
        ('Unique data: ', series.unique()),
        ('Value counts: ', series.value_counts()),
        ('Total nulls: ', series.isna().sum()),
        ('Data type: ', series.dtypes),
    )
    for label, value in report:
        print(label, value)

### HUNTER'S SECTION

In [None]:
# column 1: far part
col_metrics(ntsb_1, col='far_part')

# continue on an independent copy of the data set, still named ntsb_1
ntsb_1 = ntsb_1.copy()

# strip the 'k' and 'f' suffixes from part 091 and recode what is left of
# 'armf' ('arm') as '091'; every pattern is plain text, not a regex
ntsb_1['far_part'] = (
    ntsb_1['far_part']
    .str.replace('k', '', regex=False)
    .str.replace('f', '', regex=False)
    .str.replace('arm', '091', regex=False)
)
print(ntsb_1['far_part'].unique())

Unique data:  ['135' '091' '137' '103' 'armf' '091f' '091k' '437']
Value counts:  091     93244
135      5767
137      4737
103       147
091k      123
armf       50
091f       38
437        14
Name: far_part, dtype: int64
Total nulls:  0
Data type:  object
['135' '091' '137' '103' '437']


In [None]:
# column 2: 'flight_plan_activated'
# only inspect the column while it is still present, so the cell can be
# re-run after the column has been removed without raising a KeyError
if 'flight_plan_activated' in ntsb_1.columns:
    col_metrics(ntsb_1, 'flight_plan_activated')

# dropping this column, too many nulls, and filing a flight plan is superfluous 
#because you can file in flight or you can use flight following
# errors='ignore' makes the drop a no-op when the column is already gone
ntsb_1 = ntsb_1.drop(columns=['flight_plan_activated'], errors='ignore')

Unique data:  [nan 'y' 'n' 'u']
Value counts:  n    32367
y    11622
u     1528
Name: flight_plan_activated, dtype: int64
Total nulls:  58603
Data type:  object


In [None]:
# column 3: 'damage'
col_metrics(ntsb_1, 'damage')

# change the unk to none (plain-text replacement, not a regex)
ntsb_1['damage'] = ntsb_1['damage'].str.replace('unk', 'none', regex=False)
# drop the rows that do not have a damage listing; `subset` is given as a
# list, like the other column cells, because a bare string is not accepted
# by every pandas release
ntsb_1 = ntsb_1.dropna(subset=['damage'])
print('final null count:', ntsb_1['damage'].isna().sum())

Unique data:  ['subs' 'dest' 'none' 'minr']
Value counts:  subs    80863
dest    18947
minr     3086
none      430
Name: damage, dtype: int64
Total nulls:  0
Data type:  object
final null count: 0


In [None]:
# column 4: 'acft_make'
col_metrics(ntsb_1, 'acft_make')

# drop the rows that have no aircraft make recorded, then confirm no nulls remain
ntsb_1 = ntsb_1.dropna(subset=['acft_make'])
print(ntsb_1['acft_make'].isna().sum())

# force all strings to lower to promote uniformity
ntsb_1['acft_make'] = ntsb_1['acft_make'].str.lower()
print(ntsb_1['acft_make'].unique())

Unique data:  ['beech' 'aero commander' 'piper' ... 'gonzalez manuel a' 'brandt'
 'harris-runyan']
Value counts:  cessna            17442
piper             10125
beech              4798
mooney              975
bell                756
                  ...  
arnet pereyra         1
empson                1
purvis/thorpe         1
barackman vans        1
teal harry h          1
Name: acft_make, Length: 2811, dtype: int64
Total nulls:  0
0
['beech' 'aero commander' 'piper' ... 'gonzalez manuel a' 'brandt'
 'harris-runyan']


In [None]:
# column 5: 'acft_model'
col_metrics(ntsb_1, 'acft_model')

# drop the rows that have a null aircraft model
ntsb_1 = ntsb_1.dropna(subset=['acft_model'])

Unique data:  ['300' '560A' '95-C55' ... 'RV 7' 'F4U 5' 'Skybolt 300']
Value counts:  172                    988
152                    927
172S                   836
172N                   714
172M                   704
                      ... 
JODEL F-9                1
305 A                    1
STOLP STARLET SA500      1
VM-1 Esqual              1
Searey Amphibian         1
Name: acft_model, Length: 4466, dtype: int64
Total nulls:  5


In [None]:
# column 6: 'cert_max_gr_wt'
col_metrics(ntsb_1, 'cert_max_gr_wt')
# show the largest and smallest certified max gross weight in the column
print(ntsb_1['cert_max_gr_wt'].max(), ntsb_1['cert_max_gr_wt'].min())

def weight_bins(df, col):
    '''
    replace a numeric weight column with weight-class labels

    classes:
        'small'          -> weight <= 12500
        'medium'         -> 12500 < weight < 41000
        'large commuter' -> weight >= 41000
    missing or non-numeric weights stay null

    args:
        df: dataframe holding the weight column
        col: name of the weight column to bin

    returns: a copy of `df` with `col` replaced by the labels; `df` itself
        is left unchanged
    '''
    weights = pd.to_numeric(df[col], errors='coerce')

    # start from an all-null label column so missing weights remain null
    labels = pd.Series(float('nan'), index=df.index, dtype=object)
    labels[weights <= 12500] = 'small'
    labels[(weights > 12500) & (weights < 41000)] = 'medium'
    labels[weights >= 41000] = 'large commuter'

    # write the labels into a copy; rebinding a loop variable never changes the frame
    binned = df.copy()
    binned[col] = labels
    return binned

Unique data:  [14100.  6000.  5300. ...  1830.  3305.  2251.]
Value counts:  2300.0     1910
2550.0     1573
3600.0     1371
1600.0     1364
3400.0     1266
           ... 
10294.0       1
615.0         1
10775.0       1
1332.0        1
7952.0        1
Name: cert_max_gr_wt, Length: 1382, dtype: int64
Total nulls:  6805
503500.0 0.0


## KATHLEEN'S SECTION

In [None]:
# run weight_bins on the cert_max_gr_wt column and keep the returned frame as ntsb_2
ntsb_2 = weight_bins(ntsb_1, 'cert_max_gr_wt')

In [None]:
# display the cert_max_gr_wt column of ntsb_2
ntsb_2['cert_max_gr_wt']

0        14100.0
1         6000.0
2         5300.0
3         5121.0
4         4300.0
          ...   
58683        NaN
58684        NaN
58685        NaN
58686        NaN
58687        NaN
Name: cert_max_gr_wt, Length: 58677, dtype: float64

In [None]:
# list the column labels of ntsb_1
ntsb_1.columns

Index(['aircraft_ev_id', 'far_part', 'flight_plan_activated', 'damage',
       'acft_make', 'acft_model', 'cert_max_gr_wt', 'num_eng',
       'type_last_insp', 'date_last_insp', 'afm_hrs_last_insp', 'afm_hrs',
       'type_fly', 'dprt_apt_id', 'dprt_city', 'dprt_state', 'dprt_time',
       'dest_same_local', 'dest_apt_id', 'dest_city', 'dest_state',
       'phase_flt_spec', 'afm_hrs_since', 'rwy_num', 'rwy_len', 'rwy_width',
       'ifr_equipped_cert', 'eng_no', 'eng_type', 'eng_mfgr', 'hp_or_lbs',
       'carb_fuel_injection', 'eng_time_total', 'eng_time_last_insp',
       'ntsb_no', 'ev_type', 'ev_date', 'ev_dow', 'ev_time', 'ev_city',
       'ev_state', 'ev_year', 'ev_month', 'latitude', 'longitude', 'apt_name',
       'ev_nr_apt_id', 'ev_nr_apt_loc', 'apt_dist', 'apt_dir', 'apt_elev',
       'wx_src_iic', 'wx_obs_time', 'wx_obs_dir', 'wx_obs_fac_id',
       'wx_obs_elev', 'wx_obs_dist', 'light_cond', 'sky_cond_nonceil',
       'sky_nonceil_ht', 'sky_ceil_ht', 'sky_cond_ceil', 'vis_

In [None]:
# keep only the columns whose name does not start with 'Unnamed'
ntsb_raw = ntsb_raw.loc[:, ~ntsb_raw.columns.str.contains('^Unnamed')]

In [None]:
# distinct values of the damage column in the raw frame
print(ntsb_raw['damage'].unique())

In [None]:
# distinct values of the type_last_insp column in the raw frame
print(ntsb_raw['type_last_insp'].unique())

In [None]:
# drop the columns that are entirely null; .copy() makes ntsb_rv1 an
# independent frame, so assigning to its columns later does not trigger
# pandas' SettingWithCopyWarning
ntsb_rv1 = ntsb_raw.dropna(axis=1, how='all').copy()
#removed one column
ntsb_rv1.describe(include='all')

In [None]:
# column names, non-null counts and dtypes of ntsb_rv1
ntsb_rv1.info()

In [None]:

# dtype of every column in ntsb_rv1
print(ntsb_rv1.dtypes)

In [None]:
# preview the first rows of ntsb_rv1
ntsb_rv1.head()

In [None]:
# print the whole ev_time column as plain text, without the index
# NOTE(review): this prints every row; a .head() preview may be enough
print(ntsb_rv1[['ev_time']].to_string(index=False)) 

In [None]:
# convert ev_time to a numeric dtype
ntsb_rv1['ev_time'] = pd.to_numeric(ntsb_rv1['ev_time'])

In [None]:
import numpy as np
# cast ev_time to 64-bit integers
# NOTE(review): astype(np.int64) raises if the column still holds NaN - confirm nulls are handled before this cell
ntsb_rv1['ev_time'] = ntsb_rv1['ev_time'].astype(np.int64)

In [None]:
# count the missing values in ev_time
nan_count = ntsb_rv1['ev_time'].isna().sum()
print(nan_count)

In [None]:
# count the entries equal to the literal string 'NaN'
# NOTE(review): this compares against text and does not detect real missing values (.isna() does)
print((ntsb_rv1['ev_time'] == 'NaN').sum())

In [None]:
# left-pad every time with zeros to four characters (military HHMM format)
ntsb_rv1['ev_time'] = ntsb_rv1['ev_time'].map('{0:0>4}'.format)
print(ntsb_rv1[['ev_time']].to_string(index=False))


In [None]:
# parse the zero-padded HHMM strings into datetimes; %M is minutes
# (%m is the month directive and would reject values such as '1430')
ntsb_rv1['ev_time'] = pd.to_datetime(ntsb_rv1['ev_time'], format='%H%M')

In [None]:
# turn off pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# parse the last-inspection dates into datetimes; the deprecated
# infer_datetime_format flag is left out (it only affected parsing speed)
ntsb_rv1['date_last_insp'] = pd.to_datetime(ntsb_rv1['date_last_insp'])


In [None]:
#ntsb_rv2= ntsb_rv1.drop(['Aircraft_Key'], axis=1)

In [None]:
# re-check the column dtypes after the conversions above
print(ntsb_rv1.dtypes)

In [None]:
# parse wx_obs_time as hour + minute (%H%M)
# NOTE(review): presumably needs the same zero-padding as ev_time if the values are stored as numbers - confirm
ntsb_rv1['wx_obs_time'] =  pd.to_datetime(ntsb_rv1['wx_obs_time'], format = "%H%M")

In [None]:
#Walkthrough at https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html
#preprocessing different for categorical vs numerical columns

from sklearn.compose import make_column_selector as selector

# one selector for every non-object column, one for the object (string) columns
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# apply both selectors to get the matching column names
# NOTE(review): these run on ntsb_raw, not on the cleaned ntsb_rv1 / ntsb_1 frames - confirm this is intended
numerical_columns = numerical_columns_selector(ntsb_raw)
categorical_columns = categorical_columns_selector(ntsb_raw)

In [None]:
# the columns to be used to create the labels are: inj_tot_t, damage, crew_inj_level, ev_highest_injury

## ERICK'S SECTION

### CHI'S SECTION