In [1]:
import pandas as pd 
import numpy as np

# Execute the 00-load-raw-data notebook inside this kernel so that the
# pecarn_tbi dataframe it creates is available to the cells below.
%run 00-load-raw-data.ipynb

# Progress banner marking the start of this notebook (printed after the load
# notebook has emitted its own banner).
print("START: 01-data-cleaning.ipynb")

START: 00-load-raw-data.ipynb
  PECARN TBI data read from c:\Jan\Capstone\data/TBI PUD 10-08-2013.csv into "pecarn_tbi" dataframe
START: 01-data-cleaning.ipynb


# Data Cleaning

In [2]:
# Display the dtype of every column of the raw dataframe before any cleaning.
pecarn_tbi.dtypes

AMS            category
AMSAgitated    category
AMSOth         category
AMSRepeat      category
AMSSleep       category
                 ...   
SeizOccur      category
Vomit          category
VomitLast      category
VomitNbr       category
VomitStart     category
Length: 124, dtype: object

In [3]:
# `data` is the working name used by all cleaning steps below. This binds a
# second name to the same object as `pecarn_tbi`; it does not make a copy.
data = pecarn_tbi

## Glasgow Coma Score (GCS)
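- drop records where *GCSTotal* is less than 14 (implemented by keeping only rows with *GCSGroup* = 2)
- drop *GCSGroup*, *GCSTotal*, *GCSEye*, *GCSVerbal* and *GCSMotor*, as they are redundant after filtering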

In [4]:
# Keep only the rows in GCS group 2. The log message equates this with
# removing GCS < 14 -- presumably GCSGroup == 2 is the 14-15 band; confirm
# against the PECARN data dictionary.
print("  Dropping records where GCS < 14")
data = data.loc[data['GCSGroup'] == 2]

# Once the rows are filtered, all GCS-derived columns are removed together.
# (An earlier variant dropped only GCSGroup and imputed GCSEye/GCSVerbal/
# GCSMotor as 4/5/6 where GCSTotal was 15; it is superseded by this drop.)
print("  Dropping GCS columns as they are now redundant")
data = data.drop(
    columns=[
        'GCSGroup',
        'GCSTotal',
        'GCSEye',
        'GCSVerbal',
        'GCSMotor',
    ]
)

Dropping records where GCS < 14
  Dropping GCS columns as they are now redundant


## Age
 - drop *AgeInMonth*, as it and *AgeinYears* are effectively the same
 - rename *AgeinYears* to *Age*
 - remove the *AgeTwoPlus* category, as this is something that a machine learning algorithm should learn (TODO: this drop is currently commented out in the code cell below)

In [5]:
# AgeInMonth is removed; AgeinYears is kept as the single age column.
print("  Dropping AgeInMonth")
data = data.drop(columns=['AgeInMonth'])

# Give the remaining age column a shorter name (rebinding rather than
# mutating the frame in place).
print("  Renaming AgeinYears to Age")
data = data.rename(columns={'AgeinYears': 'Age'})

# NOTE(review): the section notes call for removing AgeTwoPlus, but the drop
# is disabled here -- confirm which behaviour is intended.
#print("  Dropping AgeTwoPlus")
#data = data.drop(columns='AgeTwoPlus')

Dropping AgeInMonth
  Renaming AgeinYears to Age


## Employee Type and Certification
- drop *EmplType* and *Certification* as not relevant (?)

In [6]:
# Remove the EmplType and Certification columns. The section notes mark their
# relevance as an open question, so revisit if they turn out to be needed.
print("  Dropping EmplType")
data = data.drop(columns='EmplType')

# The log message previously said "AgeInMonth" (copy-paste slip); it now names
# the column that is actually dropped.
print("  Dropping Certification")
data = data.drop(columns='Certification')

Dropping EmplType
  Dropping AgeInMonth


## Injury Mechanism
- drop *High_impact_InjSev* as the information is encoded in *InjuryMech*
- TODO consider filtering out *High_impact_InjSev*=1

In [7]:
# High_impact_InjSev is removed; per the section notes its information is
# already encoded in InjuryMech.
print("  Dropping High_impact_InjSev")
data = data.drop(columns=['High_impact_InjSev'])

# Use a more readable column name (rebinding rather than mutating in place).
print("  Renaming InjuryMech to Injury_Mechanism")
data = data.rename(columns={'InjuryMech': 'Injury_Mechanism'})

Dropping High_impact_InjSev
  Renaming InjuryMech to Injury_Mechanism


## Acting Normal
- rename *ActNorm* to *Acting_Normal*
- where *ActNorm* is NaN, assume that abnormal behaviour would have been noted and answered as "No"; missing values are therefore set to "Yes" (1)

In [8]:
# Use a more readable column name (rebinding rather than mutating in place).
print("  Renaming ActNorm to Acting_Normal")
data = data.rename(columns={'ActNorm': 'Acting_Normal'})

# Missing answers are imputed as 1, which the log message labels "Yes".
# The assignment goes through .loc so it writes into `data` itself.
print("  Setting Acting_Normal missing data to 1 (Yes)")
data.loc[data['Acting_Normal'].isnull(), 'Acting_Normal'] = 1

Renaming ActNorm to Acting_Normal
  Setting Acting_Normal missing data to 1 (Yes)


## Findings
- drop Finding## columns

In [18]:
# Remove every column whose name begins with 'Finding', selected with a
# vectorised prefix test on the column index.
print("  Dropping Findings## columns")
data = data.drop(columns=data.columns[data.columns.str.startswith('Finding')])

Dropping Findings## columns


# Column Ordering

In [10]:
# Put the columns into sorted (alphabetical, case-sensitive) name order.
data = data.reindex(columns=sorted(data.columns))

# End

In [11]:
# Final progress message: names the variable that holds the cleaned dataframe.
print('  The cleaned dataset is now available in a dataframe named "data"')

The cleaned dataset is now available in a dataframe named "data"
