# Load data

In [1]:
import pandas

In [2]:
# https://www.cpsc.gov/cgibin/NEISSQuery/UserCriteria.aspx?UserAff=CvbkBwSYvXoJ%2blc0Tfzwdg%3d%3d&UserAffOther=9OYR9kUytIsLilKZieD5xg%3d%3d
# this is latin-1 encoded file - most us government data is latin-1 
fp = 'NEISS.TXT'
df = pandas.read_csv(fp, sep='\t', encoding='latin-1')  # so the original file is in the 'latin-1' encoding, so we must read decode it from this

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
low_memory=False

## Inspect Data

see dimensionality of data

In [4]:
print(df.shape)

(3811389, 19)


see the null values

In [6]:
df.isnull().sum()

CPSC_Case_Number          1
Treatment_Date            0
Age                       0
Sex                       0
Race                      1
Other_Race          3501320
Body_Part                 4
Diagnosis                 4
Other_Diagnosis     3392466
Disposition               4
Location                  4
Fire_Involvement          4
Product_1                 4
Product_2                 4
Narrative_1               6
Narrative_2         1467893
Stratum                   8
PSU                       8
Weight                    8
dtype: int64

quick check for datatypes

In [13]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight
0,70100518,01/01/2007,10,2.0,1.0,,92.0,57.0,,1.0,1.0,0.0,1211.0,0.0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61.0,15.1766
1,70100520,01/01/2007,43,2.0,1.0,,83.0,64.0,,1.0,1.0,0.0,1842.0,0.0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61.0,15.1766
2,70100521,01/01/2007,85,2.0,1.0,,79.0,53.0,,1.0,1.0,0.0,1864.0,0.0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61.0,15.1766
3,70100523,01/01/2007,45,2.0,1.0,,94.0,71.0,TRAUMA EAR,1.0,1.0,0.0,1710.0,0.0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61.0,15.1766
4,70100525,01/01/2007,212,1.0,2.0,,92.0,59.0,,1.0,1.0,0.0,4074.0,0.0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61.0,15.1766


# Clean up

In [17]:
# change the floats to ints
# Note: 'Weight' is a float
cols_float_to_int = ['Sex', 'Race', 'Body_Part', 'Diagnosis', 'Disposition', 
                     'Location','Fire_Involvement', 'Product_1', 'Product_2', 'PSU']

In [18]:
for col_name in cols_float_to_int:
    df[col_name] = df[col_name].fillna(999).astype(int)

In [19]:
# converted to ints
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight
0,70100518,01/01/2007,10,2,1,,92,57,,1,1,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766
1,70100520,01/01/2007,43,2,1,,83,64,,1,1,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766
2,70100521,01/01/2007,85,2,1,,79,53,,1,1,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766
3,70100523,01/01/2007,45,2,1,,94,71,TRAUMA EAR,1,1,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766
4,70100525,01/01/2007,212,1,2,,92,59,,1,1,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766


In [20]:
df.isnull().sum()


CPSC_Case_Number          1
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race          3501320
Body_Part                 0
Diagnosis                 0
Other_Diagnosis     3392466
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Narrative_1               6
Narrative_2         1467893
Stratum                   8
PSU                       0
Weight                    8
dtype: int64

In [21]:
df.groupby('Stratum')['Weight'].sum()

Stratum
C    3.790573e+06
L    3.494865e+07
M    3.759190e+07
S    4.008357e+07
V    2.405878e+07
Name: Weight, dtype: float64

In [23]:
#HELP
##drop data with missing values
df.dropna(subset = ['CPSC_Case_Number'])
df.isnull().sum()
# doesnt seem to work?

CPSC_Case_Number          1
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race          3501320
Body_Part                 0
Diagnosis                 0
Other_Diagnosis     3392466
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Narrative_1               6
Narrative_2         1467893
Stratum                   8
PSU                       0
Weight                    8
dtype: int64

## feature creation

In [24]:
#merge Narr1 and Narr2
df["Narrative"] = df["Narrative_1"].map(str) + df["Narrative_2"].map(str)

df.drop(["Narrative_1", "Narrative_2"], axis=1)

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative
0,70100518,01/01/2007,10,2,1,,92,57,,1,1,0,1211,0,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...
1,70100520,01/01/2007,43,2,1,,83,64,,1,1,0,1842,0,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...
2,70100521,01/01/2007,85,2,1,,79,53,,1,1,0,1864,0,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...
3,70100523,01/01/2007,45,2,1,,94,71,TRAUMA EAR,1,1,0,1710,0,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...
4,70100525,01/01/2007,212,1,2,,92,59,,1,1,0,4074,0,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...
5,70100526,01/01/2007,33,1,2,,82,59,,1,1,0,4004,0,V,61,15.1766,33 YOM PUNCHED A MIRROR LACERATING RIGHT HANDnan
6,70100528,01/01/2007,6,1,3,UNKNOWN,76,59,,1,1,0,4076,0,V,61,15.1766,6 YOM WAS PLAYING WITH BROTHER ON BED AND HIT ...
7,70100529,01/01/2007,46,2,1,,76,59,,1,1,0,4056,0,V,61,15.1766,46 YOF TRIPPED AND FELL HITTING FOREHEAD ON ED...
8,70100530,01/01/2007,39,1,1,,77,53,,1,1,0,899,0,V,61,15.1766,39 YOM WAS GRINDING METAL ON FRIDAY AND GOT SO...
9,70100532,01/01/2007,29,1,2,,92,59,,1,1,0,464,0,V,61,15.1766,29 YOM LACERATED LEFT INDEX FINGER WITH A KNIF...


In [28]:
print(list(df.columns.values))

['CPSC_Case_Number', 'Treatment_Date', 'Age', 'Sex', 'Race', 'Other_Race', 'Body_Part', 'Diagnosis', 'Other_Diagnosis', 'Disposition', 'Location', 'Fire_Involvement', 'Product_1', 'Product_2', 'Narrative_1', 'Narrative_2', 'Stratum', 'PSU', 'Weight', 'Narrative']


In [29]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative
0,70100518,01/01/2007,10,2,1,,92,57,,1,1,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...
1,70100520,01/01/2007,43,2,1,,83,64,,1,1,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...
2,70100521,01/01/2007,85,2,1,,79,53,,1,1,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...
3,70100523,01/01/2007,45,2,1,,94,71,TRAUMA EAR,1,1,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...
4,70100525,01/01/2007,212,1,2,,92,59,,1,1,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...


In [30]:
# convert treatment_date to datetime

df['Treatment_Date'] = pandas.to_datetime(df['Treatment_Date'], errors='coerce')


In [31]:
#Add day of week

df['Day_of_week'] = df['Treatment_Date'].dt.weekday_name

In [32]:
#check null day of week and datetime
df.isnull().sum()

CPSC_Case_Number          1
Treatment_Date            4
Age                       0
Sex                       0
Race                      0
Other_Race          3501320
Body_Part                 0
Diagnosis                 0
Other_Diagnosis     3392466
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Narrative_1               6
Narrative_2         1467893
Stratum                   8
PSU                       0
Weight                    8
Narrative                 0
Day_of_week               4
dtype: int64

In [34]:
df[pandas.to_numeric(df['CPSC_Case_Number'], errors='coerce').notnull()]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,...,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,70100518,2007-01-01,10,2,1,,92,57,,1,...,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,Monday
1,70100520,2007-01-01,43,2,1,,83,64,,1,...,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,Monday
2,70100521,2007-01-01,85,2,1,,79,53,,1,...,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,Monday
3,70100523,2007-01-01,45,2,1,,94,71,TRAUMA EAR,1,...,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,Monday
4,70100525,2007-01-01,212,1,2,,92,59,,1,...,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,Monday
5,70100526,2007-01-01,33,1,2,,82,59,,1,...,0,4004,0,33 YOM PUNCHED A MIRROR LACERATING RIGHT HAND,,V,61,15.1766,33 YOM PUNCHED A MIRROR LACERATING RIGHT HANDnan,Monday
6,70100528,2007-01-01,6,1,3,UNKNOWN,76,59,,1,...,0,4076,0,6 YOM WAS PLAYING WITH BROTHER ON BED AND HIT ...,LACERATION TO EDGE OF RIGHT UPPER EYELID,V,61,15.1766,6 YOM WAS PLAYING WITH BROTHER ON BED AND HIT ...,Monday
7,70100529,2007-01-01,46,2,1,,76,59,,1,...,0,4056,0,46 YOF TRIPPED AND FELL HITTING FOREHEAD ON ED...,A LACERATION,V,61,15.1766,46 YOF TRIPPED AND FELL HITTING FOREHEAD ON ED...,Monday
8,70100530,2007-01-01,39,1,1,,77,53,,1,...,0,899,0,39 YOM WAS GRINDING METAL ON FRIDAY AND GOT SO...,AINING A CORNEAL ABRASION,V,61,15.1766,39 YOM WAS GRINDING METAL ON FRIDAY AND GOT SO...,Monday
9,70100532,2007-01-01,29,1,2,,92,59,,1,...,0,464,0,29 YOM LACERATED LEFT INDEX FINGER WITH A KNIF...,E FROZEN HAMBURGER PATTIES,V,61,15.1766,29 YOM LACERATED LEFT INDEX FINGER WITH A KNIF...,Monday


In [37]:
# replace commas with spaces
# i think i can do this later
#df['Narrative'] = df['Narrative'].str.replace(r"[,]", ' ')

In [38]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,...,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,70100518,2007-01-01,10,2,1,,92,57,,1,...,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,Monday
1,70100520,2007-01-01,43,2,1,,83,64,,1,...,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,Monday
2,70100521,2007-01-01,85,2,1,,79,53,,1,...,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,Monday
3,70100523,2007-01-01,45,2,1,,94,71,TRAUMA EAR,1,...,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,Monday
4,70100525,2007-01-01,212,1,2,,92,59,,1,...,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,Monday


In [39]:
df.to_csv('Neiss_NLP_processed_0.csv', index=False)

### Save a sanitized dataset for further work


In [41]:
import codecs
#sanitize charactures
input_file = codecs.open("Neiss_NLP_processed_0.csv", "r",encoding='latin-1', errors='replace')
output_file = open("Neiss_NLP_processed_1.csv", "w")

def sanitize_characters(raw, clean):    
    for line in input_file:
        out = line
        output_file.write(line)
sanitize_characters(input_file, output_file)

In [2]:
import pandas
fp = 'Neiss_NLP_processed_1.csv'
df = pandas.read_csv(fp, encoding='latin-1')  # so the original file is in the 'latin-1' encoding, so we must read decode it from this

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.dtypes

CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Narrative_1          object
Narrative_2          object
Stratum              object
PSU                   int64
Weight              float64
Narrative            object
Day_of_week          object
dtype: object

In [5]:
# Gotta come back and fix Age to Int and Cass number to Int, that Case number hsass one bad entry...

In [6]:
import numpy as np


In [8]:
#df[~df.applymap(np.isreal).all("CPSC_Case_Number")]

ValueError: No axis named CPSC_Case_Number for object type <class 'pandas.core.frame.DataFrame'>

In [9]:
df.dtypes

CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Narrative_1          object
Narrative_2          object
Stratum              object
PSU                   int64
Weight              float64
Narrative            object
Day_of_week          object
dtype: object

In [44]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,...,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,70100518,2007-01-01,10,2,1,,92,57,,1,...,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,Monday
1,70100520,2007-01-01,43,2,1,,83,64,,1,...,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,Monday
2,70100521,2007-01-01,85,2,1,,79,53,,1,...,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,Monday
3,70100523,2007-01-01,45,2,1,,94,71,TRAUMA EAR,1,...,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,Monday
4,70100525,2007-01-01,212,1,2,,92,59,,1,...,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,Monday
