# Load data

In [65]:
import pandas

In [66]:
# https://www.cpsc.gov/cgibin/NEISSQuery/UserCriteria.aspx?UserAff=CvbkBwSYvXoJ%2blc0Tfzwdg%3d%3d&UserAffOther=9OYR9kUytIsLilKZieD5xg%3d%3d
# this is latin-1 encoded file - most us government data is latin-1 
fp = 'NEISS.TXT'
df = pandas.read_csv(fp, sep='\t', encoding='latin-1')  # so the original file is in the 'latin-1' encoding, so we must read decode it from this

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
low_memory=False

## Inspect Data

see dimensionality of data

In [68]:
print(df.shape)

(1873129, 19)


see the null values

In [69]:
df.isnull().sum()

CPSC_Case_Number          0
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race          1742962
Body_Part                 1
Diagnosis                 1
Other_Diagnosis     1632142
Disposition               1
Location                  1
Fire_Involvement          1
Product_1                 1
Product_2                 1
Narrative_1               2
Narrative_2          666073
Stratum                   2
PSU                       2
Weight                    2
dtype: int64

quick check for datatypes

In [70]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight
0,120103891,01/02/2012,54,2,1.0,,75.0,59.0,,1.0,1.0,0.0,4057.0,0.0,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE T","ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",S,71.0,69.9872
1,120103905,01/01/2012,35,2,2.0,,85.0,65.0,,1.0,0.0,3.0,1143.0,0.0,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATER","TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",S,71.0,69.9872
2,120103908,01/02/2012,18,1,2.0,,30.0,55.0,,1.0,8.0,0.0,1205.0,0.0,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDER",DISLOCATION,S,71.0,69.9872
3,120103909,01/02/2012,17,1,2.0,,93.0,64.0,,1.0,9.0,0.0,1211.0,0.0,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",,S,71.0,69.9872
4,120103911,01/02/2012,49,1,2.0,,37.0,64.0,,1.0,5.0,0.0,1807.0,0.0,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHEST",CONTUSION,S,71.0,69.9872


# Clean up

In [71]:
# change the floats to ints
# Note: 'Weight' is a float
cols_float_to_int = ['Sex', 'Race', 'Body_Part', 'Diagnosis', 'Disposition', 
                     'Location','Fire_Involvement', 'Product_1', 'Product_2', 'PSU']

In [72]:
for col_name in cols_float_to_int:
    df[col_name] = df[col_name].fillna(999).astype(int)

In [73]:
# converted to ints
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight
0,120103891,01/02/2012,54,2,1,,75,59,,1,1,0,4057,0,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE T","ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",S,71,69.9872
1,120103905,01/01/2012,35,2,2,,85,65,,1,0,3,1143,0,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATER","TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",S,71,69.9872
2,120103908,01/02/2012,18,1,2,,30,55,,1,8,0,1205,0,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDER",DISLOCATION,S,71,69.9872
3,120103909,01/02/2012,17,1,2,,93,64,,1,9,0,1211,0,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",,S,71,69.9872
4,120103911,01/02/2012,49,1,2,,37,64,,1,5,0,1807,0,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHEST",CONTUSION,S,71,69.9872


In [74]:
df.isnull().sum()


CPSC_Case_Number          0
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race          1742962
Body_Part                 0
Diagnosis                 0
Other_Diagnosis     1632142
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Narrative_1               2
Narrative_2          666073
Stratum                   2
PSU                       0
Weight                    2
dtype: int64

In [75]:
df.groupby('Stratum')['Weight'].sum()

Stratum
C    1.829899e+06
L    1.836508e+07
M    1.963897e+07
S    1.905677e+07
V    1.207022e+07
Name: Weight, dtype: float64

In [77]:
#HELP
##drop data with missing values
df.dropna(subset = ['CPSC_Case_Number'])
df.isnull().sum()
# doesnt seem to work?

CPSC_Case_Number          0
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race          1742962
Body_Part                 0
Diagnosis                 0
Other_Diagnosis     1632142
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Narrative_1               2
Narrative_2          666073
Stratum                   2
PSU                       0
Weight                    2
dtype: int64

## feature creation

In [78]:
#get those columns 0 and 2 fixed to a single type!
print(df.shape)

(1873129, 19)


In [79]:
to_drop = ['[a-zA-Z]']
df[~df['CPSC_Case_Number'].isin(to_drop)]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight
0,120103891,01/02/2012,54,2,1,,75,59,,1,1,0,4057,0,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE T","ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",S,71,69.9872
1,120103905,01/01/2012,35,2,2,,85,65,,1,0,3,1143,0,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATER","TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",S,71,69.9872
2,120103908,01/02/2012,18,1,2,,30,55,,1,8,0,1205,0,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDER",DISLOCATION,S,71,69.9872
3,120103909,01/02/2012,17,1,2,,93,64,,1,9,0,1211,0,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",,S,71,69.9872
4,120103911,01/02/2012,49,1,2,,37,64,,1,5,0,1807,0,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHEST",CONTUSION,S,71,69.9872
5,120103912,01/02/2012,67,2,2,,35,53,,1,1,0,4074,0,"67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CON",TUSION,S,71,69.9872
6,120103913,01/02/2012,15,1,2,,32,58,,1,9,0,1211,0,"15YOM, HEMATOMA TO ELBOW WHILE PLAYING FOOTBALL. DX: INFECTED R ELBOW H",EMATOMA,S,71,69.9872
7,120104529,01/01/2012,42,1,0,,34,57,,1,0,0,1842,0,"A 42YOM HAD BEEN DRINKING ETOH, FELL DOWN SEVERAL STAIRS, FX TO WRIST",,V,25,16.2344
8,120104544,01/01/2012,69,2,2,,85,74,,1,0,0,983,0,"A 69YOF CHANGED BODY WASH 2-3 WKS AGO(USES HIGHLY SCENTED SOAP NOW), DE",VELOPED BODY RASH - BODY WASH REACTION,V,25,16.2344
9,120104546,01/01/2012,23,2,2,,75,71,HEADACHE,1,4,0,1141,0,A 23YOF IN LOW SPEED MVA WHEN CASE OF WATER FLEW FROM BACK SEAT AND HIT,"PTS HEAD, DX HEADACHE",V,25,16.2344


In [80]:
print(df.shape)

(1873129, 19)


In [81]:
df.dtypes


CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Narrative_1          object
Narrative_2          object
Stratum              object
PSU                   int64
Weight              float64
dtype: object

In [82]:
pandas.to_numeric('CPSC_Case_Number', errors='ignore')


'CPSC_Case_Number'

In [83]:
pandas.to_numeric('CPSC_Case_Number', errors='coerce')


nan

In [84]:
df.dtypes

CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Narrative_1          object
Narrative_2          object
Stratum              object
PSU                   int64
Weight              float64
dtype: object

In [85]:
narrative_series_1 = df['Narrative_1']
print(type(narrative_series_1))

<class 'pandas.core.series.Series'>


In [86]:
narrative_series_2 = df['Narrative_2']
print(type(narrative_series_2))

<class 'pandas.core.series.Series'>


In [87]:
print(narrative_series_2)

0                          ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN
1           TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD
2                                                                      DISLOCATION
3                                                                              NaN
4                                                                        CONTUSION
5                                                                           TUSION
6                                                                          EMATOMA
7                                                                              NaN
8                                           VELOPED BODY RASH - BODY WASH REACTION
9                                                            PTS HEAD, DX HEADACHE
10                                                                             NaN
11                                                                             NaN
12  

In [88]:
# add trailing space to narr1
def add_a_trailing_space(_str):
    _str = str(_str) + "[\s]"
    return _str

In [89]:
print(narrative_series_1.apply(add_a_trailing_space))

0          54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE T[\s]
1           35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATER[\s]
2          18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDER[\s]
3                  17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN[\s]
4           49YOM, SLIPPED IN WATER ON FLOOR  IN JAIL. DX: R ANKLE SPRAIN, R CHEST[\s]
5          67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CON[\s]
6          15YOM, HEMATOMA TO ELBOW WHILE PLAYING FOOTBALL. DX: INFECTED R ELBOW H[\s]
7            A 42YOM HAD BEEN DRINKING ETOH, FELL DOWN SEVERAL STAIRS, FX TO WRIST[\s]
8          A 69YOF CHANGED BODY WASH 2-3 WKS AGO(USES HIGHLY SCENTED SOAP NOW), DE[\s]
9          A 23YOF IN LOW SPEED MVA WHEN CASE OF WATER FLEW FROM BACK SEAT AND HIT[\s]
10                 A 3YOM'S SOAP WAS CHANGED FROM *** TO ***, TO ER WITH BODY RASH[\s]
11                                         

In [90]:
# remove Nan from narr2

In [91]:
df.Narrative_2 = df.Narrative_2.fillna('')

In [92]:
print(narrative_series_2)

0                          ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN
1           TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD
2                                                                      DISLOCATION
3                                                                                 
4                                                                        CONTUSION
5                                                                           TUSION
6                                                                          EMATOMA
7                                                                                 
8                                           VELOPED BODY RASH - BODY WASH REACTION
9                                                            PTS HEAD, DX HEADACHE
10                                                                                
11                                                                                
12  

In [93]:
#merge Narr1 and Narr2
df["Narrative"] = df["Narrative_1"].map(str) + df["Narrative_2"].map(str)

df.drop(["Narrative_1", "Narrative_2"], axis=1)

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative
0,120103891,01/02/2012,54,2,1,,75,59,,1,1,0,4057,0,S,71,69.9872,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN"
1,120103905,01/01/2012,35,2,2,,85,65,,1,0,3,1143,0,S,71,69.9872,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD"
2,120103908,01/02/2012,18,1,2,,30,55,,1,8,0,1205,0,S,71,69.9872,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION"
3,120103909,01/02/2012,17,1,2,,93,64,,1,9,0,1211,0,S,71,69.9872,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN"
4,120103911,01/02/2012,49,1,2,,37,64,,1,5,0,1807,0,S,71,69.9872,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION"
5,120103912,01/02/2012,67,2,2,,35,53,,1,1,0,4074,0,S,71,69.9872,"67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CONTUSION"
6,120103913,01/02/2012,15,1,2,,32,58,,1,9,0,1211,0,S,71,69.9872,"15YOM, HEMATOMA TO ELBOW WHILE PLAYING FOOTBALL. DX: INFECTED R ELBOW HEMATOMA"
7,120104529,01/01/2012,42,1,0,,34,57,,1,0,0,1842,0,V,25,16.2344,"A 42YOM HAD BEEN DRINKING ETOH, FELL DOWN SEVERAL STAIRS, FX TO WRIST"
8,120104544,01/01/2012,69,2,2,,85,74,,1,0,0,983,0,V,25,16.2344,"A 69YOF CHANGED BODY WASH 2-3 WKS AGO(USES HIGHLY SCENTED SOAP NOW), DEVELOPED BODY RASH - BODY WASH REACTION"
9,120104546,01/01/2012,23,2,2,,75,71,HEADACHE,1,4,0,1141,0,V,25,16.2344,"A 23YOF IN LOW SPEED MVA WHEN CASE OF WATER FLEW FROM BACK SEAT AND HITPTS HEAD, DX HEADACHE"


In [94]:
print(list(df.columns.values))

['CPSC_Case_Number', 'Treatment_Date', 'Age', 'Sex', 'Race', 'Other_Race', 'Body_Part', 'Diagnosis', 'Other_Diagnosis', 'Disposition', 'Location', 'Fire_Involvement', 'Product_1', 'Product_2', 'Narrative_1', 'Narrative_2', 'Stratum', 'PSU', 'Weight', 'Narrative']


In [95]:
narrative_series = df['Narrative']
print(type(narrative_series)) 

<class 'pandas.core.series.Series'>


In [96]:
pandas.options.display.max_colwidth = 200

In [97]:
print(narrative_series)

0                          54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN
1            35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD
2                                                                      18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION
3                                                                                         17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN
4                                                                         49YOM, SLIPPED IN WATER ON FLOOR  IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION
5                                                                           67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CONTUSION
6                                                                          1

In [98]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative
0,120103891,01/02/2012,54,2,1,,75,59,,1,1,0,4057,0,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE T","ABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",S,71,69.9872,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN"
1,120103905,01/01/2012,35,2,2,,85,65,,1,0,3,1143,0,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATER","TO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",S,71,69.9872,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD"
2,120103908,01/02/2012,18,1,2,,30,55,,1,8,0,1205,0,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDER",DISLOCATION,S,71,69.9872,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION"
3,120103909,01/02/2012,17,1,2,,93,64,,1,9,0,1211,0,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",,S,71,69.9872,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN"
4,120103911,01/02/2012,49,1,2,,37,64,,1,5,0,1807,0,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHEST",CONTUSION,S,71,69.9872,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION"


In [99]:
# Delete the narrtive single colmns from the dataframe
del df['Narrative_1']
del df['Narrative_2']

In [100]:
# convert treatment_date to datetime

df['Treatment_Date'] = pandas.to_datetime(df['Treatment_Date'], errors='coerce')


In [101]:
#Add day of week

df['Day_of_week'] = df['Treatment_Date'].dt.weekday_name

In [102]:
#check null day of week and datetime
df.dtypes


CPSC_Case_Number            object
Treatment_Date      datetime64[ns]
Age                         object
Sex                          int64
Race                         int64
Other_Race                  object
Body_Part                    int64
Diagnosis                    int64
Other_Diagnosis             object
Disposition                  int64
Location                     int64
Fire_Involvement             int64
Product_1                    int64
Product_2                    int64
Stratum                     object
PSU                          int64
Weight                     float64
Narrative                   object
Day_of_week                 object
dtype: object

In [103]:
df[pandas.to_numeric(df['CPSC_Case_Number'], errors='coerce').notnull()]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,120103891,2012-01-02,54,2,1,,75,59,,1,1,0,4057,0,S,71,69.9872,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",Monday
1,120103905,2012-01-01,35,2,2,,85,65,,1,0,3,1143,0,S,71,69.9872,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",Sunday
2,120103908,2012-01-02,18,1,2,,30,55,,1,8,0,1205,0,S,71,69.9872,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION",Monday
3,120103909,2012-01-02,17,1,2,,93,64,,1,9,0,1211,0,S,71,69.9872,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",Monday
4,120103911,2012-01-02,49,1,2,,37,64,,1,5,0,1807,0,S,71,69.9872,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION",Monday
5,120103912,2012-01-02,67,2,2,,35,53,,1,1,0,4074,0,S,71,69.9872,"67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CONTUSION",Monday
6,120103913,2012-01-02,15,1,2,,32,58,,1,9,0,1211,0,S,71,69.9872,"15YOM, HEMATOMA TO ELBOW WHILE PLAYING FOOTBALL. DX: INFECTED R ELBOW HEMATOMA",Monday
7,120104529,2012-01-01,42,1,0,,34,57,,1,0,0,1842,0,V,25,16.2344,"A 42YOM HAD BEEN DRINKING ETOH, FELL DOWN SEVERAL STAIRS, FX TO WRIST",Sunday
8,120104544,2012-01-01,69,2,2,,85,74,,1,0,0,983,0,V,25,16.2344,"A 69YOF CHANGED BODY WASH 2-3 WKS AGO(USES HIGHLY SCENTED SOAP NOW), DEVELOPED BODY RASH - BODY WASH REACTION",Sunday
9,120104546,2012-01-01,23,2,2,,75,71,HEADACHE,1,4,0,1141,0,V,25,16.2344,"A 23YOF IN LOW SPEED MVA WHEN CASE OF WATER FLEW FROM BACK SEAT AND HITPTS HEAD, DX HEADACHE",Sunday


In [104]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,120103891,2012-01-02,54,2,1,,75,59,,1,1,0,4057,0,S,71,69.9872,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",Monday
1,120103905,2012-01-01,35,2,2,,85,65,,1,0,3,1143,0,S,71,69.9872,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",Sunday
2,120103908,2012-01-02,18,1,2,,30,55,,1,8,0,1205,0,S,71,69.9872,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION",Monday
3,120103909,2012-01-02,17,1,2,,93,64,,1,9,0,1211,0,S,71,69.9872,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",Monday
4,120103911,2012-01-02,49,1,2,,37,64,,1,5,0,1807,0,S,71,69.9872,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION",Monday


In [105]:
df.dtypes


CPSC_Case_Number            object
Treatment_Date      datetime64[ns]
Age                         object
Sex                          int64
Race                         int64
Other_Race                  object
Body_Part                    int64
Diagnosis                    int64
Other_Diagnosis             object
Disposition                  int64
Location                     int64
Fire_Involvement             int64
Product_1                    int64
Product_2                    int64
Stratum                     object
PSU                          int64
Weight                     float64
Narrative                   object
Day_of_week                 object
dtype: object

In [106]:
df.groupby('Age').count()

Unnamed: 0_level_0,CPSC_Case_Number,Treatment_Date,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative,Day_of_week
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,125,125,125,125,5,125,125,17,125,125,125,125,125,125,125,125,125,125
2,62933,62933,62933,62933,6614,62933,62933,2904,62933,62933,62933,62933,62933,62933,62933,62933,62933,62933
3,52924,52924,52924,52924,5482,52924,52924,2552,52924,52924,52924,52924,52924,52924,52924,52924,52924,52924
4,44692,44692,44692,44692,4715,44692,44692,2264,44692,44692,44692,44692,44692,44692,44692,44692,44692,44692
5,40701,40701,40701,40701,4351,40701,40701,2421,40701,40701,40701,40701,40701,40701,40701,40701,40701,40701
6,37687,37687,37687,37687,3918,37687,37687,2619,37687,37687,37687,37687,37687,37687,37687,37687,37687,37687
7,36016,36016,36016,36016,3641,36016,36016,2934,36016,36016,36016,36016,36016,36016,36016,36016,36016,36016
8,35822,35822,35822,35822,3392,35822,35822,3379,35822,35822,35822,35822,35822,35822,35822,35822,35822,35822
9,37358,37358,37358,37358,3431,37358,37358,3799,37358,37358,37358,37358,37358,37358,37358,37358,37358,37358
10,39691,39691,39691,39691,3627,39691,39691,4221,39691,39691,39691,39691,39691,39691,39691,39691,39691,39691


In [107]:
to_drop = ['L', 'V']
df[~df['Age'].isin(to_drop)]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,120103891,2012-01-02,54,2,1,,75,59,,1,1,0,4057,0,S,71,69.9872,"54YOF, GOT UP AT HOME TO GO TO BATHROOM, FELL AND HIT HEAD ON BEDSIDE TABLE. DX: LACERATION TO SCALP, HEAD TRAUMA, NECK STRAIN",Monday
1,120103905,2012-01-01,35,2,2,,85,65,,1,0,3,1143,0,S,71,69.9872,"35YOF, SMOKE INHALATION & SHOULDER PAIN FROM CARRYING BUCKETS OF WATERTO GRASS FIRE. DX: SMOKE INHALATION, R SHOULDER PAIN, UNKNOWN ABOUT FD",Sunday
2,120103908,2012-01-02,18,1,2,,30,55,,1,8,0,1205,0,S,71,69.9872,"18YOM, HURT SHOULDER WHILE PLAYING BASKETBALL AT SCHOOL. DX: R SHOULDERDISLOCATION",Monday
3,120103909,2012-01-02,17,1,2,,93,64,,1,9,0,1211,0,S,71,69.9872,"17YOM, STEPPED IN HOLE WHILE PLAYING FOOTBALL. DX: R TOE SPRAIN",Monday
4,120103911,2012-01-02,49,1,2,,37,64,,1,5,0,1807,0,S,71,69.9872,"49YOM, SLIPPED IN WATER ON FLOOR IN JAIL. DX: R ANKLE SPRAIN, R CHESTCONTUSION",Monday
5,120103912,2012-01-02,67,2,2,,35,53,,1,1,0,4074,0,S,71,69.9872,"67YOF, CHAIR COLLASPED AFTER PATIENT SITTING DOWN IN IT. DX: R KNEE CONTUSION",Monday
6,120103913,2012-01-02,15,1,2,,32,58,,1,9,0,1211,0,S,71,69.9872,"15YOM, HEMATOMA TO ELBOW WHILE PLAYING FOOTBALL. DX: INFECTED R ELBOW HEMATOMA",Monday
7,120104529,2012-01-01,42,1,0,,34,57,,1,0,0,1842,0,V,25,16.2344,"A 42YOM HAD BEEN DRINKING ETOH, FELL DOWN SEVERAL STAIRS, FX TO WRIST",Sunday
8,120104544,2012-01-01,69,2,2,,85,74,,1,0,0,983,0,V,25,16.2344,"A 69YOF CHANGED BODY WASH 2-3 WKS AGO(USES HIGHLY SCENTED SOAP NOW), DEVELOPED BODY RASH - BODY WASH REACTION",Sunday
9,120104546,2012-01-01,23,2,2,,75,71,HEADACHE,1,4,0,1141,0,V,25,16.2344,"A 23YOF IN LOW SPEED MVA WHEN CASE OF WATER FLEW FROM BACK SEAT AND HITPTS HEAD, DX HEADACHE",Sunday


In [108]:
df.groupby('Age').count()

Unnamed: 0_level_0,CPSC_Case_Number,Treatment_Date,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,Location,Fire_Involvement,Product_1,Product_2,Stratum,PSU,Weight,Narrative,Day_of_week
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,125,125,125,125,5,125,125,17,125,125,125,125,125,125,125,125,125,125
2,62933,62933,62933,62933,6614,62933,62933,2904,62933,62933,62933,62933,62933,62933,62933,62933,62933,62933
3,52924,52924,52924,52924,5482,52924,52924,2552,52924,52924,52924,52924,52924,52924,52924,52924,52924,52924
4,44692,44692,44692,44692,4715,44692,44692,2264,44692,44692,44692,44692,44692,44692,44692,44692,44692,44692
5,40701,40701,40701,40701,4351,40701,40701,2421,40701,40701,40701,40701,40701,40701,40701,40701,40701,40701
6,37687,37687,37687,37687,3918,37687,37687,2619,37687,37687,37687,37687,37687,37687,37687,37687,37687,37687
7,36016,36016,36016,36016,3641,36016,36016,2934,36016,36016,36016,36016,36016,36016,36016,36016,36016,36016
8,35822,35822,35822,35822,3392,35822,35822,3379,35822,35822,35822,35822,35822,35822,35822,35822,35822,35822
9,37358,37358,37358,37358,3431,37358,37358,3799,37358,37358,37358,37358,37358,37358,37358,37358,37358,37358
10,39691,39691,39691,39691,3627,39691,39691,4221,39691,39691,39691,39691,39691,39691,39691,39691,39691,39691


In [109]:
df.Age.unique()

array([54, 35, 18, 17, 49, 67, 15, 42, 69, 23, 3, 82, 22, 222, 61, 21, 44,
       36, 29, 58, 40, 28, 78, 80, 73, 39, 11, 5, 33, 65, 220, 45, 34, 76,
       212, 16, 74, 25, 20, 7, 52, 48, 204, 4, 2, 216, 214, 32, 8, 68, 38,
       30, 27, 88, 56, 26, 53, 207, 24, 6, 63, 59, 70, 31, 89, 51, 57, 10,
       205, 9, 37, 46, 75, 85, 14, 210, 87, 72, 43, 77, 215, 101, 62, 66,
       90, 19, 55, 13, 91, 12, 60, 218, 223, 47, 217, 84, 41, 50, 213, 71,
       103, 219, 221, 86, 211, 201, 79, 96, 83, 81, 209, 64, 95, 92, 94,
       203, 202, 100, 98, 97, 93, 208, 206, 102, 99, 106, 0, 107, 105,
       104, 109, '56', '85', '9', '65', '4', '32', '21', '75', '55', '11',
       '89', '8', '48', '51', '63', '20', '81', '57', '71', '54', '67',
       '5', '22', '35', '15', '72', '17', '79', '14', '24', '90', '29',
       '16', '78', '220', '201', '43', '52', '61', '12', '204', '46',
       '216', '41', '215', '40', '53', '83', '25', '10', '28', '73', '31',
       '37', '221', '66', '13', '38', '68',

In [110]:
df.to_csv('Neiss_NLP_processed_0.csv', index=False)

### Save a sanitized dataset for further work


In [111]:
import codecs
#sanitize charactures
input_file = codecs.open("Neiss_NLP_processed_0.csv", "r",encoding='latin-1', errors='replace')
output_file = open("Neiss_NLP_processed_1.csv", "w")

def sanitize_characters(raw, clean):    
    for line in input_file:
        out = line
        output_file.write(line)
sanitize_characters(input_file, output_file)

In [112]:
import pandas
fp = 'Neiss_NLP_processed_1.csv'
df = pandas.read_csv(fp, encoding='latin-1')  # so the original file is in the 'latin-1' encoding, so we must read decode it from this

  interactivity=interactivity, compiler=compiler, result=result)


In [113]:
df.dtypes

CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Stratum              object
PSU                   int64
Weight              float64
Narrative            object
Day_of_week          object
dtype: object

In [5]:
# Gotta come back and fix Age to Int and Cass number to Int, that Case number hsass one bad entry...

In [6]:
import numpy as np


In [8]:
#df[~df.applymap(np.isreal).all("CPSC_Case_Number")]

ValueError: No axis named CPSC_Case_Number for object type <class 'pandas.core.frame.DataFrame'>

In [9]:
df.dtypes

CPSC_Case_Number     object
Treatment_Date       object
Age                  object
Sex                   int64
Race                  int64
Other_Race           object
Body_Part             int64
Diagnosis             int64
Other_Diagnosis      object
Disposition           int64
Location              int64
Fire_Involvement      int64
Product_1             int64
Product_2             int64
Narrative_1          object
Narrative_2          object
Stratum              object
PSU                   int64
Weight              float64
Narrative            object
Day_of_week          object
dtype: object

In [44]:
df.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Body_Part,Diagnosis,Other_Diagnosis,Disposition,...,Fire_Involvement,Product_1,Product_2,Narrative_1,Narrative_2,Stratum,PSU,Weight,Narrative,Day_of_week
0,70100518,2007-01-01,10,2,1,,92,57,,1,...,0,1211,0,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,L SUSTAINING A FRACTURED RIGHT PINKY FINGER,V,61,15.1766,10 YOF WAS PLAYING FOOTBALL IN DAD'S BACKYARD ...,Monday
1,70100520,2007-01-01,43,2,1,,83,64,,1,...,0,1842,0,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,,V,61,15.1766,43 YOF FELL ON STEPS SUSTAINING A SPRAINED RIG...,Monday
2,70100521,2007-01-01,85,2,1,,79,53,,1,...,0,1864,0,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,,V,61,15.1766,85 YOF HIT LOWER BACK AGAINST COUNTER SUSTAINI...,Monday
3,70100523,2007-01-01,45,2,1,,94,71,TRAUMA EAR,1,...,0,1710,0,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,NG TRAUMA TO EAR CANAL,V,61,15.1766,45 YOF WAS CLEANING RIGHT EAR WITH A *** AND P...,Monday
4,70100525,2007-01-01,212,1,2,,92,59,,1,...,0,4074,0,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,,V,61,15.1766,12 MOM LACERATED LEFT INDEX FINGER IN HOLE OF ...,Monday
