# Cleaning Data With Pandas 📈 📈 📈

In [None]:
import pandas as pd

## NEISS data 


In [None]:
import pandas as pd

In [None]:
# Never truncate long text cells when displaying a DataFrame:
# a max_colwidth of None means "show however many characters there are".
pd.set_option("display.max_colwidth", None)

# csv = comma-separated values, tsv = tab-separated values.
# This file is a TSV, so the separator is "\t" (a tab) instead of a comma.
neiss = pd.read_csv(
    "raw-data/neiss2017_edited.tsv",
    sep="\t",
    encoding="latin-1",
    # treat 0 as a missing value (NaN) wherever it appears
    na_values=[0],
    # read this column as datetimes rather than plain strings
    parse_dates=['Treatment_Date'],
)
neiss.head(2)

If you have a string column and you want to turn it into a datetime, you can do this:

In [None]:
# Convert the column to datetimes and store it back on the DataFrame.
# NOTE(review): read_csv was already given parse_dates=['Treatment_Date'], so
# this is presumably a no-op here, kept to demonstrate the conversion.
neiss['Treatment_Date'] = pd.to_datetime(neiss['Treatment_Date'])

In [None]:
# Count how many rows fall on each date and plot those counts.
# value_counts() sorts by count, not by date, so the plotted line may zigzag;
# calling .sort_index() before .plot() would put it in date order.
neiss['Treatment_Date'].value_counts().plot()

In [None]:
# Plot the raw date values against the row index (this is NOT a count per day).
neiss['Treatment_Date'].plot()

In [None]:
# Turn each date into its weekday name, then count rows per weekday
# (most frequent first).
neiss['Treatment_Date'].dt.day_name().value_counts()

In [None]:
# Same weekday counts, but reset_index() moves the weekday names out of the
# index so the result is a regular DataFrame instead of a Series.
neiss['Treatment_Date'].dt.day_name().value_counts().reset_index()

In [None]:
# Rows per date, plotted. value_counts() orders by count rather than by date.
neiss['Treatment_Date'].value_counts().plot()

In [None]:
# Bucket rows by calendar month of Treatment_Date, count the rows in each
# bucket with size(), and plot the monthly totals.
# NOTE: pandas 2.2+ deprecates the 'M' alias in favour of 'ME' (month end).
neiss.resample('M', on='Treatment_Date').size().plot()

In [None]:
# Summary of every column: name, non-null count and dtype.
neiss.info()

In [None]:
# Weekday name ("Monday", "Tuesday", ...) for every row's treatment date.
neiss['Treatment_Date'].dt.day_name()

## How old is everyone that ends up in the emergency room?

In [None]:
# Options for summarising age: neiss['Age'].mean(), neiss['Age'].median(),
# or a histogram with neiss['Age'].hist()
# NOTE(review): Age presumably still holds text such as "21YO" at this point
# (it is converted to numbers further down), so this is not expected to give
# a usable average yet.
neiss['Age'].mean()

In [None]:
# Show the dtype of every column, to see how pandas is storing Age.
neiss.dtypes

In [None]:
# Look at the first two rows.
neiss.head(2)

### Clean the ages up 

In [None]:
# A single sample age string, for trying the cleanup on plain Python first.
age = "21YO"

Replace YO with nothing and now we just have the age

In [None]:
# Python's str.replace swaps the substring "YO" for the empty string.
age.replace("YO", "")

In [None]:
# With the suffix removed, the remaining digits can be converted to an integer.
int(age.replace("YO", ""))

In [None]:
# string slicing: every character except the last two
age[:-2]

In [None]:
# Series.replace (without .str) is for replacing ENTIRE CELLS OF DATA,
# so this won't work: no cell is exactly "YO", so nothing changes.
neiss['Age'].replace("YO", "")
# A whole-cell replacement looks like this instead:
# neiss['Age'].replace("21YO", "HELLO")

In [None]:
# .str[:-2] slices each string in the column, keeping everything except the
# last two characters; astype(float) then converts the result to numbers.
neiss['Age'].str[:-2].astype(float)
# The same trick can take the LAST two characters of a column:
# df['state_abbrev'] = df['address'].str[-2:]
# "New York, NY" -> "NY"

Let's clean up the age column

In [None]:
# Remove the "YO" suffix from every age, then overwrite the column with the
# numeric (float) version.
ages_without_suffix = neiss['Age'].str.replace("YO", "")
neiss['Age'] = ages_without_suffix.astype(float)

In [None]:
# Check the first rows to confirm Age is now a plain number.
neiss.head()

In [None]:
# Middle value of the numeric Age column.
neiss['Age'].median()

In [None]:
# Histogram of the Age values, to see how they are distributed.
neiss['Age'].hist()

### Um, what is happening with these over 200 year old people?

In [None]:
# Show only the rows whose Age is greater than 200.
# NOTE(review): these look like ages recorded in months as 200 + months
# (e.g. 212 = 12 months) - confirm against the NEISS coding manual.
neiss[neiss['Age'] > 200]

In [None]:
# This would convert months to fractional years for baby rows
# 212 -> (212 - 200) / 12 -> 12 / 12 -> 1 year
# 218 -> (218 - 200) / 12 -> 18 / 12 -> 1.5 years
# neiss.loc[neiss['Age'] > 200, 'Age'] = (neiss['Age'] - 200) / 12

In [None]:
# Keep only rows with an Age below 200, as an independent copy so later
# column assignments do not act on a slice of the old frame.
# Note: the comparison is False for missing (NaN) ages and for exactly 200,
# so those rows are dropped as well.
age_below_200 = neiss['Age'] < 200
neiss = neiss[age_below_200].copy()

In [None]:
# First rows of the filtered data.
neiss.head()

What about the sex of people?

In [None]:
# How many rows carry each Sex code (missing values are not counted).
neiss['Sex'].value_counts()

In [None]:
import numpy as np

# Numeric Sex codes -> readable labels; code 0 is mapped to missing.
sex_labels = {1: 'Male', 2: 'Female', 0: np.nan}
neiss['Sex'] = neiss['Sex'].replace(sex_labels)

# Share of rows per label, with missing values included in the tally.
neiss['Sex'].value_counts(dropna=False, normalize=True)

And race?

In [None]:
# How many rows carry each Race value (missing values are not counted).
neiss['Race'].value_counts()

### Let's look at the narratives

In [None]:
# True/False per row: does the narrative contain "PUNCH"?
# Rows with a missing narrative come back as NaN rather than True/False.
neiss['Narrative_1'].str.contains("PUNCH")

In [None]:
# Display the DataFrame as it currently stands.
neiss

In [None]:
# Without na=False, missing narratives put NaN into the mask and filtering fails:
# ValueError: Cannot mask with non-boolean array containing NA / NaN values
# na=False makes missing narratives count as "does not contain".
neiss[neiss['Narrative_1'].str.contains("PUNCH", na=False)]

In [None]:
# Narratives we want to match:
# JONATHAN SOMA PUNCHED A WALL WITH HIS HAND
# JONATHAN SOMA WAS PUNCHING TEN WALLS WITH
# PUNCHED A WALL
# General shape: PUNCH(SOME OTHER STUFF????)WALL
# As a regular expression: PUNCH.*WALL  (.* = any run of characters)
neiss[neiss['Narrative_1'].str.contains("PUNCH.*WALL", na=False)]

In [None]:
# Count which word follows "PUNCHED A " in the narratives.
# Variations seen in the data:
# PUNCHED A WALL
# PUNCHING A WALL
# PUNCHED THE WALL
# PUNCHED WALL
# (only the first variation is matched by the pattern below)
# . means "all kinds of stuff"
# \w means "word stuff"
# \d means "digit stuff"
# The pattern is a raw string (r"...") so that \w is passed to the regex
# engine untouched; in a normal string "\w" is an invalid Python escape
# sequence and triggers a warning on recent Python versions.
neiss['Narrative_1'].str.extract(r"PUNCHED A (\w*)", expand=False).value_counts()

In [None]:
# Capture everything that follows "PUNCHED A " in each narrative;
# rows without a match come back as NaN.
neiss['Narrative_1'].str.extract("PUNCHED A (.*)", expand=False)

In [None]:
# Final look at the first two rows before saving.
neiss.head(2)

In [None]:
# Final check of column dtypes and non-null counts before saving.
neiss.info()

### Save the data

In [None]:
# export the csv, first making directories that may not exist yet locally
# (os is imported in this cell because no earlier visible cell imports it;
# without the import this line fails with NameError)
import os

# exist_ok=True: do nothing if the directory is already there
os.makedirs('processed', exist_ok=True)

In [None]:
# Write the cleaned data to disk; index=False leaves the row index out of the file.
neiss.to_csv("processed/neiss_cleaned.csv", index=False)