# Pandas Project - Data Wrangling

In [1]:
import os
import pandas as pd

In [2]:
wd = os.getcwd() # store working dir
dataFile = wd + '/GSAF5.csv'
print(dataFile)

/home/fred/Ironhack/datamia0619/module-1/pandas-project/your-code/GSAF5.csv


In [3]:
data = pd.read_csv(dataFile, encoding = "ISO-8859-1", engine='python')


In [None]:
data.head()

In [5]:
# Identify and sum up missing values existing in each column

null_cols = data.isnull().sum()

In [6]:
null_cols

Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [7]:
# list all cols missing greater than 0 vals

null_cols[null_cols > 0] 

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [8]:
# drop cols setting 5000 missing values as threshold

threshold = 5000
drop_cols = list(null_cols[null_cols > threshold].index)
print("The following columns will be dropped from the data set:\n")
for col in drop_cols:
    print('{} \n\t --contains {} null values--'.format(col, null_cols[col]))

The following columns will be dropped from the data set:

Unnamed: 22 
	 --contains 5991 null values--
Unnamed: 23 
	 --contains 5990 null values--


In [9]:
# Drop columns containing > 5000 missing values

data =  data.drop(drop_cols, axis = 1)

# re-visit header to examine changes

data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


In [10]:
# column containing same values for case number appears three times in data. Two are dropped

data = data.drop(['Case Number.1', 'Case Number.2'], axis = 1)

# href and href contain same values, href formula and pdf column (containing case number and filename) are dropped

data = data.drop(['href formula', 'pdf'], axis = 1)

# column href renamed to pdf

data.rename(columns = {'href':'pdf_href'}, inplace=True)


In [11]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf_href,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5989


In [12]:
# Locate and remove rows containing 'invalid' under col 'Type'

data = data[data['Type'] !='Invalid']

In [13]:
data['Day'], data['Month'], data['Year'] = data['Date'].str.split('-', 2).str
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf_href,original order,Day,Month
0,2016.09.18.c,18-Sep-16,16,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5993,18,Sep
1,2016.09.18.b,18-Sep-16,16,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5992,18,Sep
2,2016.09.18.a,18-Sep-16,16,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5991,18,Sep
3,2016.09.17,17-Sep-16,16,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5990,17,Sep
4,2016.09.15,16-Sep-16,16,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5989,16,Sep


In [14]:
# locate and replace NaN

data = data.fillna(0)
data = data.drop(['Date'], axis = 1)

In [15]:
null_cols = data.isnull().sum()
null_cols


Case Number               0
Year                      0
Type                      0
Country                   0
Area                      0
Location                  0
Activity                  0
Name                      0
Sex                       0
Age                       0
Injury                    0
Fatal (Y/N)               0
Time                      0
Species                   0
Investigator or Source    0
pdf_href                  0
original order            0
Day                       0
Month                     0
dtype: int64

In [16]:
# Reorder columns

column_order = ['Year', 'Month', 'Day', 'Injury', 'Fatal (Y/N)', 'Name', 'Time','Country', 'Location', 'Area', 'Sex ', 'Age', 'Activity',
                'Type', 'Case Number', 'Investigator or Source', 'pdf_href', 'original order', 'Species ']
data = data[column_order]


In [17]:
data.head()

Unnamed: 0,Year,Month,Day,Injury,Fatal (Y/N),Name,Time,Country,Location,Area,Sex,Age,Activity,Type,Case Number,Investigator or Source,pdf_href,original order,Species
0,16,Sep,18,Minor injury to thigh,N,male,13h00,USA,"New Smyrna Beach, Volusia County",Florida,M,16,Surfing,Unprovoked,2016.09.18.c,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5993,0
1,16,Sep,18,Lacerations to hands,N,Chucky Luciano,11h00,USA,"New Smyrna Beach, Volusia County",Florida,M,36,Surfing,Unprovoked,2016.09.18.b,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5992,0
2,16,Sep,18,Lacerations to lower leg,N,male,10h43,USA,"New Smyrna Beach, Volusia County",Florida,M,43,Surfing,Unprovoked,2016.09.18.a,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5991,0
3,16,Sep,17,Struck by fin on chest & leg,N,Rory Angiolella,0,AUSTRALIA,Thirteenth Beach,Victoria,M,0,Surfing,Unprovoked,2016.09.17,"The Age, 9/18/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5990,0
4,16,Sep,16,No injury: Knocked off board by shark,N,male,0,AUSTRALIA,Bells Beach,Victoria,M,0,Surfing,Unprovoked,2016.09.15,"The Age, 9/16/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5989,2 m shark


In [18]:
# filtering the table even further...

filtered = data[(data['Age'] != 0) & (data['Time'] != 0) & (data['Species '] != 0) 
                & (data['Location'] != 0) & (data['Area'] !=0) & (data['Sex '] != 0)]
filtered.loc[data['Sex '] == 'lli', 'Sex '] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
filtered.head()

Unnamed: 0,Year,Month,Day,Injury,Fatal (Y/N),Name,Time,Country,Location,Area,Sex,Age,Activity,Type,Case Number,Investigator or Source,pdf_href,original order,Species
6,16,Sep,11,Minor injury to arm,N,male,15h15,USA,"Ponte Vedra, St. Johns County",Florida,M,60s,Wading,Unprovoked,2016.09.11,"News4Jax, 9/11/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5987,3' to 4' shark
7,16,Sep,7,Severe lacerations to shoulder & forearm,N,female,14h30,USA,"Makaha, Oahu",Hawaii,F,51,Swimming,Unprovoked,2016.09.07,"Hawaii News Now, 9/7/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5986,"Tiger shark, 10?"
13,16,Aug,29,Lacerations to right foot,N,Sam Cumiskey,15h00,USA,"New Smyrna Beach, Volusia County",Florida,M,25,Surfing,Unprovoked,2016.08.29.b,"News Channel 8, 8/30/16",http://sharkattackfile.net/spreadsheets/pdf_di...,5980,"Bull shark, 6'"
20,16,Jul,29,Lacerations to right hand,N,male,11h30,SPAIN,Arenales del Sol,Alicante Province,M,40,Swimming,Unprovoked,2016.07.29,"Informacion.es, 7/29/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5973,Blue shark
28,16,Jul,20,Laceration to left calf from hooked shark PROV...,N,Scott van Burck,After noon,AUSTRALIA,"20 k off The Spit, off the Gold Coast",Queensland,M,31,Fishing,Provoked,2016.07.20,"Nine News, 7/20/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5965,"reef shark, 1m"


In [21]:
filtered['Sex '].unique()
#filtered[(filtered['Case Number'] != 0) & (filtered['Year'] == 0)]

# for these results, replace date with case number

array(['M', 'F', 'Other'], dtype=object)

In [24]:
filtered.reset_index(inplace = True)

In [28]:
finalTable = filtered.drop('index', axis=1)

In [32]:
# export finalTable to CSV in working dir

finalTable.to_csv(r'shark_data_final.csv')

Unnamed: 0,Year,Month,Day,Injury,Fatal (Y/N),Name,Time,Country,Location,Area,Sex,Age,Activity,Type,Case Number,Investigator or Source,pdf_href,original order,Species
0,16,Sep,11,Minor injury to arm,N,male,15h15,USA,"Ponte Vedra, St. Johns County",Florida,M,60s,Wading,Unprovoked,2016.09.11,"News4Jax, 9/11/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5987,3' to 4' shark
1,16,Sep,07,Severe lacerations to shoulder & forearm,N,female,14h30,USA,"Makaha, Oahu",Hawaii,F,51,Swimming,Unprovoked,2016.09.07,"Hawaii News Now, 9/7/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5986,"Tiger shark, 10?"
2,16,Aug,29,Lacerations to right foot,N,Sam Cumiskey,15h00,USA,"New Smyrna Beach, Volusia County",Florida,M,25,Surfing,Unprovoked,2016.08.29.b,"News Channel 8, 8/30/16",http://sharkattackfile.net/spreadsheets/pdf_di...,5980,"Bull shark, 6'"
3,16,Jul,29,Lacerations to right hand,N,male,11h30,SPAIN,Arenales del Sol,Alicante Province,M,40,Swimming,Unprovoked,2016.07.29,"Informacion.es, 7/29/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5973,Blue shark
4,16,Jul,20,Laceration to left calf from hooked shark PROV...,N,Scott van Burck,After noon,AUSTRALIA,"20 k off The Spit, off the Gold Coast",Queensland,M,31,Fishing,Provoked,2016.07.20,"Nine News, 7/20/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5965,"reef shark, 1m"
5,16,Jul,07,Fin of hooked shark injured fisherman's forear...,N,Roger Brissom,10h00,USA,"Off Gloucester, Essec County",Massachusetts,M,59,Fishing,Provoked,2016.07.07.b,Salem News 7/8/2016,http://sharkattackfile.net/spreadsheets/pdf_di...,5957,dogfish shark
6,16,Jul,04,Right thigh injured by hooked pregnant female ...,N,Nathan Oliver,22h00,AUSTRALIA,Palm Cove,Queensland,M,34,Fishing,Provoked,2016.07.04,"Cairns Post, 7/9/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5954,Tawny nurse shark
7,16,Jun,27,Minor injury,N,male,16h20,USA,Sullivan's Island,South Carolina,M,35,0,Unprovoked,2016.06.27,"C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,5953,3' to 4' shark
8,16,Jun,21,Lacerations and punctures to foot,N,Jeff Schott,15h25,USA,"North Myrtle Beach, Horry County",South Carolina,M,42,Floating,Unprovoked,2016.06.21.b,"C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,5949,3' to 5' shark
9,16,Jun,14,Injury to lower leg,N,Marin Alice Melton,17h30,USA,"Pirates Beach, Galveston",Texas,F,6,Floating in tube,Unprovoked,2016.06.14,"Click2Houston, 6/14/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,5945,3' to 4' shark
