# Pandas Project (06-20-19) - (Week 2 Project)

## Import the required libraries

In [1]:
# Import the pandas library under its conventional alias `pd`
import pandas as pd

## Import Shark Attack data downloaded from `Readme.md`

In [2]:
# Load the raw shark-attack export into a DataFrame.
# The file must be decoded as ISO-8859-1; without it a UTF encoding error will occur.
shark_data = pd.read_csv(
    'SharkAttack_uncleaned.csv',
    encoding="ISO-8859-1",
)

## Count null (empty) entries per column

In [3]:
# Number of null cells in each column; only columns with at least one null are displayed.
null_columns = shark_data.isna().sum()
null_columns.loc[null_columns.gt(0)]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

## Remove any column whose share of null (empty) entries is greater than 90%

In [4]:
# Drop every column whose null count is greater than 90% of the row count.
# Only columns are dropped, never rows.
# On this dataset that removes (Unnamed: 22) and (Unnamed: 23), which are almost entirely empty.
# The null counts are recomputed from the current frame rather than reusing the
# counts gathered in the earlier cell: those go stale once the columns are gone,
# and re-running this cell would then fail with a KeyError.
# Used two methods of cleaning/manipulation
null_counts = shark_data.isnull().sum()
drop_columns = list(null_counts[null_counts > (shark_data.shape[0] * 0.9)].index)
shark_data = shark_data.drop(columns=drop_columns)
shark_data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


## Remove duplicate/unnecessary columns from the data

In [5]:
# Columns considered redundant (reasons as noted by the original author):
#   Year                          - already contained in Date
#   href formula                  - 99+% identical to href
#   Case Number.1 / Case Number.2 - 99+% identical to Case Number
# Used One method of cleaning/manipulation
redundant_columns = ['Year', 'href formula', 'Case Number.1', 'Case Number.2']
shark_data = shark_data.drop(redundant_columns, axis=1)

## Update column names for cleaning (Whitespaces/User Readability)

In [6]:
# Old column name -> new column name:
#   Type -> Reason (simpler to understand), Area -> State/Province (universal)
#   'Sex ' and 'Species ' lose their trailing whitespace
#   Fatal (Y/N) gains the U option in its label
#   lowercase names (pdf, href, original order) are capitalised
# Used One method of cleaning/manipulation
column_renames = {
    'Type': 'Reason',
    'Area': 'State/Province',
    'Sex ': 'Sex',
    'Fatal (Y/N)': 'Fatal (Y/N/U)',
    'Species ': 'Species',
    'pdf': 'PDF',
    'href': 'HREF',
    'original order': 'Original Order',
}
shark_data = shark_data.rename(column_renames, axis='columns')

## Clean up dirty values in the columns

In [7]:
# Normalise missing and dirty values to the single sentinel 'U' (Unknown).
# Used Two methods of cleaning/manipulation: fillna() for nulls, replace() for dirty values.
#
# Matching rules used below:
#   * Name, Sex, Age, Fatal and Reason are cleaned by WHOLE-CELL matching
#     (Series.replace), so a junk token is only rewritten when it is the entire
#     cell. Substring replacement would also rewrite legitimate text that merely
#     contains the token (e.g. 'male' inside a longer name, or a lone 'F'/'N'/'X'
#     inside another value).
#   * Time and Species are free text, so they keep substring replacement, but with
#     regex=False made explicit. The default of `regex` differs between pandas
#     versions; when the pattern is treated as a regular expression, '(' ')' and
#     '.' are metacharacters and the Time pattern no longer matches its literal text.
UNKNOWN = 'U'

# Every column whose null cells are replaced by the sentinel.
fill_columns = [
    'Case Number', 'Date', 'Reason', 'Country', 'State/Province', 'Location',
    'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N/U)', 'Time',
    'Species', 'Investigator or Source', 'PDF', 'HREF', 'Original Order',
]
for column in fill_columns:
    shark_data[column] = shark_data[column].fillna(UNKNOWN)
#============================================
# Name: a cell holding only the placeholder 'male' or 'female' carries no name.
shark_data['Name'] = shark_data['Name'].replace({'male': UNKNOWN, 'female': UNKNOWN})
#============================================
# Sex: trim stray whitespace ('M ' / 'F '), then map the junk codes to Unknown.
shark_data['Sex'] = (
    shark_data['Sex']
    .str.strip()
    .replace({'N': UNKNOWN, '.': UNKNOWN, 'lli': UNKNOWN})
)
#============================================
# Age: cells that are exactly one of these junk values become Unknown.
shark_data['Age'] = shark_data['Age'].replace(
    dict.fromkeys(['A.M.', 'MAKE LINE GREEN', 'X', 'F', '\xa0 '], UNKNOWN)
)
#============================================
# Time: literal (non-regex) match of a misplaced note.
shark_data['Time'] = shark_data['Time'].str.replace(
    'FATAL  (Wire netting installed at local beaches after this incident.)',
    UNKNOWN,
    regex=False,
)
#============================================
# Species: literal (non-regex) substring replacement of junk markers.
for junk_marker in ('\xa0 ', 'Questionable incident', 'Not authenticated'):
    shark_data['Species'] = shark_data['Species'].str.replace(
        junk_marker, UNKNOWN, regex=False
    )
#============================================
# Fatal: trim whitespace (' N' / 'N '), then map junk and variant codes onto Y/N/U.
shark_data['Fatal (Y/N/U)'] = (
    shark_data['Fatal (Y/N/U)']
    .str.strip()
    .replace({'#VALUE!': UNKNOWN, 'UNKNOWN': UNKNOWN, 'F': 'Y', 'n': 'N'})
)
#============================================
# Reason: 'Invalid' entries are treated as Unknown.
shark_data['Reason'] = shark_data['Reason'].replace({'Invalid': UNKNOWN})

## Checking the cleansed data for empty cells/any missed portions.

In [8]:
# Print the distinct values of the low-cardinality columns to confirm the cleaning.
# Other columns (Country, State/Province, Activity, Name, Age, Injury, Time,
# Species, Investigator or Source) can be inspected by adding them to the tuple.
for column_name in ('Reason', 'Sex', 'Fatal (Y/N/U)'):
    print(set(shark_data[column_name]))
    print('\n=================')


{'Boating', 'U', 'Sea Disaster', 'Unprovoked', 'Boat', 'Provoked'}

{'F', 'U', 'M'}

{'N', 'Y', 'U'}



## Displaying cleansed data and important numbers

In [9]:
# Summary of the cleaned data: remaining null counts, then the value
# distributions of Sex, Age (five most frequent only) and Fatal.
# Used One method of cleaning/manipulation
null_columns = shark_data.isna().sum()
summary_sections = [
    null_columns[null_columns > 0],
    shark_data['Sex'].value_counts(),
    shark_data['Age'].value_counts().head(5),
    shark_data['Fatal (Y/N/U)'].value_counts(),
]
# The separator reproduces a newline-terminated section followed by a
# '\n=================' line between consecutive sections.
print(*summary_sections, sep='\n\n=================\n')

Series([], dtype: int64)

M    4837
F     585
U     570
Name: Sex, dtype: int64

U     2687
17     148
18     145
19     138
20     136
Name: Age, dtype: int64

N    4325
Y    1553
U     114
Name: Fatal (Y/N/U), dtype: int64


In [10]:
# Order the rows by (Reason), ascending, and preview the first five.
# Used One method of cleaning/manipulation
shark_data = shark_data.sort_values(by="Reason")
shark_data.head()

Unnamed: 0,Case Number,Date,Reason,Country,State/Province,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N/U),Time,Species,Investigator or Source,PDF,HREF,Original Order
3724,1960.11.00.d,Nov-60,Boat,SOUTH AFRICA,Western Cape Province,"5 km from Gordons Bay, False Bay",Hand lining for shad,"7.5 m boat, occupants: 8 men",U,U,"No injury to occupants, shark bit 45 cm hole i...",N,U,White shark (tooth fragments recovered from hu...,"D. Davies; T. Wallett, p.27-30",1960.11.00.d-GordonsBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2269
3604,1962.03.25.a,25-Mar-62,Boat,AUSTRALIA,New South Wales,Norah Head,Fishing & spearfishing,boat of Dennis Kemp & 4 other occupants,U,U,No injury to occupants. Shark holed boat & the...,N,U,"Bronze whaler shark, 4.6 m [15']",Sunday Mirror (Sydney),1962.03.25.a-KempBoat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2389
1824,2000.06.13,13-Jun-00,Boat,USA,Florida,"Pensacola Bay, Escambia County",Sailing,22' pleasure boat,U,U,"No injury to occupants, boat's rear platform b...",N,14h30,"Bull shark, 2.4 m [8']","Mobile Register 6/14/2000; Charlotte Observer,...",2000.06.13-boat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,4169
3909,1958.11.05,05-Nov-58,Boat,USA,California,"Pacific Beach, San Diego County",Fishing,"4.3 m skiff, occupant: Bob Shay",U,U,"No injury to occupant, shark chasing barracuda...",N,U,White shark,C. Limbaugh,1958.11.05-boat-Shay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2084
3612,1962.02.02,02-Feb-62,Boat,NEW ZEALAND,North Island,South of New Plymouth,Fishing,"17' fishing launch, occupants: A. Burkitt & C....",U,U,U,U,U,U,SAF Case #1125,1962.02.02-NV-Burkitt-Brooke-launch.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2381


## Exporting cleansed data to new `.CSV`

In [11]:
# Write the cleaned frame to disk; the DataFrame index is not written.
shark_data.to_csv(path_or_buf='sharks.csv', index=False)