In [1]:
# Data cleaning strategy
# 1. Remove all unnecessary columns
#      Blank columns at the end of the dataset
#      Case Number, Name, Investigator, PDF, original order, and extra href as we do not need those for our analysis
# 2. Remove all rows that are from before 1950
#      The record keeping seems inconsistant for the older data
#      Removes data as a result of war at sea in order to look at more "real life" scenarios comparable to today
# 3. Clean up Year column
#      Change to integer type
# 4. Clean up the 'Type' column
#      Remove rows of type "Invalid" as those look to be incidents determined to be not shark-related
#      Remove rows of type "Questionable", "Unconfirmed", "Unverified", or "Under Investigation" as they are not 
#             confirmed as shark incidents
# 5. Clean up the 'Fatal (Y/N)' column
#      Fix rows with "N" that are formatted wrong
#      Fix rows with "Y" that are formatted wrong
#      Rows that are neither "N" or "Y", mark as "Unknown"
# 6. Clean up Date column
#      Make the date column a valid and consistent date/time format
# 7. Clean up Sex column
#      Make sure all rows are either "M", "F", or "Unknown"
# 8. Clean up Activity column
#      Standardize activities??  Can keywords be pulled out??  Boating, surfing, swimming, etc...
# 9. Clean up Age column
#      Remove anything not a valid age
#      Convert to integer type
# 10. Clean up Time column
#      Convert to a valid date/time format


In [2]:
# Import dependencies
import pandas as pd
import numpy as np
import re

from sqlalchemy import create_engine
import psycopg2

#from config import db_password


In [3]:
# Set file directory path
file_dir = 'C:/Users/Bauer/Desktop/Analysis Projects/Final_Project_Team_4/Resources'
# Shark attack data file
shark_file = f'{file_dir}/GSAF5.csv'

In [4]:
# Read csv file into a dataframe
shark_attack_df = pd.read_csv(shark_file, low_memory=False)
shark_attack_df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Gender,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2021.09.10,10-Sep-21,2021,,EGYPT,,Sidi Abdel Rahmen,Swimming,Mohamed,M,...,No shark invovlement,Dr. M. Fouda & M. Salrm,2021.09.10-Mohamed.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.09.10,2021.09.10,6700.0,,
1,2021.09.09,9-Sep-21,2021,Unprovoked,USA,Florida,"Ponce Inlet, Volusia County",Surfing,Doyle Neilsen,M,...,,"Daytona Beach News-Journal, 9/14/2021",2021.09.09-Neilsen.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.09.09,2021.09.09,6699.0,,
2,2021.09.05,5-Sep-21,2021,Unprovoked,AUSTRALIA,New South Wales,Emerald Beach,Surfing,Timothy Thompson,M,...,White xhark,"B. Myatt, GSAF",2021.09.05-Thompson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.09.05,2021.09.05,6698.0,,
3,2021.09.03.b,3-Sep-21,2021,Unprovoked,British Overseas Territory,Turks and Caicos,,,male,M,...,,Anonymous,2021.09.03.b-TurksCaicos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.09.03.b,2021.09.03.b,6697.0,,
4,2021.08.28,28-Aug-21,2021,Unprovoked,USA,Texas,"Galveston Island, Galveston County",Boogie boarding,male,M,...,,"T. Craig, GSAF & K. McMurray, TrackingShark.com",2021.08.28.-Galveston.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2021.08.28,2021.08.28,6696.0,,


In [5]:
# Remove all of the extra "unnamed" columns at the end of the dataset
for column in shark_attack_df.columns:
    if 'Unnamed:' in column or 'Case Number' in column:
        shark_attack_df.drop(columns=column, inplace=True)

shark_attack_df.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Gender,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
0,10-Sep-21,2021,,EGYPT,,Sidi Abdel Rahmen,Swimming,Mohamed,M,,Laceration to arm caused by metal object,,,No shark invovlement,Dr. M. Fouda & M. Salrm,2021.09.10-Mohamed.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6700.0
1,9-Sep-21,2021,Unprovoked,USA,Florida,"Ponce Inlet, Volusia County",Surfing,Doyle Neilsen,M,!6,Minor injury to right arm,N,13h20,,"Daytona Beach News-Journal, 9/14/2021",2021.09.09-Neilsen.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6699.0
2,5-Sep-21,2021,Unprovoked,AUSTRALIA,New South Wales,Emerald Beach,Surfing,Timothy Thompson,M,31,FATAL,Y,10h30,White xhark,"B. Myatt, GSAF",2021.09.05-Thompson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6698.0
3,3-Sep-21,2021,Unprovoked,British Overseas Territory,Turks and Caicos,,,male,M,,Wrist bitten,N,,,Anonymous,2021.09.03.b-TurksCaicos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6697.0
4,28-Aug-21,2021,Unprovoked,USA,Texas,"Galveston Island, Galveston County",Boogie boarding,male,M,!!,Lacerations both sides of lower leg immediatel...,N,11h45,,"T. Craig, GSAF & K. McMurray, TrackingShark.com",2021.08.28.-Galveston.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,6696.0


In [6]:
# Drop other unnecessary columns
shark_attack_df.drop(columns=['Name', 'Investigator or Source', 'href', 'pdf', 'original order'], inplace=True)

shark_attack_df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Gender,Age,Injury,Fatal (Y/N),Time,Species,href formula
0,10-Sep-21,2021,,EGYPT,,Sidi Abdel Rahmen,Swimming,M,,Laceration to arm caused by metal object,,,No shark invovlement,http://sharkattackfile.net/spreadsheets/pdf_di...
1,9-Sep-21,2021,Unprovoked,USA,Florida,"Ponce Inlet, Volusia County",Surfing,M,!6,Minor injury to right arm,N,13h20,,http://sharkattackfile.net/spreadsheets/pdf_di...
2,5-Sep-21,2021,Unprovoked,AUSTRALIA,New South Wales,Emerald Beach,Surfing,M,31,FATAL,Y,10h30,White xhark,http://sharkattackfile.net/spreadsheets/pdf_di...
3,3-Sep-21,2021,Unprovoked,British Overseas Territory,Turks and Caicos,,,M,,Wrist bitten,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...
4,28-Aug-21,2021,Unprovoked,USA,Texas,"Galveston Island, Galveston County",Boogie boarding,M,!!,Lacerations both sides of lower leg immediatel...,N,11h45,,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25835,,,,,,,,,,,,,,
25836,,,,,,,,,,,,,,
25837,,,,,,,,,,,,,,
25838,,,,,,,,,,,,,,


In [7]:
shark_attack_df.dtypes

Date            object
Year            object
Type            object
Country         object
Area            object
Location        object
Activity        object
Gender          object
Age             object
Injury          object
Fatal (Y/N)     object
Time            object
Species         object
href formula    object
dtype: object

In [8]:
# Update year column to number
shark_attack_df["Year"]=pd.to_numeric(shark_attack_df["Year"],errors='coerce')


In [9]:
# Remove any rows with data older than 1950 or are empty
shark_attack_df = shark_attack_df[shark_attack_df['Year'] >= 1950]

shark_attack_df.shape


(4887, 14)

In [10]:
#date_form_one = r'
shark_attack_df["Date"] = pd.to_datetime(shark_attack_df["Date"], errors='coerce')
shark_attack_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(4887, 14)

In [11]:
shark_attack_df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Gender,Age,Injury,Fatal (Y/N),Time,Species,href formula
0,2021-09-10,2021.0,,EGYPT,,Sidi Abdel Rahmen,Swimming,M,,Laceration to arm caused by metal object,,,No shark invovlement,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2021-09-09,2021.0,Unprovoked,USA,Florida,"Ponce Inlet, Volusia County",Surfing,M,!6,Minor injury to right arm,N,13h20,,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2021-09-05,2021.0,Unprovoked,AUSTRALIA,New South Wales,Emerald Beach,Surfing,M,31,FATAL,Y,10h30,White xhark,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2021-09-03,2021.0,Unprovoked,British Overseas Territory,Turks and Caicos,,,M,,Wrist bitten,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2021-08-28,2021.0,Unprovoked,USA,Texas,"Galveston Island, Galveston County",Boogie boarding,M,!!,Lacerations both sides of lower leg immediatel...,N,11h45,,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4894,NaT,1950.0,Unprovoked,GREECE,,"Piraeus, Athens",Swimming,,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...
4895,1950-01-01,1950.0,Unprovoked,SINGAPORE,Singapore Harbor,,Diving for coins,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...
4896,1950-01-01,1950.0,Unprovoked,NEW CALEDONIA,North Province,"Voh, near meatworks","Spearfishing, but walking carrying fish on end...",M,,"Shark jumped from sea, taking fish & his right...",N,,,http://sharkattackfile.net/spreadsheets/pdf_di...
4897,NaT,1950.0,Unprovoked,NEW CALEDONIA,North Province,Mangalia Reef above Touho,"Helmet diving, collecting trochus shell",M,,"Arm bitten, surgically amputated",N,,,http://sharkattackfile.net/spreadsheets/pdf_di...


In [12]:
shark_attack_df['Type'].value_counts()

Unprovoked             3655
Provoked                469
Invalid                 381
Watercraft              251
Sea Disaster             98
Questionable             12
Boat                      7
Unverified                1
Unconfirmed               1
Under investigation       1
Name: Type, dtype: int64

In [13]:
# Remove types that are 'Invalid' as they are not shark related injuries or deaths
values_list = ['Invalid','Questionable','Unconfirmed','Unverified','Under investigation']
shark_attack_df = shark_attack_df[~shark_attack_df['Type'].isin(values_list)]
shark_attack_df.shape

(4491, 14)

In [14]:
shark_attack_df['Type'].value_counts()

Unprovoked      3655
Provoked         469
Watercraft       251
Sea Disaster      98
Boat               7
Name: Type, dtype: int64

In [20]:
shark_attack_df['Fatal (Y/N)'].value_counts()

N          3763
Y           661
UNKNOWN      43
M             1
2017          1
Name: Fatal (Y/N), dtype: int64

In [19]:
shark_attack_df['Fatal (Y/N)'] = shark_attack_df['Fatal (Y/N)'].replace(to_replace ='\s*N\s*', value = 'N', regex = True)
shark_attack_df['Fatal (Y/N)'] = shark_attack_df['Fatal (Y/N)'].replace(to_replace ='Y.*', value = 'Y', regex = True)

  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
shark_attack_df.shape

(4491, 14)

In [None]:
# Set up engine to transfer our shark attack df to PostgreSQL
#db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/shark_attack_data"
#engine = create_engine(db_string)
    
# Upload shark_attack_df to a SQL database
#shark_attack_df.to_sql(name='shark_attacks', con=engine, if_exists='replace')
    
    