# Project 3 Data Cleaning: Sharks attacks



## Modules used

In [1]:
import pandas as pd
import numpy as np
import re
import itertools

## Dataset import and exploration

In [2]:
df = pd.read_csv('./attacks.csv', encoding='ISO-8859-1')
df_clean = df.copy()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [3]:
def check_nulls():
    return df_clean.isnull().mean() * 100
check_nulls()

Case Number               66.170353
Date                      75.500525
Year                      75.508300
Type                      75.516075
Country                   75.694903
Area                      77.269370
Location                  77.599813
Activity                  77.615364
Name                      76.316915
Sex                       77.697003
Age                       86.506240
Injury                    75.609377
Fatal (Y/N)               77.595926
Time                      88.539439
Species                   86.533453
Investigator or Source    75.566614
pdf                       75.500525
href formula              75.504412
href                      75.500525
Case Number.1             75.500525
Case Number.2             75.500525
original order            75.473312
Unnamed: 22               99.996112
Unnamed: 23               99.992225
dtype: float64

## Changes on columns name to avoid whitespaces.



In [4]:
df_clean.columns = df_clean.columns.str.strip().str.lower()
df_clean.columns = df_clean.columns.str.replace(" ","_")
df_clean.columns






Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_(y/n)', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number.1', 'case_number.2', 'original_order', 'unnamed:_22',
       'unnamed:_23'],
      dtype='object')

## Nulls columns cleanse:
### Since  the dataset have columns Unnamed:22 and Unnamed: 23 with almost entirely null values, they are ment to be dropped.  

In [5]:
null_columns = df_clean.isnull().mean() * 100
null_columns = null_columns[null_columns > 99]
null_columns_eliminate = list(null_columns.index)
df_clean = df_clean.drop(null_columns_eliminate, axis=1)

### Due to country, area and location columns have a large portion of null data. The efforts to reduce data were aimed in reduce as many rows as possible with null data


In [6]:
df_clean[['country','area', 'location']].isna()

Unnamed: 0,country,area,location
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
25718,True,True,True
25719,True,True,True
25720,True,True,True
25721,True,True,True


In [7]:
cleaning_country_area_location = df_clean[(df_clean['country'].isna()) & (df_clean['area'].isna()) & (df_clean['location'].isna())].index
df_clean = df_clean.drop(cleaning_country_area_location, axis=0)


In [8]:
only_area = (df_clean['location'].isna()) & (df_clean['country'].isna())
only_country = (df_clean['location'].isna()) & (df_clean['area'].isna())
only_location = (df_clean['country'].isna()) & (df_clean['area'].isna())
excluding_area = (df_clean['area'].isna())
filter_cleaned = df_clean[(only_area) | (only_country) | (only_location) | (excluding_area) ].index

df_clean = df_clean.drop(filter_cleaned, axis=0)
df_clean[df_clean['country'].isna()]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
3387,1970.08.02,02-Aug-1970,1970.0,Invalid,,Caribbean Sea,Between St. Kitts & Nevis,Sea Disaster Sinking of ferryboat Christina,,,...,,Afternoon,Shark involvement prior to death was not confi...,"Rome News Tribune, 8/3/1970",1970.08.02-Christina-ferryboat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1970.08.02,1970.08.02,2916.0
4266,1956.00.00.g,1956,1956.0,Sea Disaster,,Between Comores & Madagascar,Geyser Bank,Shipwreck,"Captain Eric Hunt, the cook & a French passenger",M,...,Y,,,dinofish.com,1956.00.00.g-Capt-Hunt.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1956.00.00.g,1956.00.00.g,2037.0
4498,1949.12.00.b,Dec-1949,1949.0,Sea Disaster,,Caribbean Sea,Between Cuba & Costa Rica,"Sea Disaster, sinking of the motorship Wingate","Albert Battles, James Dean & 4 crew",M,...,Y,,Shark involvement not confirmed,"Canberra Times, 1/6/1950",1949.12.00.b-Wingate.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1949.12.00.b,1949.12.00.b,1805.0
5020,1931.04.27.R,Reported 27-Apr-1931,1931.0,Unprovoked,,French Southern Territories,Île Saint-Paul,"Fishing, boat capsized",Quillezic,M,...,Y,,,"Los Angeles Times, 4/27/1931",1931.04.27.R-Quillezic.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1931.04.27.R,1931.04.27.R,1283.0
6137,1787.07.05,05-Jul-1787,1787.0,Unprovoked,,St Helena,Landing Place,Swimming,Private Isaac Hicksled,M,...,Y,,,"H.R. Janisch (1885), Extracts from the St. Hel...",1787.07.05-Hicksled.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1787.07.05,1787.07.05,166.0


In [9]:
countries = {3387:'St. Kitts Nevis', 4266:'Madagascar', 4498:'Cuba', 5020:'France', 6137:'Cabo Verde'}

# dataframe.at[index,'column-name']='new value'


for key,country in countries.items():
    df_clean['country'][key] = country

df_clean[df_clean['country'].isna()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['country'][key] = country


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order


In [10]:
df_clean['age'].unique()
Age_list = ['Teen','28 & 26','46 & 34','28, 23 & 30','Teens','36 & 26','Â','6Â½','21 & ?','23 & 20','7      &    31','Elderly',
'Ca. 33','>50','adult','9 & 12','? & 19','23 & 26','(adult)','33 & 37','37, 67, 35, 27,  ? & 27','21, 34,24 & 35','30 & 32',
'50 & 30','17 & 35','X','"middle-age"','34 & 19','33 & 26','2 to 3 months','MAKE LINE GREEN','"young"','17 & 16','F','young','36 & 23',
'A.M.','?    &   14','2Â½','teen','Â','\xa0 ', ' ', '  ']

index_age=[]
unique_age=[]
for i in df_clean['age']:
    if i in Age_list:
        if i not in unique_age:
            index_age.append(list(df_clean.index[df_clean['age']==i]))
        unique_age.append(i)

index_age = list(itertools.chain.from_iterable(index_age))

df_clean= df_clean.drop(index_age, axis=0)
df_clean

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6296,ND.0006,Before 1906,0.0,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,M,...,Y,,Said to involve a grey nurse shark that leapt ...,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0006,ND.0006,7.0
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0


In [11]:
age_dict={"20s" : 25, "40s" : 45,"60s": 65,"18 months": 2,"30s" : 35,"50s" : 55,"18 or 20" : 19,"12 or 13" : 13,
               "8 or 10": 9,"30 or 36" : 33,"33 or 37" : 35,"mid-30s" : 35,"60's" : 65, "16 to 18" : 17,"mid-20s" : 25,
               "21 or 26" : 24, "18 to 22" : 20,"9 months" : 1,"25 or 28" : 27,"13 or 18" : 15, "7 or 8": 8,"Both 11" : 11,
               "9 or 10" : 10,"10 or 12": 11,"31 or 33" : 32,"13 or 14" : 13,'6½': 6,' 30': 30,' 28': 28,'74 ':74, '45 ':45,
              '20 ':20, ' 43':43, '2½':2}

df_clean['age'].replace(age_dict, inplace=True)

In [12]:
df_clean['age'].unique()
df_clean['age'] = np.floor(pd.to_numeric(df_clean['age'], errors='coerce')).astype('Int64')
df_clean['age'].unique()

<IntegerArray>
[  57,   11,   48, <NA>,   18,   52,   15,   12,   32,   10,   21,   34,   30,
   60,   33,   29,   54,   41,   37,   56,   19,   25,   69,   38,   55,   35,
   46,   45,   14,   28,   20,   24,   26,   49,   22,    7,   31,   17,   40,
   13,   42,    3,    8,   50,   16,   82,   73,   68,   51,   39,   58,   47,
   61,   65,   36,   66,   43,    9,   72,   59,    6,   27,   64,   23,   71,
   44,   62,   63,   70,    2,   53,   74,    5,   86,   77,   84,   75,   87,
   67,    1,   81,   78]
Length: 82, dtype: Int64

In [13]:
injuried_cleaned = df_clean[df_clean['injury'].isna()]
index_list = injuried_cleaned.index.values.tolist()
df_clean= df_clean.drop(index_list, axis=0)
df_clean

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6296,ND.0006,Before 1906,0.0,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,M,...,Y,,Said to involve a grey nurse shark that leapt ...,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0006,ND.0006,7.0
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,Y,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0


In [14]:

df_clean['fatal_(y/n)'][df_clean['fatal_(y/n)'] == 'Y'] = True 
df_clean['fatal_(y/n)'][(df_clean['injury'] == 'Fatal') | (df_clean['injury'] == 'Fatal')] = False 
df_clean['fatal_(y/n)'][(df_clean['fatal_(y/n)']  == 'N') | (df_clean['fatal_(y/n)']  == ' N') | (df_clean['fatal_(y/n)'] == 'N ')] = False
df_clean['fatal_(y/n)'].unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['fatal_(y/n)'][df_clean['fatal_(y/n)'] == 'Y'] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['fatal_(y/n)'][(df_clean['injury'] == 'Fatal') | (df_clean['injury'] == 'Fatal')] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['fatal_(y/n)'][(df_clean['fatal_(y/n)']  == 'N') | (df_clean['fatal_(y/n)']  == ' N') | (df_clean['fatal_(y/n)'] == 'N ')] = False


array([False, True, nan, 'M', 'UNKNOWN', '2017'], dtype=object)

In [15]:
df_clean.reset_index(inplace=True, drop=True)
key_words =['mortem','killed','Human','human', 'death','Death','disap','Disap','drown','Drown','Missing','missing',
'recovered','Forensic','body','bodies','corpse','decap','Decap','perish','remains']

pil=[]
j=0
for word in key_words:
    j = 0
    for i in df_clean['injury']:
        if word in i:
            pil.append(j)
            df_clean['fatal_(y/n)'][j] = True
        j = j + 1
        
pil_listset=list(set(pil))

fatal_dict={'M' : False, 'UNKNOWN':False, '2017': False}
df_clean['fatal_(y/n)'].fillna(False, inplace = True)
df_clean['fatal_(y/n)'].replace(fatal_dict, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['fatal_(y/n)'][j] = True


In [16]:
# dates = df_clean['Date'].str.replace('Before','').replace('Reported','').replace('--','-',regex=True).replace('190Feb-2010', '19-Feb-2010',regex=True).replace('Late', '', regex=True).replace('Fall', '', regex=True).replace('Summer-2008')
# df_clean['Date'] = dates
# pd.to_datetime(df_clean['Date'])
df_clean['date2'] = df_clean['date']
df_clean['date3'] = df_clean['case_number']

def cast_date(date):
    date = str(date).strip()
    date = date[:10].replace('.','-')
    if date.endswith('00'):
        date = list(date)
        date[-1] = '1'
        return  "".join(date)
    return date
    

df_clean['date'] = df_clean['case_number'].apply(cast_date)


df_clean.to_csv('comp.csv')
# df_clean[df_clean['Date2'].str.contains('ND-00')]


# date_cleaned=df_clean['Date'].str.replace('.', '-')
# df_clean['Date'] = date_cleaned
# df_clean[['Date', 'Case_Number']].head(50)

In [17]:
df_clean

check_nulls()

case_number                0.017355
date                       0.000000
year                       0.034710
type                       0.034710
country                    0.000000
area                       0.000000
location                   4.946199
activity                   7.775078
name                       2.846234
sex                        8.399861
age                       42.814995
injury                     0.000000
fatal_(y/n)                0.000000
time                      50.503297
species                   43.387713
investigator_or_source     0.242971
pdf                        0.000000
href_formula               0.017355
href                       0.000000
case_number.1              0.000000
case_number.2              0.000000
original_order             0.000000
date2                      0.000000
date3                      0.017355
dtype: float64

In [18]:

## Warning 
nd_values = df_clean[df_clean['case_number'].str.contains('ND', na=False)]
index = df_clean[df_clean['case_number'].str.contains('ND', na=False)].index
index = list(index)
nd_values

for i in index:
    if any(chr.isdigit() for chr in df_clean['date2'][i] )== False:
        df_clean['date2'][i] = None
    
    else:   
         df_clean['date2'][i] = re.findall(r'[0-9]{4}',df_clean['date2'][i])[0]
        


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['date2'][i] = re.findall(r'[0-9]{4}',df_clean['date2'][i])[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['date2'][i] = None


In [19]:
df_clean[df_clean['date2'].isna()].index
df_clean = df_clean.drop(df_clean[df_clean['date2'].isna()].index, axis=0)
df_clean[df_clean['date2'].isna()].index
df_clean

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,date2,date3
0,2018.06.25,2018-06-25,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,25-Jun-2018,2018.06.25
1,2018.06.18,2018-06-18,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,18-Jun-2018,2018.06.18
2,2018.06.09,2018-06-09,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,09-Jun-2018,2018.06.09
3,2018.06.08,2018-06-08,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,08-Jun-2018,2018.06.08
4,2018.06.04,2018-06-04,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,04-Jun-2018,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5757,ND.0006,ND-0006,0.0,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,M,...,Said to involve a grey nurse shark that leapt ...,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0006,ND.0006,7.0,1906,ND.0006
5758,ND.0005,ND-0005,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0,1903,ND.0005
5759,ND.0004,ND-0004,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,1903,ND.0004
5760,ND.0003,ND-0003,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,1900,ND.0003


In [20]:
df_clean.to_csv('comp.csv')

In [21]:
biased_elder_ages = df_clean[(df_clean['date'].str.startswith('000')) | (df_clean['date'] == 'nan')].index
print(biased_elder_ages)

df_clean = df_clean.drop(biased_elder_ages, axis=0)
df_clean



Int64Index([5109, 5664, 5665, 5666, 5667], dtype='int64')


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,date2,date3
0,2018.06.25,2018-06-25,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,25-Jun-2018,2018.06.25
1,2018.06.18,2018-06-18,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,18-Jun-2018,2018.06.18
2,2018.06.09,2018-06-09,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,09-Jun-2018,2018.06.09
3,2018.06.08,2018-06-08,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,08-Jun-2018,2018.06.08
4,2018.06.04,2018-06-04,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,04-Jun-2018,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5757,ND.0006,ND-0006,0.0,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,M,...,Said to involve a grey nurse shark that leapt ...,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0006,ND.0006,7.0,1906,ND.0006
5758,ND.0005,ND-0005,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0,1903,ND.0005
5759,ND.0004,ND-0004,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,1903,ND.0004
5760,ND.0003,ND-0003,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,1900,ND.0003


In [22]:

def parse_date(row):
    row = str(row)
    if row[4:8] == '-00-':
#         print(row)
        row = list(row)
        row[6] = '1'
        print("".join(row))
        return  "".join(row)
    else:
        return row
        

df_clean['date'] = df_clean['date'].apply(parse_date)

df_clean.loc[550]

2014-01-01
2008-01-01
2008-01-01
2004-01-01
2000-01-01
1999-01-01
1999-01-01
1998-01-01
1998-01-01
1998-01-01
1996-01-01
1996-01-01
1995-01-01
1995-01-01
1995-01-01
1994-01-01
1993-01-01
1993-01-01
1993-01-01
1993-01-01
1992-01-01
1989-01-01
1989-01-01
1988-01-01
1987-01-01
1987-01-01
1986-01-01
1985-01-01
1984-01-01
1984-01-01
1983-01-01
1983-01-01
1982-01-01
1982-01-01
1982-01-01
1981-01-01
1981-01-01
1980-01-01
1980-01-01
1980-01-01
1979-01-01
1978-01-01
1978-01-01
1976-01-01
1974-01-01
1974-01-01
1973-01-01
1973-01-01
1973-01-01
1972-01-01
1972-01-01
1971-01-01
1971-01-01
1971-01-01
1970-01-01
1970-01-01
1970-01-01
1970-01-01
1970-01-01
1970-01-01
1970-01-01
1970-01-01
1969-01-01
1968-01-01
1968-01-01
1966-01-01
1965-01-01
1965-01-01
1965-01-01
1965-01-01
1965-01-01
1963-01-01
1962-01-01
1962-01-01
1962-01-01
1961-01-01
1961-01-01
1961-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1960-01-01
1959-01-01
1959-01-01
1959-01-01
1959-01-01

case_number                                                    2014.00.00.b
date                                                             2014-01-01
year                                                                 2014.0
type                                                             Unprovoked
country                                                                 USA
area                                                                 Hawaii
location                                                                NaN
activity                           Free diving / Photographing pilot whales
name                                                                   male
sex                                                                       M
age                                                                    <NA>
injury                                       "Minor laceration to shoulder"
fatal_(y/n)                                                           False
time        

In [23]:
df_clean.reset_index(inplace=True, drop=True)

In [24]:
index = df_clean[df_clean['case_number'].str.contains('ND', na=False)].index
index = list(index)
for i in index:
    df_clean['date'][i] = df_clean['date2'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['date'][i] = df_clean['date2'][i]


In [25]:
df_clean['date'] = df_clean['date'].str.replace(',', '-')

In [26]:
df_clean['date'] = pd.to_datetime(df_clean['date'], errors='coerce')

dates_index = df_clean[df_clean['date'].isna()].index
df_clean = df_clean.drop(dates_index, axis=0)
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5745 entries, 0 to 5748
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   case_number             5745 non-null   object        
 1   date                    5745 non-null   datetime64[ns]
 2   year                    5743 non-null   float64       
 3   type                    5743 non-null   object        
 4   country                 5745 non-null   object        
 5   area                    5745 non-null   object        
 6   location                5463 non-null   object        
 7   activity                5298 non-null   object        
 8   name                    5582 non-null   object        
 9   sex                     5261 non-null   object        
 10  age                     3295 non-null   Int64         
 11  injury                  5745 non-null   object        
 12  fatal_(y/n)             5745 non-null   bool    

In [27]:
df_clean.drop(["case_number.1", "case_number.2", "case_number", "year", "original_order", "date2", "date3", "time"], axis=1, inplace=True)

In [28]:
df_clean[df_clean['name'].isna()]

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),species,investigator_or_source,pdf,href_formula,href
80,2017-09-14,Boating,AUSTRALIA,Westerm Australia,Esperance,Fishing,,,,"sharks rammed boats, no injury to occupants",False,"White shark, 3.5m","B. Myatt, GSAF",2017.09.14-EsperanceBoats.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
116,2017-07-02,Invalid,COMOROS,Anjouan,Moya,Fishing,,,,"Skull found in shark, a probable drowning & sc...",True,Shark involvement prior to death not confirmed,"Linfo, 7/3/2017",2017.07.02-Comoros.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
144,2017-04-17,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",,,,,Minor bite to the foot,False,,"Daytona Beach News-Journal, 4/17/2017",2017.04.17.b-Volusia.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
173,2017-01-13,Unprovoked,AUSTRALIA,Queensland,Clairview,Swimming,,M,57,Injury to hand,False,,"Brisbane Times,1/13/2017",2017.01.13.b-Clairview.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
527,2014-05-04,Unprovoked,SOUTH AFRICA,Western Cape Province,Simonstown,Diving,,,,4-inch laceration to arm,False,Cow shark,"Sunday Times, 5/5/2014",2015.05.04-CowShark.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5649,1785-09-26,Unprovoked,ENGLAND,Sussex,Brighton,,,M,,Human remains recovered from shark,True,Tiger shark?,"C. Moore, GSAF",1785.09.26.R-Brighton.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5656,1733-01-01,Invalid,ICELAND,Bardestrand,Talkknefiord,,,,,"Partial hominid remains recovered from shark, ...",True,Shark involvement prior to death unconfirmed,E. Olafsen,1733.00.00-Iceland.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5666,1934-01-01,Unprovoked,URUGUAY,Rocha,"Isla Chica, La Paloma",Swimming,,,,Foot bitten,False,,"Di Candia, 2004",ND-0150-Uruguay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5674,2012-01-01,Unprovoked,SPAIN,Canary Islands,Tenerife,Skin diving,,,,Injury required 16 stitches,False,,"C. Moore, GSAF",ND-0129-Tenerife.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...


In [29]:
df_clean['name'] = df_clean['name'].fillna('Anonymous')

In [30]:
df_clean['sex'] = df_clean['sex'].str.strip()
df_clean['sex'].unique()
df_clean['sex'].value_counts()

M      4649
F       609
lli       1
N         1
.         1
Name: sex, dtype: int64

In [31]:
df_clean['sex'] = df_clean['sex'].fillna('Unknown')

def gender_identifier(row):
    identifiers = ['M', 'F', 'Unknown']
    if row not in identifiers:
        row = 'Unknown'
        return row
    else:
        return row


df_clean['sex'] = df_clean['sex'].apply(gender_identifier)

df_clean['sex'].value_counts()

M          4649
F           609
Unknown     487
Name: sex, dtype: int64

In [32]:
df_clean['investigator_or_source']

0                                        R. Collier, GSAF
1                          K.McMurray, TrackingSharks.com
2                          K.McMurray, TrackingSharks.com
3                                          B. Myatt, GSAF
4                                               A .Kipper
                              ...                        
5744    L. Becke in New York Sun, 9/9/1906; L. Schultz...
5745                     H. Taunton; N. Bartlett,  p. 234
5746                H. Taunton; N. Bartlett,  pp. 233-234
5747                 F. Schwartz, p.23; C. Creswell, GSAF
5748                                           S.W. Baker
Name: investigator_or_source, Length: 5745, dtype: object

In [33]:
df_clean.drop([4826, 3627], axis=0, inplace=True)

In [34]:
df_clean['investigator_or_source'].fillna('no_source', inplace=True)

In [35]:
def finding_verbs_gerund(string):
    string = str(string)
    if 'ing' in string:
        new_word = re.findall(r"[A-Za-z]+(?:ing)", string)
        if len(new_word) > 1:
            return new_word[-1]
        else: 
            return ''.join(new_word)
    else:
        return string
    
df_clean['activity'] = df_clean['activity'].apply(finding_verbs_gerund)


In [36]:
df_clean['species'].fillna('Unidentified', inplace=True)

In [37]:
check_nulls()

date                       0.000000
type                       0.034825
country                    0.000000
area                       0.000000
location                   4.910326
activity                   0.000000
name                       0.000000
sex                        0.000000
age                       42.625805
injury                     0.000000
fatal_(y/n)                0.000000
species                    0.000000
investigator_or_source     0.000000
pdf                        0.000000
href_formula               0.017413
href                       0.000000
dtype: float64

In [38]:
df_clean

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),species,investigator_or_source,pdf,href_formula,href
0,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",False,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,False,Unidentified,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018-06-09,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,False,Unidentified,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,False,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018-06-04,Provoked,MEXICO,Colima,La Ticla,diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,False,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5744,1906-01-01,Unprovoked,AUSTRALIA,New South Wales,,Swimming,Arab boy,M,,FATAL,True,Said to involve a grey nurse shark that leapt ...,"L. Becke in New York Sun, 9/9/1906; L. Schultz...",ND-0006-ArabBoy-Prymount.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5745,1903-01-01,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,True,Unidentified,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5746,1903-01-01,Unprovoked,AUSTRALIA,Western Australia,,diving,Ahmun,M,,FATAL,True,Unidentified,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
5747,1900-01-01,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,True,Unidentified,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...


In [39]:
check_nulls()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5743 entries, 0 to 5748
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date                    5743 non-null   datetime64[ns]
 1   type                    5741 non-null   object        
 2   country                 5743 non-null   object        
 3   area                    5743 non-null   object        
 4   location                5461 non-null   object        
 5   activity                5743 non-null   object        
 6   name                    5743 non-null   object        
 7   sex                     5743 non-null   object        
 8   age                     3295 non-null   Int64         
 9   injury                  5743 non-null   object        
 10  fatal_(y/n)             5743 non-null   bool          
 11  species                 5743 non-null   object        
 12  investigator_or_source  5743 non-null   object  