In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [4]:
# Load the raw kelp-farm measurements; the file is decoded as latin1 rather than the usual utf-8.
# NOTE(review): the 'Âµ' in the parsed headers looks like a UTF-8 'µ' read as latin1 — confirm the
# file's real encoding before changing it, because later cells select columns by these exact strings.
init_kelp = pd.read_csv(
    'kelpfarmsdata.csv',
    encoding='latin1',
)
init_kelp.head()

Unnamed: 0,latitude,longitude,ocean depth (m),depth of sample (m),temperature of ocean (C),dissolved oxygen (Âµmol kg-1),apparent oxygen utilization (Âµmol kg-1),temperature of pH measurement (C),pH
0,56.509,-170.8147,122,113.9,2.199,316.01,2,-999,7.9743
1,56.509,-170.8147,122,100.1,2.098,319.32,2,-999,7.9977
2,56.509,-170.8147,122,59.4,1.637,334.65,2,-999,8.0398
3,56.509,-170.8147,122,41.6,0.763,342.38,2,-999,8.0659
4,56.509,-170.8147,122,30.7,-0.318,359.24,2,-999,8.0661


In [5]:
# show the column labels exactly as they were parsed from the CSV
init_kelp.columns


Index(['latitude', 'longitude', 'ocean depth (m)', 'depth of sample (m)',
       'temperature of ocean (C)', 'dissolved oxygen (Âµmol kg-1)',
       'apparent oxygen utilization (Âµmol kg-1)',
       'temperature of pH measurement (C)', 'pH'],
      dtype='object')

In [6]:
# Discard the four measurement columns treated as unnecessary here, leaving
# latitude, longitude, ocean depth, ocean temperature and pH.
unused_columns = [
    'temperature of pH measurement (C)',
    'depth of sample (m)',
    'dissolved oxygen (Âµmol kg-1)',
    'apparent oxygen utilization (Âµmol kg-1)',
]
dropped = init_kelp.drop(columns=unused_columns)
dropped.head()

Unnamed: 0,latitude,longitude,ocean depth (m),temperature of ocean (C),pH
0,56.509,-170.8147,122,2.199,7.9743
1,56.509,-170.8147,122,2.098,7.9977
2,56.509,-170.8147,122,1.637,8.0398
3,56.509,-170.8147,122,0.763,8.0659
4,56.509,-170.8147,122,-0.318,8.0661


In [7]:
# number of rows in the trimmed frame
len(dropped)

28206

In [38]:
# -999 stands for "no value recorded"; count the rows holding at least one such
# sentinel so we can decide how to deal with them
mask = dropped.eq(-999).any(axis='columns')
neg_999 = dropped.loc[mask]
n = len(neg_999)
print(n)

15240


In [39]:
# turn every -999 sentinel into NaN, which is easier to fix than a magic number
dropped = dropped.replace(to_replace=-999, value=np.nan)

In [40]:
# Fill the NaN gaps with scikit-learn's KNNImputer: each missing entry is replaced
# by the mean of that column over the 5 nearest rows.
# NOTE(review): the columns are on very different scales (degrees, metres, deg C, pH),
# so the unscaled neighbour distance is presumably dominated by depth — consider
# standardising before imputing; left as-is here so the imputed values do not change.
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(dropped)

# fit_transform hands back a bare ndarray, so restore both the column labels and
# the row index of the input frame instead of silently resetting the index
imputed_df = pd.DataFrame(imputed_data, columns=dropped.columns, index=dropped.index)


In [41]:
# Sanity-check the imputed frame. The -999 sentinels were converted to NaN before
# imputation, so counting -999 alone cannot reveal gaps the imputer failed to fill;
# verify that neither NaN nor the sentinel is left.
mask = (imputed_df == -999).any(axis=1)
neg_999 = imputed_df[mask]
n = neg_999.shape[0]
assert not imputed_df.isna().any().any(), "imputation left NaN values behind"
assert n == 0, f"{n} rows still contain the -999 sentinel"
print(n)

0


In [42]:
# give the two verbose measurement headers short names; every other column keeps its label
new_column_names = {'ocean depth (m)': 'Depth', 'temperature of ocean (C)': 'Temperature'}
imputed_df = imputed_df.rename(columns=new_column_names)


In [43]:
# preview the first five rows of the cleaned, renamed frame
imputed_df.head(n=5)

Unnamed: 0,latitude,longitude,Depth,Temperature,pH
0,56.509,-170.8147,122.0,2.199,7.9743
1,56.509,-170.8147,122.0,2.098,7.9977
2,56.509,-170.8147,122.0,1.637,8.0398
3,56.509,-170.8147,122.0,0.763,8.0659
4,56.509,-170.8147,122.0,-0.318,8.0661


In [44]:
# write the final frame to disk as UTF-8 CSV, without the row index
imputed_df.to_csv(path_or_buf='kelp.csv', index=False, encoding='utf-8')