In [24]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
sns.set(font_scale = 2)

# Splitting data into training and testing
from sklearn.model_selection import train_test_split

In [39]:
# Read in data into a dataframe 
headings = ['season','size','speed','mxPH','mnO2','Cl', 'NO3','NH4','oPO4','PO4','Chla','a1','a2','a3','a4', 'a5','a6','a7'];
data = pd.read_csv('algal_bloom.csv', 
                   skiprows=list(range(10)), 
                   names=headings)

# Display top of dataframe
data.head()

display(data)

Unnamed: 0,season,size,speed,mxPH,mnO2,Cl,NO3,NH4,oPO4,PO4,Chla,a1,a2,a3,a4,a5,a6,a7
0,spring,small_,medium,8.35,8,57.75,1.288,370,428.75,558.75,1.3,1.4,7.6,4.8,1.9,6.7,0,2.10000\
1,autumn,small_,medium,8.1,11.4,40.02,5.33,346.66699,125.667,187.05701,15.6,3.3,53.6,1.9,0.0,0.0,0,9.70000\
2,spring,small_,medium,8.07,4.8,77.364,2.302,98.182,61.182,138.7,1.4,3.1,41.0,18.9,0.0,1.4,0,1.40000\
3,autumn,small_,medium,8.06,9,55.35,10.416,233.7,58.222,97.58,10.5,9.2,2.9,7.5,0.0,7.5,4.1,1.00000\
4,winter,small_,high__,8.25,13.1,65.75,9.248,430,18.25,56.667,28.4,15.1,14.6,1.4,0.0,22.5,12.6,2.90000\
5,summer,small_,high__,8.15,10.3,73.25,1.535,110,61.25,111.75,3.2,2.4,1.2,3.2,3.9,5.8,6.8,0.00000\
6,autumn,small_,high__,8.05,10.6,59.067,4.99,205.66701,44.667,77.434,6.9,18.2,1.6,0.0,0.0,5.5,8.7,0.00000\
7,winter,small_,medium,8.7,3.4,21.95,0.886,102.75,36.3,71,5.544,25.4,5.4,2.5,0.0,0.0,0,0.00000\
8,winter,small_,high__,7.93,9.9,8,1.39,5.8,27.25,46.6,0.8,17.0,0.0,0.0,2.9,0.0,0,1.70000\
9,spring,small_,high__,7.7,10.2,8,1.527,21.571,12.75,20.75,0.8,16.6,0.0,0.0,0.0,1.2,0,6.00000\


In [32]:
print(type(data['Cl'][5]))

<class 'numpy.float64'>


In [41]:
print(type(data['NO3'][18]))
print(float(data['NO3'][18]))

<class 'str'>


ValueError: could not convert string to float: '2.822008777.59961'

In [43]:
# Replace all occurrences of Not Available with numpy not a number
data = data.replace({'XXXXXXX': np.nan})

# Replace all weird numerical values (>1 floating points, / in number etc...)
# Replace with start of number to 3dp

# Detecting numbers 
for heading in headings[3:]:
    print(heading)
    for row in data[heading]:
        
        try:
            float(row)
            #df.loc[cnt, 'OWN_OCCUPIED']=np.nan
        except ValueError:
            print(row)



mxPH
mnO2
Cl
NO3
2.822008777.59961
4.825001729.00000
2.818003515.00000
0.050006400.00000
3.444001911.00000
0.921001386.25000
1.051002082.85010
1.720002167.37012
4.030005738.33008
7.160004073.33008
6.513003466.65991
7.740001990.16003
45.6500024064.00000
4.908001131.66003
3.685001495.00000
3.561001168.00000
3.923001081.66003
NH4
oPO4
PO4
Chla
a1
a2
a3
a4
a5
a6
1.40000\
0.00000\
0.00000\
1.90000\
1.00000\
6.10000\
1.90000\
12.50000\
2.40000\
0.00000\
0.00000\
7.00000\
0.00000\
2.70000\
1.90000\
0.00000\
1.70000\
a7
2.10000\
9.70000\
1.40000\
1.00000\
2.90000\
0.00000\
0.00000\
0.00000\
1.70000\
6.00000\
1.50000\
2.10000\
4.10000\
0.00000\
0.00000\
0.00000\
7.20000\
2.20000\
0.00000\
0.00000\
0.00000\
1.90000\
0.00000\
0.00000\
2.10000\
2.50000\
0.00000\
0.00000\
2.90000\
1.20000\
0.00000\
0.00000\
10.20000\
0.00000\
7.10000\
1.60000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
3.20000\
3.90000\
3.20000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00000\
0.00

In [29]:
# Missing values and data types
# Convert all numerical values to float


# Iterate through the columns
for col in list(data.columns):
    # Select columns that should be numeric
    if ('mxPH' in col or 
        'mn02' in col or 
        'Cl' in col or 
        'NO3' in col or 
        'NH4' in col or 
        'oPO4' in col or 
        'PO4' in col or 
        'Chla' in col or 
        'a1' in col or 
        'a2' in col or 
        'a3' in col or
        'a4' in col or 
        'a5' in col or 
        'a6' in col or
        'a7' in col
       ):
        # Convert the data type to float
        data[col] = data[col].str[:9]
        data[col] = data[col].astype(float)

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas