## 0. Import Libraries

In [48]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

### 0.1 Load Dataset, Format Data, Undersample, and Split Train/Test Data

In [136]:
# Load the labelled table; later sections filter this same frame again.
df_test = pd.read_csv('calilp_dhw_label_giov.csv')

# Collapse the classes to a binary problem:
#   * 'Coral/Algae' rows with crw_baa > 0 are relabelled 'Coral'
#   * every row that is not 'Coral' afterwards becomes 'Other'
df_test['crw_baa'] = pd.to_numeric(df_test['crw_baa'])
df_test.loc[(df_test['class'] == 'Coral/Algae') & (df_test.crw_baa > 0), 'class'] = "Coral"
df_test.loc[(df_test['class'] != 'Coral'), 'class'] = 'Other'

distance_filter = 50

# Row mask computed once so predictors and labels are guaranteed to use the same rows.
within_distance = df_test.ac_distances < distance_filter

# Predictors: columns 1..300 (positional) of the rows that pass the distance filter,
# coerced to numeric so unparsable values become NaN and are dropped below.
data = df_test.loc[within_distance].iloc[:, 1:301]
data = data.apply(pd.to_numeric, errors='coerce')

# Explicit label encoding: Other -> 0, Coral -> 1.
# factorize() numbers classes by order of first appearance, so the class that ended up
# as 1 (the default positive class of precision_score / recall_score) depended on row
# order. Fixing the mapping makes precision/recall always refer to 'Coral'.
class_label = pd.Index(['Other', 'Coral'])  # class_label[code] -> class name
data['label'] = (df_test.loc[within_distance, 'class'] == 'Coral').astype(int)
data = data.dropna()

#extract predictors and labels
# Drop the label by name rather than slicing 0:300, so the label can never leak into X
# if the source table has fewer predictor columns than expected.
X = data.drop(columns='label')
y = data['label']

#undersample from scikitlearn package
# NOTE(review): undersampling happens before the split, so the test set is drawn from the
# resampled data rather than the original class ratio — confirm this is intended.
rus = RandomUnderSampler(random_state=12)
X_res, y_res = rus.fit_resample(X, y)

#split data for train/test
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.15, random_state=8)

## 1.0 Train Initial Models

In [130]:
#random forest - full
# Depth-2 random forest fitted on the training split, then scored on the held-out split.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.6115593920060048 0.5874241588527302 0.7877218934911243
0.6046275966897484


In [131]:
#logit - full
# Logistic regression fitted on the training split, then scored on the held-out split.
clf = LogisticRegression(random_state=8)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.6213173203227622 0.6032510535821795 0.7411242603550295
0.6155491752519281


## 2.0 Train Models with Filtered Feature Set

In [139]:
#filtering features now
# Reduced predictor set: three positional column windows of `data`, kept in their
# original left-to-right order.
# NOTE(review): the windows 0:11, 205:211 and 220:226 are positional magic numbers —
# what they select depends on the column layout of `data`; TODO document the choice.
X = pd.concat(
    [data.iloc[:, 0:11], data.iloc[:, 205:211], data.iloc[:, 220:226]],
    axis=1,
)

# Labels restricted to the rows that are present in X.
y = data.loc[data.index.isin(X.index), 'label']

#undersample from scikitlearn package
rus = RandomUnderSampler(random_state=12)
X_res, y_res = rus.fit_resample(X, y)

#split data for train/test
x_train, x_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.15, random_state=8
)

In [84]:
#random forest - slim features
# Same depth-2 forest as before, now trained on the reduced predictor set.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.589041095890411 0.6048123980424144 0.5484467455621301
0.5863592861566177


In [85]:
#logit - slim features
# Logistic regression on the reduced predictor set.
clf = LogisticRegression(random_state=8)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.5106023644210922 0.5995850622406639 0.10687869822485208
0.5241794741879188


## 3.0 Augment CALIPSO Data with Giovanni

In [125]:
#adding gio data
# Rebuild `data` from df_test: the same 300 positional predictor columns as the first
# section, plus the four Giovanni columns listed below.
gio_cols = ['par', 'chlor_a', 'org_part', 'ino_part']

# Row mask computed once so predictors, Giovanni columns and labels share the same rows.
within_distance = df_test.ac_distances < distance_filter

data = df_test.loc[within_distance].iloc[:, 1:301]
data = data.apply(pd.to_numeric, errors='coerce')

# Coerce the Giovanni columns the same way as the other predictors, so unparsable
# values become NaN and are removed by dropna() instead of failing later in the model.
data[gio_cols] = df_test.loc[within_distance, gio_cols].apply(pd.to_numeric, errors='coerce')

# Explicit label encoding: Other -> 0, Coral -> 1.
# factorize() numbers classes by order of first appearance, so the class that ended up
# as 1 (the default positive class of precision_score / recall_score) depended on row
# order. Fixing the mapping makes precision/recall always refer to 'Coral'.
class_label = pd.Index(['Other', 'Coral'])  # class_label[code] -> class name
data['label'] = (df_test.loc[within_distance, 'class'] == 'Coral').astype(int)
data = data.dropna()

#extract predictors and labels
# Drop the label by name rather than slicing 0:304, so the label can never leak into X
# if the column count differs from what the slice assumed.
X = data.drop(columns='label')
y = data['label']

#undersample from scikitlearn package
# NOTE(review): undersampling happens before the split, so the test set is drawn from the
# resampled data rather than the original class ratio — confirm this is intended.
rus = RandomUnderSampler(random_state=12)
X_res, y_res = rus.fit_resample(X, y)

#split data for train/test
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.15, random_state=8)

In [127]:
#random forest - gio
# Depth-2 random forest on the Giovanni-augmented predictors.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.7124137931034483 0.6901217861975643 0.7306590257879656
0.6909786881853921


In [128]:
#logit - gio
# Logistic regression on the Giovanni-augmented predictors (iteration cap raised to 200).
clf = LogisticRegression(random_state=8, max_iter=200)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 10-fold cross-validation score over the whole undersampled set.
# NOTE(review): this cell uses cv=10 while the random-forest cells use cv=3, so the
# cross-validation means are not computed the same way — confirm this is intended.
cv_scores = cross_val_score(clf, X_res, y_res, cv=10)
print(np.mean(cv_scores))

0.6862068965517242 0.6551724137931034 0.7349570200573066
0.692022134153783


## 4.0 Filter Feature Set and Apply Giovanni Augmentation

In [89]:
#filtering features now and adding the gio
# Reduced predictor set (three positional column windows of `data`) followed by the four
# Giovanni columns. Requires `data` to already contain par / chlor_a / org_part / ino_part.
# NOTE(review): the windows 0:11, 205:211 and 220:226 are positional magic numbers —
# what they select depends on the column layout of `data`; TODO document the choice.
X = pd.concat(
    [
        data.iloc[:, 0:11],
        data.iloc[:, 205:211],
        data.iloc[:, 220:226],
        data[['par','chlor_a','org_part','ino_part']],
    ],
    axis=1,
)

# Labels restricted to the rows that are present in X.
y = data.loc[data.index.isin(X.index), 'label']

#undersample from scikitlearn package
rus = RandomUnderSampler(random_state=12)
X_res, y_res = rus.fit_resample(X, y)

#split data for train/test
x_train, x_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.15, random_state=8
)

In [90]:
#random forest - gio and slim features
# Depth-2 random forest on the reduced predictor set plus the Giovanni columns.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 3-fold cross-validation score over the whole undersampled set.
cv_scores = cross_val_score(clf, X_res, y_res, cv=3)
print(np.mean(cv_scores))

0.6689655172413793 0.633578431372549 0.7406876790830945
0.6831160769708257


In [91]:
#logit - gio and slim features
# Logistic regression on the reduced predictor set plus the Giovanni columns
# (iteration cap raised to 200).
clf = LogisticRegression(random_state=8, max_iter=200)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Hold-out accuracy, precision and recall, in that order.
acc, prec, rec = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score)
)
print(acc, prec, rec)

# Mean 5-fold cross-validation score over the whole undersampled set.
# NOTE(review): this cell uses cv=5 while the random-forest cells use cv=3, so the
# cross-validation means are not computed the same way — confirm this is intended.
cv_scores = cross_val_score(clf, X_res, y_res, cv=5)
print(np.mean(cv_scores))

0.6689655172413793 0.6322815533980582 0.7464183381088825
0.6688440202844944
