In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import math 

In [3]:
df = pd.read_csv(r'C:\Users\harr1\Downloads\V0_BENT_LPI.csv')                                               #######
print (df.info())

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229198 entries, 0 to 229197
Data columns (total 31 columns):
OBJECTID           229198 non-null int64
ROUNDID            229198 non-null int64
MISSIONID          229198 non-null object
REGION_NAME        229198 non-null object
ISLAND             229198 non-null object
SITEVISITID        229198 non-null int64
SITE               229198 non-null object
LATITUDE           229184 non-null float64
LONGITUDE          229184 non-null float64
REEF_ZONE          229198 non-null object
DEPTH_BIN          224864 non-null object
MIN_Z_M            209939 non-null float64
MAX_Z_M            209939 non-null float64
DATE_              229198 non-null object
OBS_YEAR           229198 non-null int64
LPITRANSECTRUN     229198 non-null int64
DIVER              229198 non-null object
MINDEPTH           180610 non-null float64
MAXDEPTH           182488 non-null float64
METHODCODE         229198 non-null int64
CALIBRATION        99078 non-null float64
TRANNUM

In [4]:
#Drop columns that are not important                                                                                                       
df = df.drop(['MINDEPTH', 'MAXDEPTH', 'CALIBRATION', 'MIN_Z_M', 'MAX_Z_M', 'OBJECTID', 'ROUNDID',
             'TRANNUM', 'LPI_SEG', 'METHODCODE'], axis=1)
#No information about how 2009 data collected, so drop this year from dataset
df = df[df['OBS_YEAR']!=2009]
#Only interested in corals, so eliminate anything that is not a coral
df = df[df['BENTHICCATEGORY'] == 'CORL']
#Drop nulls
df= df.dropna()

In [5]:
#Get dummies for island, region, family, diver, and reef zone                                         
df = pd.concat([df, df['FAMILY'].str.get_dummies(sep=',')], axis=1)
df = pd.concat([df, df['ISLAND'].str.get_dummies(sep=',')], axis=1)
df = pd.concat([df, df['REEF_ZONE'].str.get_dummies(sep=',')], axis=1)
df = pd.concat([df, df['REGION_NAME'].str.get_dummies(sep=',')], axis=1)
df = pd.concat([df, df['DIVER'].str.get_dummies(sep=',')], axis = 1)

In [7]:
#convert date from dd-ABREV-yyyy to yyyymmdd                                                                     ####

from datetime import datetime

df['Date']= [datetime.strptime(x, '%d-%b-%y').strftime('%Y-%m-%d') for x in df['DATE_']]

df['Month'] = [datetime.strptime(x, '%Y-%m-%d').strftime('%Y-%m') for x in df['Date']]

for x in df['Date']:
    x.replace("-", "")
    
for y in df['Month']:
    y.replace("-", "")
    

#Sort values as they are out of order in the original dataframe
df_sorted = df.sort_values('Date')




In [8]:
#Calculate the Transects taken per day
#Calculate the total number of corals found per day

df_sorted['Transect/day'] = df_sorted['LPITRANSECTRUN'].groupby(df_sorted['Date']).transform('nunique')                         ######
df_sorted['Coral/day'] = df_sorted['COUNT'].groupby(df_sorted['Date']).transform('sum')


In [9]:
# Transforming data based on samples taken during observations                                     

# Since 2005-2008 data was collected every 50cm and 2010-2012 data collected every 20cm, we need to standardize the data
# 2005-2008 data multiplied by 2.5 as there were 2.5X fewer observations made per transect. Data divided by number 
# of transects taken per day to standardize how many corals found per meter

df_2005 = df_sorted[df_sorted['Date']<='2008-12-31']
df_2010 = df_sorted[df_sorted['Date']>'2009-12-31']


df_2005['Coral/m']= (((df_2005['Coral/day']*2.5)/df_2005['Transect/day'])/25)
df_2010['Coral/m']= (((df_2010['Coral/day'])/df_2010['Transect/day'])/25)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [10]:
#Add the transformed data back into the sorted dataframe
df_sorted= pd.concat([df_2005, df_2010], axis=0)
print (df_sorted['Coral/m'])

1         0.875000
98        0.875000
100       0.875000
105       0.875000
11        0.875000
96        0.875000
8         0.875000
5         0.875000
4         0.875000
3         0.875000
10        0.875000
93        1.550000
92        1.550000
89        1.550000
87        1.550000
84        1.550000
83        1.550000
81        1.550000
79        1.550000
77        1.550000
76        1.550000
55        1.550000
49        1.550000
94        1.550000
47        1.550000
52        1.550000
44        1.550000
74        1.150000
71        1.150000
58        1.150000
            ...   
211918    1.266667
211865    1.266667
211790    1.266667
211864    1.266667
211855    1.266667
211797    1.266667
211798    1.266667
211799    1.266667
211800    1.266667
211805    1.266667
211806    1.266667
211812    1.266667
211816    1.266667
211822    1.266667
211824    1.266667
211825    1.266667
211832    1.266667
211833    1.266667
211834    1.266667
211837    1.266667
211838    1.266667
211839    1.

## Feature Selection

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn import tree
from IPython.display import Image
import pydotplus
import graphviz
from sklearn.tree import DecisionTreeRegressor



In [12]:
data = df[['OBS_YEAR',
           'LATITUDE', 
           'LONGITUDE', 
           #Reef zone
           'Forereef', 'Lagoon', 'Protected Slope', 'Backreef',
           #Families
           'Acroporidae', 'Agariciidae', 'Pocilloporidae', 'Faviidae', 'Siderastreidae',
 'Poritidae', 'Merulinidae', 'Milleporidae', 'Dendrophylliidae', 'Fungiidae',
 'Mussidae', 'Oculinidae', 'Pectiniidae', 'Helioporidae', 'Astrocoeniidae',
 'Caryophylliidae', 'Stylasteridae',
           #Divers
           'BVA', 'JCK', 'JSH', 'CLR', 'RDW', 'RO',
           #Islands
           'Guam', 'Santa Rosa', 'Wake', 'Hawaii', 'Kauai', 'Kaula', 'Lanai',
       'Lehua', 'Maui', 'Molokai', 'Niihau', 'Oahu', 'French Frigate',
       'Kure', 'Laysan', 'Lisianski', 'Maro', 'Midway', 'Necker',
       'Pearl & Hermes', 'Baker', 'Howland', 'Jarvis', 'Johnston',
       'Kingman', 'Palmyra', 'Ofu & Olosega', 'Rose', 'Swains', 'Tau',
       'Tutuila', 'Agrihan', 'Aguijan', 'Alamagan', 'Asuncion',
       'Farallon de Pajaros', 'Guguan', 'Maug', 'Pagan', 'Rota', 'Saipan',
       'Sarigan', 'Tinian',
           #Regions
           'Mariana Archipelago', 'Pacific Remote Island Areas',
       'Main Hawaiian Islands', 'Northwestern Hawaiian Islands',
       'American Samoa'
          ]]
target = df_sorted['Coral/m']
           

                

In [13]:
data_names=['OBS_YEAR',
           'LATITUDE', 
           'LONGITUDE', 
           #Reef zone
           'Forereef', 'Lagoon', 'Protected Slope', 'Backreef',
           #Families
           'Acroporidae', 'Agariciidae', 'Pocilloporidae', 'Faviidae', 'Siderastreidae',
 'Poritidae', 'Merulinidae', 'Milleporidae', 'Dendrophylliidae', 'Fungiidae',
 'Mussidae', 'Oculinidae', 'Pectiniidae', 'Helioporidae', 'Astrocoeniidae',
 'Caryophylliidae', 'Stylasteridae',
           #Divers
           'BVA', 'JCK', 'JSH', 'CLR', 'RDW', 'RO',
           #Islands
           'Guam', 'Santa Rosa', 'Wake', 'Hawaii', 'Kauai', 'Kaula', 'Lanai',
       'Lehua', 'Maui', 'Molokai', 'Niihau', 'Oahu', 'French Frigate',
       'Kure', 'Laysan', 'Lisianski', 'Maro', 'Midway', 'Necker',
       'Pearl & Hermes', 'Baker', 'Howland', 'Jarvis', 'Johnston',
       'Kingman', 'Palmyra', 'Ofu & Olosega', 'Rose', 'Swains', 'Tau',
       'Tutuila', 'Agrihan', 'Aguijan', 'Alamagan', 'Asuncion',
       'Farallon de Pajaros', 'Guguan', 'Maug', 'Pagan', 'Rota', 'Saipan',
       'Sarigan', 'Tinian',
           #Regions
           'Mariana Archipelago', 'Pacific Remote Island Areas',
       'Main Hawaiian Islands', 'Northwestern Hawaiian Islands',
       'American Samoa'
          ]

In [14]:
RandForest= ensemble.RandomForestRegressor()

RandForest.fit(data,target)




RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:


RandForest.feature_importances_
RandForest_ft_series = pd.Series(data=RandForest.feature_importances_, index=data.columns)
###################################
sorted_series = RandForest_ft_series.sort_values(ascending = False)
sorted_series

OBS_YEAR                       3.520281e-01
LATITUDE                       2.925017e-01
LONGITUDE                      1.963834e-01
Lanai                          2.381714e-02
Pacific Remote Island Areas    1.982375e-02
Lagoon                         1.851463e-02
Poritidae                      1.372094e-02
Pearl & Hermes                 1.268406e-02
CLR                            1.027174e-02
Niihau                         7.735686e-03
Tau                            6.666781e-03
Wake                           5.753583e-03
JCK                            3.139494e-03
Forereef                       2.982991e-03
Oahu                           2.822317e-03
Swains                         2.694338e-03
Baker                          2.616932e-03
Tutuila                        2.544593e-03
BVA                            2.120030e-03
Acroporidae                    1.969020e-03
American Samoa                 1.398802e-03
Pocilloporidae                 1.335854e-03
Laysan                         1

## Linear Regressions: NW Hawaiian vs Pacific Remote

In [16]:
df_NWHI= df_sorted[df_sorted['REGION_NAME']=='Northwestern Hawaiian Islands']

print(df_NWHI['ISLAND'].unique())
print(df_NWHI['DIVER'].unique())

['Necker' 'French Frigate' 'Maro' 'Laysan' 'Pearl & Hermes' 'Midway'
 'Kure' 'Lisianski']
['BVA' 'EEK' 'CLR' 'PSV' 'RDW']


In [17]:

data_NW = df_NWHI[[
                'LATITUDE', 
                'LONGITUDE', 
    'OBS_YEAR',
    'French Frigate', 'Kure', 'Laysan', 'Lisianski', 'Maro', 'Midway', 'Necker', 'Pearl & Hermes',
    'BVA', 'EEK', 'CLR', 'RDW', 'PSV']]

target_NW = df_NWHI['Coral/m']

data_NW_names=['LATITUDE', 
                'LONGITUDE', 
    'OBS_YEAR',
    'French Frigate', 'Kure', 'Laysan', 'Lisianski', 'Maro', 'Midway', 'Necker', 'Pearl & Hermes',
    'BVA', 'EEK', 'CLR', 'RDW', 'PSV']
############################################

regr_NW = linear_model.LinearRegression()

regr_NW.fit(data_NW, target_NW)

cross_variable_NW=cross_val_score(regr_NW, data_NW, target_NW, cv=10)
print('Cross Validation: \n', cross_variable_NW)
print ('Mean:', np.mean(cross_variable_NW))
print ('std:',np.std(cross_variable_NW))
print('Coefficients: \n', regr_NW.coef_)
print('Intercept: \n', regr_NW.intercept_)
print ('Regression score: \n', regr_NW.score(data_NW,target_NW))

Cross Validation: 
 [-4.38728201e+22 -3.38878496e-02  3.06259507e-01  3.89589451e-01
 -9.36534916e-01  1.92574172e-01 -8.58611213e+00 -2.75823827e+01
 -7.49485056e-02 -3.97902225e-02]
Mean: -4.3872820097818003e+21
std: 1.31618460293454e+22
Coefficients: 
 [-6.09749680e+00 -2.42887025e+00 -1.30053692e+00 -3.12674800e+11
 -3.12674800e+11 -3.12674800e+11 -3.12674800e+11 -3.12674800e+11
 -3.12674800e+11 -3.12674800e+11 -3.12674800e+11 -2.92440583e+11
 -2.92440583e+11 -2.92440583e+11 -2.92440583e+11 -2.92440583e+11]
Intercept: 
 605115385026.6493
Regression score: 
 0.6768727120267547


In [18]:
df_PRI= df_sorted[df_sorted['REGION_NAME']=='Pacific Remote Island Areas']

print(df_PRI['ISLAND'].unique())
print(df_PRI['DIVER'].unique())

['Wake' 'Johnston' 'Howland' 'Baker' 'Jarvis' 'Palmyra' 'Kingman']
['VB' 'BVA' 'CLR' 'NNP' 'RDW']


In [19]:


data_PRI = df_PRI[[
                'LATITUDE', 
                'LONGITUDE',
    'OBS_YEAR',
                'Wake', 'Baker', 'Howland', 'Jarvis', 'Johnston', 'Kingman', 'Palmyra',
    'VB',  'BVA', 'CLR', 'NNP', 'RDW'       
          ]]
target_PRI = df_PRI['Coral/m']

data_PRI_names = ['LATITUDE', 
                'LONGITUDE',
    'OBS_YEAR',
                'Wake', 'Baker', 'Howland', 'Jarvis', 'Johnston', 'Kingman', 'Palmyra',
    'VB',  'BVA', 'CLR', 'NNP', 'RDW'       
          ]
############################################

regr_PRI = linear_model.LinearRegression()

regr_PRI.fit(data_PRI, target_PRI)

cross_variable_PRI=cross_val_score(regr_PRI, data_PRI, target_PRI, cv=10)
print('Cross Validation: \n', cross_variable_PRI)
print ('Mean:', np.mean(cross_variable_PRI))
print ('std:',np.std(cross_variable_PRI))
print('Coefficients: \n', regr_PRI.coef_)
print('Intercept: \n', regr_PRI.intercept_)
print ('Regression score: \n', regr_PRI.score(data_PRI,target_PRI))

Cross Validation: 
 [-0.03057147  0.25071588  0.04942736 -0.42756215 -0.26564785 -1.04707388
 -0.06624968 -3.47853977 -0.05161445  0.03885202]
Mean: -0.5028264002257709
std: 1.048638908818488
Coefficients: 
 [-5.57170899e+00 -3.52935278e+00 -1.12264010e-01  1.08018391e+03
 -2.36862989e+02 -2.33684835e+02 -1.81871812e+02 -1.20156660e+02
 -1.52839884e+02 -1.54767732e+02 -6.91184029e-02 -4.12175581e-01
 -1.92773035e-01  3.78451793e-01  2.95615227e-01]
Intercept: 
 -157.35397872222566
Regression score: 
 0.19507201258467488


   ## Random forest
    

In [24]:
#Random forest for all data
RandForest= ensemble.RandomForestRegressor()

%timeit RandForest.fit(data,target)

cross_variable=cross_val_score(RandForest, data, target, cv=10)
print('Cross Validation: \n', cross_variable)
print ('Mean:', np.mean(cross_variable))
print ('std:',np.std(cross_variable))
print ('Regression score: \n', RandForest.score(data,target))



1.23 s ± 21.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Cross Validation: 
 [-0.81958368 -0.15894791 -0.20083773 -0.12036037 -0.51318119 -1.19605777
 -1.02072297 -1.25363443 -2.4113888  -0.51248311]
Mean: -0.8207197970533763
std: 0.6633862451292123
Regression score: 
 0.9217930884896011


## Neural Networks

#MLP so slow! Ran with 4,25 hidden layers and the score was only .064. That run was 15+17 s per loop.

In [35]:
# Import the model.
from sklearn.neural_network import MLPRegressor


mlp = MLPRegressor(hidden_layer_sizes=(25,100))
%timeit mlp.fit(data, target)

The slowest run took 26.29 times longer than the fastest. This could mean that an intermediate result is being cached.
25.1 s ± 38.2 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
mlp.score(data, target)

0.3948459738228207

In [37]:
from sklearn.model_selection import cross_val_score
cross_score = cross_val_score(mlp, data, target, cv=5)
print('Cross scores:', cross_score)
print('Stdev:', np.std(cross_score))

Cross scores: [-0.3204384  -2.7196599  -7.14702305 -2.86858487 -1.49341705]
Stdev: 2.3109671273373995


## With (25,100) hidden layers, this model is much improved but if you look at the times for this model vs the random forest, it would take many more hidden layers for this neural network and would be incredibly slow. It is already much slower than the random forest and less effective